Source code for deepke.name_entity_re.standard.tools.preprocess

from .dataset import *

import argparse
import csv
import json
import logging
import os
import random
import sys
import numpy as np

class NerProcessor(DataProcessor):
    """Processor for the dataset."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.txt")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "valid.txt")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.txt")), "test")

    def get_labels(self):
        return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]"]

    def _create_examples(self, lines, set_type):
        examples = []
        for i, (sentence, label) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = ' '.join(sentence)
            text_b = None
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
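For orientation, here is a hedged sketch of the shape of `lines` that `_create_examples` consumes. The exact on-disk format is defined by `_read_tsv` in `.dataset`; the sentence and tags below are illustrative only.

# Hedged sketch: each element of `lines` is assumed to be a
# (tokens, tags) pair produced by self._read_tsv.
lines = [
    (["Barack", "Obama", "visited", "Paris"],
     ["B-PER", "I-PER", "O", "B-LOC"]),
]
# _create_examples turns this into InputExample objects such as:
#   guid="train-0", text_a="Barack Obama visited Paris",
#   label=["B-PER", "I-PER", "O", "B-LOC"]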
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""
    # Reserve index 0 for padding; real label ids start at 1.
    label_map = {label: i for i, label in enumerate(label_list, 1)}
    features = []
    for (ex_index, example) in enumerate(examples):
        textlist = example.text_a.split(' ')
        labellist = example.label
        tokens = []
        labels = []
        valid = []
        label_mask = []
        for i, word in enumerate(textlist):
            # WordPiece may split a word into several sub-tokens; only the
            # first sub-token keeps the word's label and is marked as valid.
            token = tokenizer.tokenize(word)
            tokens.extend(token)
            label_1 = labellist[i]
            for m in range(len(token)):
                if m == 0:
                    labels.append(label_1)
                    valid.append(1)
                    label_mask.append(1)
                else:
                    valid.append(0)
        # Truncate to leave room for the [CLS] and [SEP] tokens.
        if len(tokens) >= max_seq_length - 1:
            tokens = tokens[0:(max_seq_length - 2)]
            labels = labels[0:(max_seq_length - 2)]
            valid = valid[0:(max_seq_length - 2)]
            label_mask = label_mask[0:(max_seq_length - 2)]
        ntokens = []
        segment_ids = []
        label_ids = []
        ntokens.append("[CLS]")
        segment_ids.append(0)
        valid.insert(0, 1)
        label_mask.insert(0, 1)
        label_ids.append(label_map["[CLS]"])
        for i, token in enumerate(tokens):
            ntokens.append(token)
            segment_ids.append(0)
            if len(labels) > i:
                label_ids.append(label_map[labels[i]])
        ntokens.append("[SEP]")
        segment_ids.append(0)
        valid.append(1)
        label_mask.append(1)
        label_ids.append(label_map["[SEP]"])
        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        input_mask = [1] * len(input_ids)
        # label_mask is rebuilt here so it aligns with label_ids before padding.
        label_mask = [1] * len(label_ids)
        # Pad every sequence up to max_seq_length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            label_ids.append(0)
            valid.append(1)
            label_mask.append(0)
        while len(label_ids) < max_seq_length:
            label_ids.append(0)
            label_mask.append(0)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        assert len(valid) == max_seq_length
        assert len(label_mask) == max_seq_length
        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_ids,
                          valid_ids=valid,
                          label_mask=label_mask))
    return features
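A minimal end-to-end usage sketch follows. The tokenizer class, checkpoint name, data directory, and max_seq_length are assumptions for illustration; the module itself only requires a tokenizer exposing tokenize() and convert_tokens_to_ids().

# Hedged usage sketch: run the processor's examples through
# convert_examples_to_features.
from transformers import BertTokenizer  # assumed tokenizer implementation

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")  # assumed checkpoint
processor = NerProcessor()

examples = processor.get_train_examples("data/")   # expects data/train.txt
label_list = processor.get_labels()
features = convert_examples_to_features(examples, label_list,
                                         max_seq_length=128, tokenizer=tokenizer)

# Each InputFeatures is padded to max_seq_length and can be stacked into
# arrays, e.g. input_ids = np.array([f.input_ids for f in features]).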