Source code for deepke.name_entity_re.standard.tools.dataset

class InputExample:
    """One raw train/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example unchanged.

        Args:
            guid (string): Unique id of the example.
            text_a (string): Untokenized text of the first sequence. This is
                the only sequence needed for single-sequence tasks.
            text_b (string, optional): Untokenized text of the second
                sequence; only needed for sequence-pair tasks.
            label (string, optional): Label of the example. Expected for
                train and dev examples, not for test examples.
        """
        # Paired assignments keep the attribute creation order
        # (guid, text_a, text_b, label).
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label
class InputFeatures:
    """Container for one set of features derived from a single example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id,
                 valid_ids=None, label_mask=None):
        """Store every feature exactly as passed in.

        Args:
            input_ids: Stored as ``self.input_ids``.
            input_mask: Stored as ``self.input_mask``.
            segment_ids: Stored as ``self.segment_ids``.
            label_id: Stored as ``self.label_id``.
            valid_ids: Optional; stored as ``self.valid_ids`` (default None).
            label_mask: Optional; stored as ``self.label_mask`` (default None).
        """
        # Grouped assignments keep the original attribute creation order.
        self.input_ids, self.input_mask, self.segment_ids = (
            input_ids, input_mask, segment_ids)
        self.label_id, self.valid_ids, self.label_mask = (
            label_id, valid_ids, label_mask)
def readfile(filename):
    """Read a one-token-per-line tagging file and group it into sentences.

    Every non-separator line is split on single spaces; the first field is
    used as the token and the last field as its label. A blank line, or a
    line starting with ``-DOCSTART``, closes the sentence being collected.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A list of ``(tokens, labels)`` tuples, one per sentence. Both
        members are lists of strings with the same length.
    """
    data = []
    sentence = []
    label = []
    # The context manager guarantees the handle is closed, even on error.
    with open(filename, encoding='utf-8') as f:
        for raw_line in f:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would truncate the label of a final line that
            # has no trailing newline.
            line = raw_line.rstrip('\n')
            if not line or line.startswith('-DOCSTART'):
                if sentence:
                    data.append((sentence, label))
                    sentence = []
                    label = []
                continue
            splits = line.split(' ')
            sentence.append(splits[0])
            label.append(splits[-1])
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        data.append((sentence, label))
    return data
class DataProcessor:
    """Abstract base for converters of sequence classification data sets.

    Subclasses are expected to override the three ``get_*`` methods; the
    versions defined here only raise ``NotImplementedError``.
    """

    def get_train_examples(self, data_dir):
        """Return a collection of `InputExample`s for the train set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def get_dev_examples(self, data_dir):
        """Return a collection of `InputExample`s for the dev set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def get_labels(self):
        """Return the list of labels for this data set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Load ``input_file`` by delegating to the module-level ``readfile``.

        Args:
            input_file: Path handed to ``readfile`` unchanged.
            quotechar: Accepted for signature compatibility; not used.

        Returns:
            Whatever ``readfile(input_file)`` returns.
        """
        parsed = readfile(input_file)
        return parsed