# Source code for deepke.name_entity_re.standard.tools.dataset
class InputExample:
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Construct an InputExample.

        Args:
            guid (string): Unique id for the example.
            text_a (string): The untokenized text of the first sequence.
                For single sequence tasks, only this sequence must be
                specified.
            text_b (string, optional): The untokenized text of the second
                sequence. Only needs to be specified for sequence pair tasks.
            label (string, optional): The label of the example. This should
                be specified for train and dev examples, but not for test
                examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

    def __repr__(self):
        """Return a constructor-style representation, useful when debugging."""
        return (
            f"{type(self).__name__}(guid={self.guid!r}, text_a={self.text_a!r}, "
            f"text_b={self.text_b!r}, label={self.label!r})"
        )
class InputFeatures:
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, valid_ids=None, label_mask=None):
        """Store the given feature values unchanged on the instance.

        NOTE(review): the meaning of each field is not visible in this file;
        the descriptions below are inferred from the names and should be
        confirmed against the code that builds these features.

        Args:
            input_ids: Presumably the token ids fed to the model.
            input_mask: Presumably the attention mask matching ``input_ids``.
            segment_ids: Presumably the segment / token-type ids.
            label_id: Presumably the encoded label id(s) for the example.
            valid_ids (optional): Defaults to ``None``. Presumably marks which
                positions carry a label.
            label_mask (optional): Defaults to ``None``. Presumably a mask
                over the label positions.
        """
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.valid_ids = valid_ids
        self.label_mask = label_mask
def readfile(filename):
    """Read a token-per-line annotation file into sentences.

    Each non-blank line is split on single spaces; the first field is taken
    as the token and the last field as its label. A blank line, or a line
    starting with ``-DOCSTART``, ends the current sentence.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A list of ``(tokens, labels)`` tuples, one per sentence, where
        ``tokens`` and ``labels`` are parallel lists of strings. Empty
        sentences are never emitted, so an empty file yields ``[]``.
    """
    data = []
    sentence = []
    label = []
    # The context manager guarantees the handle is closed, even on error.
    with open(filename, encoding='utf-8') as f:
        for raw_line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would truncate the label on a final line that
            # has no trailing newline.
            line = raw_line.rstrip('\n')
            if not line or line.startswith('-DOCSTART'):
                if sentence:
                    data.append((sentence, label))
                    sentence = []
                    label = []
                continue
            splits = line.split(' ')
            sentence.append(splits[0])
            label.append(splits[-1])
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        data.append((sentence, label))
    return data
class DataProcessor:
    """Base class for data converters for sequence classification data sets.

    Subclasses are expected to override the ``get_*`` methods; the versions
    here only raise ``NotImplementedError``.
    """

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read an annotation file by delegating to the module-level `readfile`.

        Despite the name, no tab-separated parsing happens here: the file is
        handed to `readfile` as-is.

        Args:
            input_file: Path of the file to read.
            quotechar: Accepted for signature compatibility but unused.

        Returns:
            Whatever `readfile(input_file)` returns.
        """
        return readfile(input_file)