# Source code for deepke.relation_extraction.document.evaluation

import os
import os.path
import json
import numpy as np



def to_official(args, preds, features):
    """Convert model predictions into the official DocRED submission format.

    Args:
        args: namespace with a ``data_dir`` attribute pointing at the
            directory that contains ``rel2id.json``.
        preds: 2-D numpy array of shape (num_pairs, num_classes); nonzero
            entries in row i mark the relations predicted for pair i.
        features: list of dicts, each with keys "hts" (list of
            [head_idx, tail_idx] entity pairs) and "title"; the flattened
            pair list must align row-wise with ``preds``.

    Returns:
        List of dicts with keys 'title', 'h_idx', 't_idx', 'r' — one entry
        per predicted non-NA relation.
    """
    # `with` closes the mapping file deterministically instead of leaking
    # the handle as the bare json.load(open(...)) did.
    with open(f'{args.data_dir}/rel2id.json', 'r') as f:
        rel2id = json.load(f)
    id2rel = {value: key for key, value in rel2id.items()}

    # Flatten every feature's entity-pair list so that row i of `preds`
    # corresponds to (h_idx[i], t_idx[i], title[i]).
    h_idx, t_idx, title = [], [], []
    for feat in features:
        hts = feat["hts"]
        h_idx += [ht[0] for ht in hts]
        t_idx += [ht[1] for ht in hts]
        title += [feat["title"]] * len(hts)

    res = []
    for i in range(preds.shape[0]):
        pred = np.nonzero(preds[i])[0].tolist()
        for p in pred:
            if p != 0:  # class index 0 is the "no relation" (NA) label
                res.append(
                    {
                        'title': title[i],
                        'h_idx': h_idx[i],
                        't_idx': t_idx[i],
                        'r': id2rel[p],
                    }
                )
    return res
def gen_train_facts(data_file_name, truth_dir):
    """Collect (head_name, tail_name, relation) facts from a training file.

    The fact set is cached in ``truth_dir`` as a ``.fact`` file named after
    the ``train_*`` part of ``data_file_name``; subsequent calls load the
    cache instead of re-parsing the training data.

    Args:
        data_file_name: path to a DocRED-style JSON training file whose
            basename contains "train_".
        truth_dir: directory where the cached ``.fact`` file lives.

    Returns:
        Set of (head entity name, tail entity name, relation id) tuples.
    """
    fact_file_name = data_file_name[data_file_name.find("train_"):]
    fact_file_name = os.path.join(truth_dir, fact_file_name.replace(".json", ".fact"))

    # Fast path: a previously generated cache exists.
    if os.path.exists(fact_file_name):
        with open(fact_file_name) as f:
            triples = json.load(f)
        return {tuple(x) for x in triples}

    with open(data_file_name) as f:
        ori_data = json.load(f)

    fact_in_train = set()
    for data in ori_data:
        vertexSet = data['vertexSet']
        for label in data['labels']:
            rel = label['r']
            # Every mention pair of the two entities yields one fact.
            for n1 in vertexSet[label['h']]:
                for n2 in vertexSet[label['t']]:
                    fact_in_train.add((n1['name'], n2['name'], rel))

    # `with` guarantees the cache file is flushed and closed; the original
    # leaked the write handle, which could leave a truncated cache behind.
    with open(fact_file_name, "w") as f:
        json.dump(list(fact_in_train), f)
    return fact_in_train
def official_evaluate(tmp, path):
    """Adapted from the official DocRED evaluation code.

    Args:
        tmp: list of prediction dicts with keys 'title', 'h_idx', 't_idx',
            'r' (and optionally 'evidence'), e.g. the output of
            ``to_official``.  The list is sorted in place.
        path: dataset directory containing ``train_annotated.json``,
            ``train_distant.json`` and ``dev.json``; a ``ref`` subdirectory
            is created there to cache the training facts.

    Returns:
        Tuple ``(re_f1, evi_f1, re_f1_ignore_train_annotated,
        re_f1_ignore_train, re_p, re_r)``.

    Raises:
        FileNotFoundError: if ``train_distant.json`` is missing.
    """
    truth_dir = os.path.join(path, 'ref')
    if not os.path.exists(truth_dir):
        os.makedirs(truth_dir)

    fact_in_train_annotated = gen_train_facts(os.path.join(path, "train_annotated.json"), truth_dir)

    if not os.path.exists(os.path.join(path, "train_distant.json")):
        # Bug fix: the missing file is train_distant.json (the old message
        # incorrectly named train_annotated.json).
        raise FileNotFoundError(
            "Sorry, the file: 'train_distant.json' is too big to upload to github, "
            "please manually download to 'data/' from DocRED GoogleDrive "
            "https://drive.google.com/drive/folders/1c5-0YwnoJx8NS6CV2f-NoTHR__BdkNqw")
    fact_in_train_distant = gen_train_facts(os.path.join(path, "train_distant.json"), truth_dir)

    with open(os.path.join(path, "dev.json")) as f:
        truth = json.load(f)

    # Ground truth: (title, relation, head_idx, tail_idx) -> evidence ids.
    std = {}
    tot_evidences = 0
    titleset = set()
    title2vectexSet = {}
    for x in truth:
        title = x['title']
        titleset.add(title)
        vertexSet = x['vertexSet']
        title2vectexSet[title] = vertexSet
        for label in x['labels']:
            r = label['r']
            h_idx = label['h']
            t_idx = label['t']
            std[(title, r, h_idx, t_idx)] = set(label['evidence'])
            tot_evidences += len(label['evidence'])
    tot_relations = len(std)

    # Robustness: with no predictions every metric is zero (the original
    # indexed tmp[0] and raised IndexError).
    if not tmp:
        return 0, 0, 0, 0, 0, 0

    # Sort by the identifying quadruple, then drop adjacent duplicates.
    tmp.sort(key=lambda x: (x['title'], x['h_idx'], x['t_idx'], x['r']))
    submission_answer = [tmp[0]]
    for i in range(1, len(tmp)):
        x = tmp[i]
        y = tmp[i - 1]
        if (x['title'], x['h_idx'], x['t_idx'], x['r']) != (y['title'], y['h_idx'], y['t_idx'], y['r']):
            submission_answer.append(tmp[i])

    correct_re = 0
    correct_evidence = 0
    pred_evi = 0
    correct_in_train_annotated = 0
    correct_in_train_distant = 0
    titleset2 = set()
    for x in submission_answer:
        title = x['title']
        h_idx = x['h_idx']
        t_idx = x['t_idx']
        r = x['r']
        titleset2.add(title)
        if title not in title2vectexSet:
            continue
        vertexSet = title2vectexSet[title]

        evi = set(x['evidence']) if 'evidence' in x else set()
        pred_evi += len(evi)

        if (title, r, h_idx, t_idx) in std:
            correct_re += 1
            stdevi = std[(title, r, h_idx, t_idx)]
            correct_evidence += len(stdevi & evi)
            # Check whether any mention-pair of this fact already appeared
            # in the training data; used for the "ignore train" metrics.
            in_train_annotated = in_train_distant = False
            for n1 in vertexSet[h_idx]:
                for n2 in vertexSet[t_idx]:
                    if (n1['name'], n2['name'], r) in fact_in_train_annotated:
                        in_train_annotated = True
                    if (n1['name'], n2['name'], r) in fact_in_train_distant:
                        in_train_distant = True
            if in_train_annotated:
                correct_in_train_annotated += 1
            if in_train_distant:
                correct_in_train_distant += 1

    re_p = 1.0 * correct_re / len(submission_answer)
    # Guard: dev set with no labels would otherwise divide by zero.
    re_r = 1.0 * correct_re / tot_relations if tot_relations > 0 else 0
    if re_p + re_r == 0:
        re_f1 = 0
    else:
        re_f1 = 2.0 * re_p * re_r / (re_p + re_r)

    evi_p = 1.0 * correct_evidence / pred_evi if pred_evi > 0 else 0
    evi_r = 1.0 * correct_evidence / tot_evidences if tot_evidences > 0 else 0
    if evi_p + evi_r == 0:
        evi_f1 = 0
    else:
        evi_f1 = 2.0 * evi_p * evi_r / (evi_p + evi_r)

    # Precision over predictions whose facts were NOT seen in training; the
    # 1e-5 epsilon avoids a zero denominator when all were seen.
    re_p_ignore_train_annotated = 1.0 * (correct_re - correct_in_train_annotated) / (
        len(submission_answer) - correct_in_train_annotated + 1e-5)
    re_p_ignore_train = 1.0 * (correct_re - correct_in_train_distant) / (
        len(submission_answer) - correct_in_train_distant + 1e-5)

    if re_p_ignore_train_annotated + re_r == 0:
        re_f1_ignore_train_annotated = 0
    else:
        re_f1_ignore_train_annotated = 2.0 * re_p_ignore_train_annotated * re_r / (
            re_p_ignore_train_annotated + re_r)

    if re_p_ignore_train + re_r == 0:
        re_f1_ignore_train = 0
    else:
        re_f1_ignore_train = 2.0 * re_p_ignore_train * re_r / (re_p_ignore_train + re_r)

    return re_f1, evi_f1, re_f1_ignore_train_annotated, re_f1_ignore_train, re_p, re_r