import os
import os.path
import json
import numpy as np
def to_official(args, preds, features):
    """Convert a prediction array into a list of relation records.

    Args:
        args: Object exposing ``data_dir``; ``<data_dir>/rel2id.json`` is read
            and inverted to map integer relation ids back to relation names.
        preds: NumPy array iterated along its first axis; entry ``i`` holds
            the predictions for the ``i``-th entity pair collected from
            ``features``. Presumably 2-D (pairs x relation ids) - confirm
            against the caller.
        features: Iterable of dicts, each with a ``"title"`` and an ``"hts"``
            sequence of ``(head_index, tail_index)`` pairs.

    Returns:
        A list of dicts with keys ``title``, ``h_idx``, ``t_idx`` and ``r``,
        one for every non-zero prediction whose relation id is not 0.
    """
    # A context manager closes the file handle deterministically.
    with open(f'{args.data_dir}/rel2id.json', 'r') as rel_file:
        rel2id = json.load(rel_file)
    id2rel = {value: key for key, value in rel2id.items()}

    # Flatten the per-document entity pairs so that position i lines up with
    # preds[i].
    h_idx, t_idx, title = [], [], []
    for f in features:
        hts = f["hts"]
        h_idx += [ht[0] for ht in hts]
        t_idx += [ht[1] for ht in hts]
        title += [f["title"]] * len(hts)

    # NOTE: there is no length check between preds and the collected pairs;
    # a preds entry without a matching pair raises IndexError below.
    res = []
    for i in range(preds.shape[0]):
        predicted_ids = np.nonzero(preds[i])[0].tolist()
        for p in predicted_ids:
            # Relation id 0 is never reported (presumably the "no relation"
            # class - verify against rel2id.json).
            if p != 0:
                res.append(
                    {
                        'title': title[i],
                        'h_idx': h_idx[i],
                        't_idx': t_idx[i],
                        'r': id2rel[p],
                    }
                )
    return res
def gen_train_facts(data_file_name, truth_dir):
    """Return the set of ``(head name, tail name, relation)`` facts in a data file.

    The facts are cached as a ``.fact`` JSON file inside ``truth_dir``. If the
    cache file already exists it is loaded instead of re-reading the data.

    Args:
        data_file_name: Path to a JSON file holding a list of documents, each
            with a ``vertexSet`` (list of entities, each a list of mention
            dicts with a ``name``) and ``labels`` (dicts with ``h``, ``t``
            and ``r``).
        truth_dir: Existing directory in which the cache file is stored.

    Returns:
        A set of ``(head_mention_name, tail_mention_name, relation)`` tuples,
        one for every mention pair of every labelled entity pair.
    """
    start = data_file_name.find("train_")
    if start >= 0:
        fact_file_name = data_file_name[start:]
    else:
        # str.find returns -1 when "train_" is absent; slicing with -1 would
        # keep only the last character of the path, so use the base name.
        fact_file_name = os.path.basename(data_file_name)
    fact_file_name = os.path.join(truth_dir, fact_file_name.replace(".json", ".fact"))

    if os.path.exists(fact_file_name):
        with open(fact_file_name) as fact_file:
            triples = json.load(fact_file)
        # JSON stores each triple as a list; convert back to hashable tuples.
        return {tuple(x) for x in triples}

    with open(data_file_name) as data_file:
        ori_data = json.load(data_file)

    fact_in_train = set()
    for data in ori_data:
        vertexSet = data['vertexSet']
        for label in data['labels']:
            rel = label['r']
            for n1 in vertexSet[label['h']]:
                for n2 in vertexSet[label['t']]:
                    fact_in_train.add((n1['name'], n2['name'], rel))

    # Close the handle explicitly so the cache is fully flushed to disk.
    with open(fact_file_name, "w") as fact_file:
        json.dump(list(fact_in_train), fact_file)
    return fact_in_train
def official_evaluate(tmp, path):
    """Score relation predictions against ``dev.json``.

    Adapted from the official evaluation code.

    Args:
        tmp: List of prediction dicts with keys ``title``, ``h_idx``,
            ``t_idx`` and ``r`` (and optionally ``evidence``). The list is
            sorted in place; exact duplicates are ignored when scoring.
        path: Directory holding ``train_annotated.json``,
            ``train_distant.json`` and ``dev.json``. A ``ref`` sub-directory
            is created there and passed to ``gen_train_facts`` as its cache
            directory.

    Returns:
        Tuple ``(re_f1, evi_f1, re_f1_ignore_train_annotated,
        re_f1_ignore_train, re_p, re_r)``. The two "ignore" variants compute
        precision after discarding correct predictions whose fact also
        appears in the respective training file. Any ratio whose denominator
        is zero is reported as 0.

    Raises:
        FileNotFoundError: If ``train_distant.json`` is missing from ``path``.
    """
    truth_dir = os.path.join(path, 'ref')
    os.makedirs(truth_dir, exist_ok=True)

    fact_in_train_annotated = gen_train_facts(os.path.join(path, "train_annotated.json"), truth_dir)
    if not os.path.exists(os.path.join(path, "train_distant.json")):
        # The message names the file that is actually being checked.
        raise FileNotFoundError(
            "Sorry, the file: 'train_distant.json' is too big to upload to github, "
            "please manually download to 'data/' from DocRED GoogleDrive "
            "https://drive.google.com/drive/folders/1c5-0YwnoJx8NS6CV2f-NoTHR__BdkNqw"
        )
    fact_in_train_distant = gen_train_facts(os.path.join(path, "train_distant.json"), truth_dir)

    with open(os.path.join(path, "dev.json")) as dev_file:
        truth = json.load(dev_file)

    # Gold standard: (title, relation, head, tail) -> set of evidence ids.
    std = {}
    tot_evidences = 0
    title2vectexSet = {}
    for x in truth:
        title = x['title']
        title2vectexSet[title] = x['vertexSet']
        for label in x['labels']:
            # Tolerate labels without evidence annotations.
            evidence = label.get('evidence', ())
            std[(title, label['r'], label['h'], label['t'])] = set(evidence)
            tot_evidences += len(evidence)
    tot_relations = len(std)

    # Sorting makes duplicates adjacent so one pass can drop them. The sort
    # stays in place to preserve the original side effect on the argument.
    tmp.sort(key=lambda x: (x['title'], x['h_idx'], x['t_idx'], x['r']))
    submission_answer = []
    previous_key = None
    for x in tmp:
        key = (x['title'], x['h_idx'], x['t_idx'], x['r'])
        if key != previous_key:
            submission_answer.append(x)
        previous_key = key

    correct_re = 0
    correct_evidence = 0
    pred_evi = 0
    correct_in_train_annotated = 0
    correct_in_train_distant = 0
    for x in submission_answer:
        title = x['title']
        h_idx = x['h_idx']
        t_idx = x['t_idx']
        r = x['r']
        # Predictions for documents absent from dev.json still count towards
        # the precision denominator but can never be correct.
        if title not in title2vectexSet:
            continue
        vertexSet = title2vectexSet[title]
        evi = set(x['evidence']) if 'evidence' in x else set()
        pred_evi += len(evi)

        if (title, r, h_idx, t_idx) not in std:
            continue
        correct_re += 1
        correct_evidence += len(std[(title, r, h_idx, t_idx)] & evi)

        # A correct prediction counts as "seen in training" when any mention
        # pair of the two entities forms a known training fact.
        in_train_annotated = in_train_distant = False
        for n1 in vertexSet[h_idx]:
            for n2 in vertexSet[t_idx]:
                fact = (n1['name'], n2['name'], r)
                if fact in fact_in_train_annotated:
                    in_train_annotated = True
                if fact in fact_in_train_distant:
                    in_train_distant = True
        if in_train_annotated:
            correct_in_train_annotated += 1
        if in_train_distant:
            correct_in_train_distant += 1

    n_pred = len(submission_answer)
    # Guard every ratio so empty predictions or an empty gold set give 0
    # instead of raising ZeroDivisionError.
    re_p = 1.0 * correct_re / n_pred if n_pred > 0 else 0
    re_r = 1.0 * correct_re / tot_relations if tot_relations > 0 else 0
    re_f1 = _harmonic_f1(re_p, re_r)

    evi_p = 1.0 * correct_evidence / pred_evi if pred_evi > 0 else 0
    evi_r = 1.0 * correct_evidence / tot_evidences if tot_evidences > 0 else 0
    evi_f1 = _harmonic_f1(evi_p, evi_r)

    # The 1e-5 term keeps the denominator non-zero when every prediction was
    # a correct, already-seen training fact.
    re_p_ignore_train_annotated = 1.0 * (correct_re - correct_in_train_annotated) / (n_pred - correct_in_train_annotated + 1e-5)
    re_p_ignore_train = 1.0 * (correct_re - correct_in_train_distant) / (n_pred - correct_in_train_distant + 1e-5)
    re_f1_ignore_train_annotated = _harmonic_f1(re_p_ignore_train_annotated, re_r)
    re_f1_ignore_train = _harmonic_f1(re_p_ignore_train, re_r)

    return re_f1, evi_f1, re_f1_ignore_train_annotated, re_f1_ignore_train, re_p, re_r


def _harmonic_f1(precision, recall):
    """Return the F1 score of ``precision`` and ``recall``, or 0 if both are 0."""
    if precision + recall == 0:
        return 0
    return 2.0 * precision * recall / (precision + recall)