/
make_noisy_labels.py
79 lines (59 loc) · 2.74 KB
/
make_noisy_labels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import torch
import numpy as np
import pandas as pd
import glob
import ujson
import os
from snorkel.labeling import LabelingFunction
from snorkel.preprocess.nlp import SpacyPreprocessor
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier, LFApplier
from snorkel.preprocess import preprocessor
from tqdm.auto import tqdm, trange
from argparse import ArgumentParser
from snorkel_utils import keyword_lookup, make_keyword_lf
def split_data(data, label_matrix):
    '''
    Split data into labeled and unlabeled according to a particular rule-induced label set
    '''
    train_split = data['train']
    # An example counts as "covered" when at least one labeling function fired,
    # i.e. at least one entry in its row of the label matrix is >= 0
    # (abstentions are encoded as negative values).
    covered = ((torch.LongTensor(label_matrix) >= 0).sum(dim=1) > 0).bool()
    print(covered.sum())
    labeled, unlabeled = {}, {}
    for field, values in train_split.items():
        if torch.is_tensor(values):
            # Tensors support boolean-mask indexing directly.
            labeled[field] = values[covered]
            unlabeled[field] = values[~covered]
        else:
            # Plain sequences (e.g. lists of raw text) are filtered element-wise.
            labeled[field] = [v for (flag, v) in zip(covered, values) if flag == 1]
            unlabeled[field] = [v for (flag, v) in zip(covered, values) if flag == 0]
    data['labeled'] = labeled
    data['unlabeled'] = unlabeled
    return data
if __name__=='__main__':
    # Apply keyword labeling functions to every data split, attach the resulting
    # noisy label matrices, optionally split train into labeled/unlabeled, and save.
    parser = ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True, help="Path to data dictionary with train, test, and validation data")
    parser.add_argument('--rule_dict_path', type=str, required=True, help='Path to .json file of keyword rules for each class')
    parser.add_argument('--save_path', type=str, required=True, help='Path to save output data')
    parser.add_argument('--split', action='store_true', help='Split data into labeled and unlabeled for use with multi-source weak supervision model')
    args = parser.parse_args()

    data = torch.load(args.data_path)
    # Context manager closes the rule file promptly (the original
    # `ujson.load(open(...))` leaked the file handle).
    with open(args.rule_dict_path, 'r') as f:
        lf_kwds = ujson.load(f)

    # One labeling function per (class label, keyword) pair; dict keys are the
    # class labels as strings, values are keyword lists.
    lfs = [make_keyword_lf(w, int(key), rpn_generated=False) for (key, vals) in lf_kwds.items() for w in vals]

    # Label the training split and print per-LF coverage/accuracy statistics.
    label_matrix = LFApplier(lfs).apply(data['train']['text'])
    analysis_df = LFAnalysis(label_matrix).lf_summary(Y=data['train']['labels'].numpy())
    print(analysis_df)
    data['train']['noisy_labels'] = torch.LongTensor(label_matrix)

    # Attach noisy labels to the evaluation splits as well.
    for data_slice in ['test','valid']:
        noisy_labels = LFApplier(lfs).apply(data[data_slice]['text'])
        data[data_slice]['noisy_labels'] = torch.LongTensor(noisy_labels)

    # Record which keywords were used so downstream code can inspect the rules.
    used_kwds = [w for (key, vals) in lf_kwds.items() for w in vals]
    data['rule_keywords'] = used_kwds

    # Split data if desired
    if args.split:
        data = split_data(data, label_matrix)

    # Save data. `save_dir` avoids shadowing the builtin `dir`; skip the mkdir
    # when save_path has no directory component (the original crashed on
    # os.makedirs('') in that case), and exist_ok=True guards a create race.
    save_dir = os.path.dirname(args.save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save(data, args.save_path)