-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_extractor.py
106 lines (91 loc) · 4.08 KB
/
feature_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8-*-
from typing import List, Dict
import numpy as np
from pydantic import BaseModel
from tqdm import tqdm
# Reserved vocabulary symbols: the padding marker and the unknown-item marker.
S_PAD, S_UNK = '[PAD]', '[UNK]'
# Integer ids paired with the reserved symbols above (padding -> 0, unknown -> 1).
V_PAD, V_UNK = 0, 1
class Vocabulary(BaseModel):
    # String -> integer-id lookup tables, one per feature kind.
    token: Dict[str, int] = {}  # token string -> id
    char: Dict[str, int] = {}  # single character -> id
    pos: Dict[str, int] = {}  # part-of-speech tag -> id
    label: Dict[str, int] = {}  # label string -> id

    def idx_to_label(self):
        """Return the label table inverted, i.e. a dict mapping id -> label string."""
        inverted = {}
        for label_name, label_id in self.label.items():
            inverted[label_id] = label_name
        return inverted
class Example(BaseModel):
    # One integer-encoded sequence as produced by FeatureExtractor._convert_example.
    # Token strings exactly as read from the input, before any lowercasing.
    raw_token: List[str] = []
    # Token ids looked up in the 'token' vocabulary.
    token: List[int] = []
    # For each token, the ids of its characters from the 'char' vocabulary.
    char: List[List[int]] = []
    # Ids of the per-token values taken from the second input column ('pos' vocabulary).
    pos: List[int] = []
    # Ids of the per-token values taken from the last input column ('label' vocabulary).
    label: List[int] = []
class FeatureExtractor(object):
    """Turn column-formatted text files into integer-encoded ``Example`` objects.

    Each non-empty input line describes one token: the first column is the
    token, the second column is its POS tag and the last column is its label.
    An empty line ends the current example. The extractor keeps one
    string -> id table per entry of ``FEATURES`` and can grow those tables
    while reading training data.
    """

    FEATURES = ['token', 'char', 'pos', 'label']

    def __init__(self, vocab=None, do_lowercase=True):
        """Create an extractor.

        Args:
            vocab: Optional existing vocabulary. Either an object exposing one
                attribute per feature name (such as ``Vocabulary``) or a dict
                keyed by feature name. Non-empty tables are copied and reused;
                missing or empty tables are initialised from scratch.
            do_lowercase: When true, tokens are lowercased before the token
                vocabulary lookup.
        """
        self._vocab = {}
        for name in self.FEATURES:
            provided = self._provided_mapping(vocab, name)
            if provided:
                # Copy so that growing the table during training never mutates
                # the object the caller handed in.
                self._vocab[name] = dict(provided)
            else:
                self._vocab[name] = {S_PAD: V_PAD, S_UNK: V_UNK}
                if name == 'label':
                    # A fresh label table always knows the 'O' label.
                    self._vocab[name]['O'] = len(self._vocab[name])
        self._do_lowercase = do_lowercase

    @staticmethod
    def _provided_mapping(vocab, name):
        """Return the table for ``name`` from a caller-supplied vocab, or None."""
        if vocab is None:
            return None
        if isinstance(vocab, dict):
            return vocab.get(name)
        return getattr(vocab, name, None)

    def vocab(self) -> Vocabulary:
        """Return the current lookup tables wrapped in a ``Vocabulary``."""
        return Vocabulary(**self._vocab)

    def _new_raw_example(self):
        """Return an empty per-feature accumulator ('char' is derived later from tokens)."""
        return {name: [] for name in self.FEATURES if name != 'char'}

    def run(self, paths: List[str], separator, for_train=False) -> List[Example]:
        """Read every file in ``paths`` and convert its contents to examples.

        Args:
            paths: Files to read (opened as UTF-8 text).
            separator: Column separator passed to ``str.split`` for each line.
            for_train: When ``True``, unseen items are added to the vocabularies;
                otherwise unseen items map to the unknown id.

        Returns:
            The converted examples of all files, in reading order. Blank lines
            that do not close a non-empty example produce nothing.

        Raises:
            Exception: Any error raised while reading or parsing a file is
                reported on stdout together with the file and line index and
                then re-raised unchanged.
        """
        examples: List[Example] = []
        for path in tqdm(paths, desc='file'):
            # Defined up front so the error report below works even when
            # open() itself fails; -1 means "no line was read yet".
            idx = -1
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    raw_example = self._new_raw_example()
                    for idx, line in enumerate(f):
                        # Skip a document header on the very first line only.
                        if idx == 0 and line.startswith('-DOCSTART-'):
                            continue
                        line = line.strip('\n')
                        if line:
                            tokens = line.split(separator)
                            raw_example['token'].append(tokens[0])
                            raw_example['pos'].append(tokens[1])
                            raw_example['label'].append(tokens[-1])
                        elif raw_example['token']:
                            # A blank line closes the example, but only if it
                            # actually collected tokens (the dict itself is
                            # always truthy, so test its contents).
                            examples.append(self._convert_example(raw_example, for_train))
                            raw_example = self._new_raw_example()
                    # Flush the last example when the file lacks a trailing blank line.
                    if raw_example['token']:
                        examples.append(self._convert_example(raw_example, for_train))
            except Exception as e:
                print(f"{path}:{idx} 에 오류가 있습니다. = {str(e)}")
                raise

        print(f"total examples: {len(examples)}")
        print(f"label num: {len(self._vocab['label'])}")
        print(f"pos num: {len(self._vocab['pos'])}")
        token_lengths = [len(example.token) for example in examples]
        char_lengths = [len(c) for example in examples for c in example.char]
        # np.max raises ValueError on an empty sequence, so only report length
        # statistics when there is something to measure.
        if token_lengths:
            print(f"max token length: {np.max(token_lengths)}")
            print(f"avg token length: {np.mean(token_lengths)}")
        if char_lengths:
            print(f"max char length: {np.max(char_lengths)}")
            print(f"avg char length: {np.mean(char_lengths)}")
        return examples

    def _preprocess(self, token):
        """Return ``token`` lowercased when lowercasing is enabled, else unchanged."""
        return token.lower() if self._do_lowercase else token

    def _convert_token(self, elements, vocab, for_train=False, ignore_case=False):
        """Map every item of ``elements`` to its id in ``vocab``.

        Args:
            elements: Iterable of strings (a list of tokens/tags, or a single
                string that is then iterated character by character).
            vocab: The string -> id table to use; extended in place when training.
            for_train: Only the literal ``True`` enables adding unseen items,
                each receiving the next free id.
            ignore_case: Lowercase every item before the lookup.

        Returns:
            A list of ids; items missing from ``vocab`` map to ``V_UNK``.
        """
        values = []
        for e in elements:
            if ignore_case:
                e = e.lower()
            if for_train is True and e not in vocab:
                vocab[e] = len(vocab)
            values.append(vocab.get(e, V_UNK))
        return values

    def _convert_example(self, raw_example: Dict[str, List[str]], for_train: bool = False) -> Example:
        """Encode one raw example (lists of strings per feature) as an ``Example``.

        Token ids are looked up on the preprocessed tokens, while character
        ids are taken from the raw tokens, so character case is preserved.
        The ``raw_example`` argument is not modified.
        """
        raw_tokens = raw_example['token']
        # Work on a shallow copy so the caller's dict keeps its original tokens.
        prepared = dict(raw_example)
        prepared['token'] = [self._preprocess(t) for t in raw_tokens]

        example = {'raw_token': raw_tokens}
        for name in self.FEATURES:
            if name == 'char':
                example[name] = [self._convert_token(t, self._vocab[name], for_train)
                                 for t in raw_tokens]
            else:
                example[name] = self._convert_token(prepared[name], self._vocab[name], for_train)
        return Example(**example)