/
tendims.py
149 lines (137 loc) · 5.79 KB
/
tendims.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import sys
import os
import numpy as np
from os.path import join
from features.embedding_features import ExtractWordEmbeddings
from models.lstm import LSTMClassifier
import torch
import nltk
from nltk.tokenize import TweetTokenizer
# Tweet-aware word tokenizer used to split input text before embedding lookup.
tokenize = TweetTokenizer().tokenize
from nltk import sent_tokenize
# Punkt sentence-splitter data is required by sent_tokenize (used in
# compute_score_split); downloaded at import time as a side effect.
nltk.download('punkt')
# The ten social dimensions this package scores; mirrored per-instance in
# TenDimensionsClassifier.dimensions_list.
dimensions_list = ['support', 'knowledge', 'conflict', 'power', 'similarity', 'fun', 'status', 'trust', 'identity', 'romance']
class TenDimensionsClassifier:
    """Scores text along the ten dimensions of social exchange.

    One pre-trained LSTM classifier (and its matching word-embedding model)
    is loaded per dimension from ``models_dir``. Scoring is best-effort:
    a dimension whose model or embedding fails yields ``None`` rather than
    raising, as documented on the scoring methods.
    """

    def __init__(self, models_dir='./models/lstm_trained_models',
                 embeddings_dir='./embeddings', is_cuda=False):
        """
        @param models_dir: the directory where the LSTM models are stored.
                           Checkpoints are matched by filename: it must contain
                           '-best.lstm', the dimension name, and one of
                           'glove' / 'word2vec' / 'fasttext'.
        @param embeddings_dir: the directory where the embeddings are stored.
                               The directory must contain the following subdirectories:
                               word2vec/GoogleNews-vectors-negative300.wv
                               fasttext/wiki-news-300d-1M-subword.wv
                               glove/glove.42B.300d.wv
        @param is_cuda: to enable cuda; silently falls back to CPU (and resets
                        the flag) when CUDA is not actually available.
        """
        self.is_cuda = is_cuda
        self.models_dir = models_dir
        self.embeddings_dir = embeddings_dir
        # Load all three embedding models up front; each trained checkpoint
        # declares (via its filename) which one it was trained against.
        self.em_glove = ExtractWordEmbeddings('glove', emb_dir=self.embeddings_dir)
        self.em_word2vec = ExtractWordEmbeddings('word2vec', emb_dir=self.embeddings_dir)
        self.em_fasttext = ExtractWordEmbeddings('fasttext', emb_dir=self.embeddings_dir)
        self.dimensions_list = ['support', 'knowledge', 'conflict', 'power', 'similarity',
                                'fun', 'status', 'trust', 'identity', 'romance']
        # One (model, embedding) pair per dimension.
        self.dim2model = {}
        self.dim2embedding = {}
        for dim in self.dimensions_list:
            model = LSTMClassifier(embedding_dim=300, hidden_dim=300)
            if self.is_cuda:
                print(f'Torch version: {torch.__version__}')
                print(f'Torch CUDA available : {torch.cuda.is_available()}')
                if torch.cuda.is_available():
                    print(f'Torch current device : {torch.cuda.current_device()}')
                    print(f'Torch device count : {torch.cuda.device_count()}')
                    print(f'Torch device name : {torch.cuda.get_device_name(0)}')
                    model.cuda()
                else:
                    print('Cuda not available. Instantiated the TenDimensionsClassifier with CUDA=False')
                    self.is_cuda = False
            model.eval()  # inference only; disables dropout etc.
            em = None  # guard: stays None if the checkpoint names no known embedding
            for modelname in os.listdir(self.models_dir):
                # Take the first best-checkpoint whose filename mentions this dimension.
                if '-best.lstm' in modelname and dim in modelname:
                    best_state = torch.load(join(self.models_dir, modelname), map_location='cpu')
                    model.load_state_dict(best_state)
                    if 'glove' in modelname:
                        em = self.em_glove
                    elif 'word2vec' in modelname:
                        em = self.em_word2vec
                    elif 'fasttext' in modelname:
                        em = self.em_fasttext
                    self.dim2model[dim] = model
                    self.dim2embedding[dim] = em
                    break

    def _parse_input_dimensions(self, d):
        """Normalize a dimension spec into an iterable of dimension names.

        @param d: None (all dimensions), a single name, or a list/tuple of names
        @return an iterable of dimension names
        @raise ValueError: on any other input type
        """
        if d is None:
            return self.dimensions_list
        elif isinstance(d, str):
            return [d]
        elif isinstance(d, (list, tuple)):
            return d
        else:
            raise ValueError('Unrecognized input for dimension or dimension list: %s' % d)

    def compute_score(self, text, dimensions=None):
        """
        Computed dimension(s) scores on the whole input text
        @param text: the input text
        @param dimensions: a string representing the dimension or a list of strings for
                           multiple dimensions. None triggers the computation of all dimensions
        @return the confidence score for the selected dimension
                a dictionary dimension:score is returned if multiple dimensions were specified
                None (or dimension:None) is returned when the dimension could not be computed
        """
        dimension_scores = {d: None for d in self._parse_input_dimensions(dimensions)}
        if text is not None and text != '':
            for dim in dimension_scores:
                try:
                    model = self.dim2model[dim]
                    em = self.dim2embedding[dim]
                    input_ = em.obtain_vectors_from_sentence(tokenize(text), True)
                    input_ = torch.tensor(input_).float().unsqueeze(0)  # batch of one
                    if self.is_cuda:
                        input_ = input_.cuda()
                    output = model(input_)
                    # Model emits a logit; sigmoid maps it to a [0, 1] confidence.
                    dimension_scores[dim] = torch.sigmoid(output).item()
                except Exception:
                    # Best-effort by contract: a missing model (KeyError) or any
                    # embedding/inference failure leaves this dimension at None.
                    pass
        if len(dimension_scores) == 1:
            return list(dimension_scores.values())[0]
        return dimension_scores

    def compute_score_split(self, text, dimensions=None, min_tokens=3):
        """
        Computed dimension(s) scores on each sentence of the input text and returns aggregated
        stats (avg, max, min, std)
        @param text: the input text
        @param dimensions: a string representing the dimension or a list of strings for
                           multiple dimensions. None triggers the computation of all dimensions
        @param min_tokens: the minimum number of tokens in a sentence for the dimension to be computed
        @return a tuple (avg, max, min, std) of confidence scores for the selected dimension
                a dictionary dimension:(avg, max, min, std) is returned if multiple dimensions were specified
                (None, None, None, None) entries are returned when the dimension could not be computed
        """
        # Fix: initialize with a 4-tuple so the empty-text path matches the
        # documented (avg, max, min, std) return shape (was a 2-tuple).
        dimension_scores = {d: (None, None, None, None)
                            for d in self._parse_input_dimensions(dimensions)}
        if text is not None and text != '':
            for dim in dimension_scores:
                scores = []
                try:
                    sentences = sent_tokenize(text)
                except Exception:
                    # If sentence splitting fails, score the text as one unit.
                    sentences = [text]
                for sent in sentences:
                    sent = str(sent)
                    # 'nan' guards against stringified pandas missing values.
                    if sent is not None and sent != '' and sent != 'nan' and len(sent.split()) >= min_tokens:
                        try:
                            score = self.compute_score(sent, dim)
                            if score is not None:
                                scores.append(score)
                        except Exception:
                            pass  # best-effort: skip sentences that fail to score
                if scores:
                    dimension_scores[dim] = (np.mean(scores), np.max(scores),
                                             np.min(scores), np.std(scores))
                else:
                    dimension_scores[dim] = (None, None, None, None)
        if len(dimension_scores) == 1:
            return list(dimension_scores.values())[0]
        return dimension_scores