-
Notifications
You must be signed in to change notification settings - Fork 3
/
data_utils.py
51 lines (37 loc) · 1.73 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from stemmer import StemTokenizer
import numpy as np
def normalise(X):
    """
    Cast X to float32 and flatten each sample into a single feature row.

    Note: despite its name, this function does not rescale values; it only
    converts the dtype and collapses all axes after the first into one.

    :param X: input feature vectors; any object exposing ``shape``, ``astype`` and ``reshape`` (e.g. a numpy array) whose first axis indexes samples
    :return: float32 data of shape (n_samples, n_features), where n_features is the product of the trailing dimensions (1 for 1-D input)
    """
    X = X.astype('float32')
    # shape[0] rather than len(): it is available on every shape-bearing
    # container, including ones that do not define len().
    n_samples = X.shape[0]
    # Force an integer product: for 1-D input the trailing shape is empty and
    # the default product is the float 1.0, which reshape() rejects.
    n_features = int(np.prod(X.shape[1:], dtype=int))
    return X.reshape((n_samples, n_features))
def load_bow_vectors_and_labels(input_data_file,
                                max_features=10000,
                                min_df=10):
    """
    Read a tab-separated citation file and build its bag-of-words matrix together with the label array.

    :param input_data_file: a TSV file that provides an 'abstracts' column (citation abstract text) and a 'labels' column (classification label)
    :param max_features: forwarded to CountVectorizer as max_features (caps the vocabulary size)
    :param min_df: forwarded to CountVectorizer as min_df (document-frequency cut-off)
    :return: X, y -- X is the output of CountVectorizer.fit_transform on the abstracts, y is a numpy array of the labels
    """
    # Parse the TSV into a data frame and pull the two columns out by name.
    frame = pd.read_csv(input_data_file, delimiter='\t')
    abstracts = list(frame['abstracts'])
    labels = list(frame['labels'])

    # The stemming tokenizer is constructed with the number of documents it will see.
    stem_tokenizer = StemTokenizer(num_docs=len(abstracts))

    # Bag-of-words encoder; vocabulary is filtered by max_features and min_df.
    vectorizer = CountVectorizer(
        tokenizer=stem_tokenizer,
        stop_words='english',
        min_df=min_df,
        max_features=max_features,
    )

    bow_matrix = vectorizer.fit_transform(abstracts)
    label_array = np.asarray(labels)
    return bow_matrix, label_array