-
Notifications
You must be signed in to change notification settings - Fork 0
/
hate_speech_prediction.py
115 lines (84 loc) · 4.86 KB
/
hate_speech_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import csv
import numpy as np
import pandas as pd
from sklearn import svm, preprocessing, linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
# Command-line interface: which data file to read, which feature set to train
# on, and which classifier to fit. `choices` lets argparse reject unsupported
# values with a usage message before any data is loaded.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '--input',
    default='features.csv',
    type=str,
    help="Name or path of input data file (be it features or corpus)",
)
parser.add_argument(
    '--features',
    default='nlp',
    type=str,
    choices=['nlp', 'ngram'],
    help="Features you wish to train on. Options: nlp or ngram. Default: nlp",
)
parser.add_argument(
    '--model',
    default='reg',
    type=str,
    choices=['reg', 'svm'],
    help="Machine learning model you wish to use. Options: reg or svm. Default: reg",
)
args = parser.parse_args()
def build_model(arg):
    """Build the untrained classifier selected by ``arg``.

    Args:
        arg: Model key. 'reg' builds a logistic regression model, 'svm'
            builds a linear-kernel support vector classifier. Both use C=1.0.

    Returns:
        The newly constructed (not yet fitted) scikit-learn estimator.

    Raises:
        ValueError: If ``arg`` is neither 'reg' nor 'svm'.
    """
    if arg == 'reg':
        clf = linear_model.LogisticRegression(C=1.0)
        print("Logistic regression model built...")
    elif arg == 'svm':
        clf = svm.SVC(kernel="linear", C=1.0)
        print("SVM model built...")
    else:
        # This branch used to print and fall through to `return clf`, which
        # failed with an UnboundLocalError that hid the real problem.
        raise ValueError("Model argument invalid (Options: reg or svm)")
    return clf
# Load the data for the requested feature set and define X (features) and
# y (labels) for the evaluation code that follows.
if args.features == 'nlp':
    # Import the CSV holding all extracted features (binarized).
    nlp_feature_data = pd.read_csv(args.input, header=0)
    print("Data imported...")
    # These are the features used for prediction. Not included in the
    # features.csv file, but obtainable from the API: ['is_favorited',
    # 'is_retweeted', 'num_fav_corpus', 'num_RT_corpus', 'num_replies',
    # 'is_reply', 'is_reply_to_hate_tweet', 'is_quote_status',
    # 'num_followers', 'num_followees', 'num_times_user_was_listed',
    # 'num_posted_tweets', 'num_favorited_tweets']
    features = [
        'has_mentions', 'num_mentions', 'has_hashtags', 'num_hashtags',
        'has_urls', 'num_urls', 'total_positive_tokens',
        'positive_token_ratio', 'total_negative_tokens',
        'negative_token_ratio', 'total_subjective_tokens',
        'subjective_token_ratio', 'blacklist_total', 'blacklist_ratio',
        'char_count', 'token_count', 'has_uppercase_token',
        'uppercase_token_ratio', 'lowercase_token_ratio',
        'mixedcase_token_ratio', 'has_digits', 'has_questionmark',
        'has_fullstop', 'has_exclamationpoint', 'len_handle', 'len_name',
        'account_age', 'tweet_age', 'tweet_hour',
    ]
    # Main pandas data frame from which the machine learning will happen.
    df_main = pd.DataFrame({'is_hate_tweet': nlp_feature_data['is_hate_tweet']})
    # Extract and scale the NLP features.
    X_nlp = np.array(nlp_feature_data[features].values)
    X_nlp = preprocessing.scale(X_nlp)
    print('NLP features scaled...')
    # Define independent (X) and dependent (y) variables.
    # Label options: is_hate_tweet, is_favorited, is_retweeted,
    # favorited_class, retweeted_class
    y = df_main["is_hate_tweet"].values.tolist()
    X = X_nlp  # this is for JUST nlp features
    print('Variables defined...')
elif args.features == 'ngram':
    # Import the actual tweet text obtained through the API (tab-separated).
    ngram_train_data = pd.read_csv(args.input, sep='\t', header=0)
    print("Data imported...")
    # Main pandas data frame from which the machine learning will happen.
    df_main = pd.DataFrame({'is_hate_tweet': ngram_train_data['is_hate_tweet']})
    # Vectorize the tweets as character 2- to 4-grams with tf-idf scores.
    tfidf = TfidfVectorizer(analyzer="char", ngram_range=(2, 4))
    X_ngrams = tfidf.fit_transform(ngram_train_data['tweet_text'])
    # Add the (densified) n-gram feature vectors to the main data frame.
    df_main['tweetsVect'] = list(X_ngrams.toarray())
    print('N-grams extracted...')
    # Define independent (X) and dependent (y) variables.
    # Label options: is_hate_tweet, is_favorited, is_retweeted,
    # favorited_class, retweeted_class
    y = df_main["is_hate_tweet"].values.tolist()
    X = np.array(df_main['tweetsVect'].values).tolist()  # JUST ngram features
    print('Variables defined...')
else:
    # Stop here: without X and y the evaluation below cannot run. Previously
    # this branch only printed, and the script later died with a NameError.
    parser.error("Feature argument invalid (Options: nlp or ngram)")

# Build the chosen model (same call for either feature set).
clf = build_model(args.model)
# Evaluate the chosen model: a classification report on a held-out 15% split,
# followed by 10-fold cross-validated accuracy and F1 on the full data.
# NOTE(review): cross_val_score is imported at the top of this file from
# sklearn.cross_validation; that module was removed in scikit-learn 0.20 and
# the function now lives in sklearn.model_selection.
print("Running classification report...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)
clf.fit(X_train, y_train)
target_names = ["non_hate_tweet", "hate_tweet"]
# NOTE(review): assumes the is_hate_tweet labels are booleans - confirm.
labels = [False, True]
# Score the rows that train_test_split actually held out. The earlier loop
# predicted on X[-1], X[-2], ... (the tail of the full data set), which is not
# the test split, and it handed predict() one 1-D sample at a time.
y_pred = list(clf.predict(X_test))
y_true = list(y_test)
print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))
# Cross-validated scores over the full data set.
a_scores = cross_val_score(clf, X, y, cv=10)
print("Cross validation accuracy scores:", a_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (a_scores.mean(), a_scores.std() * 2))
print("...")
f1_scores = cross_val_score(clf, X, y, cv=10, scoring='f1')
print("Cross validation F1 scores:", f1_scores)
print("F1-score: %0.2f (+/- %0.2f)" % (f1_scores.mean(), f1_scores.std() * 2))