/
analyse_mit.py
112 lines (93 loc) · 3.43 KB
/
analyse_mit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
import sqlite3 as sql
from collections import defaultdict
import operator
def get_stopwords():
words = []
for line in open('stopwords.txt'):
words.append(line.strip().lower())
return words
def get_tweets(cursor):
tweets = []
# Database was created with user and tweet columns as opposites, therefore we get user column instead of tweet
query_results = cursor.execute('SELECT user FROM tweets WHERE user NOT LIKE "%free%" AND user NOT LIKE "%pengikut%"').fetchall()
for res in query_results:
if 'chair' in res[0]:
continue
tweets.append(clean_words(res[0]))
return tweets
def clean_words(words):
return words.replace('\n', '').replace('.', '').replace(':', '').replace(',', '').replace('"', '').replace('?','').replace('!', '').replace('(', '').replace(')', '').replace("'", "").strip()
def get_tweets_by_intervals(tweets, interval):
tweets_by_interval = dict()
start_point = 0
for iteration in range(1,11):
tweets_by_interval[iteration] = set(all_tweets[start_point:interval*iteration])
start_point += interval
return tweets_by_interval
def clean_a_tweet(tweet):
words = tweet.split(' ')
cleaned = []
for word in words:
word = word.lower()
if word in STOPWORDS or word.startswith('#') or word.startswith('@') or word == '' or word.startswith('http') or word in '123456789.?!,/+|' or word.startswith('"') or word.startswith('&'):
continue
else:
cleaned.append(word.lower())
return cleaned
def clean_tweets(raw_tweets):
cleaned = dict()
for interval, tweets in raw_tweets.iteritems():
cleaned[interval] = []
for tweet in tweets:
cleaned[interval].append(clean_a_tweet(tweet))
return cleaned
def get_frequency_table(raw_tweets):
freqs = defaultdict(int)
for interval, tweets in raw_tweets.iteritems():
for tweet in tweets:
for word in tweet:
freqs[word] += 1
return freqs
def get_interval_frequency_table(raw_tweets):
freqs = dict()
for interval, tweets in raw_tweets.iteritems():
if interval not in freqs.keys():
freqs[interval] = defaultdict(int)
for tweet in tweets:
for word in tweet:
freqs[interval][word] += 1
return freqs
def write_all_words_to_file(fname):
textfile = open(fname, 'w+')
for interval, tweet in cleaned_tweets.iteritems():
for ls in tweet:
for word in ls:
textfile.write("%s " % word)
textfile.close()
def write_freq_to_csv(sorted_freq, fname):
output_file = open(fname, 'w+')
for word,freq in sorted_freq:
output_file.write('%s,%s\n' % (word, freq))
output_file.close()
def write_hourly_freq_to_csv(frequencies,fname):
i = 1
for interval in frequencies.keys():
output_file = open('%s%s.txt' % (fname, i), 'w+')
current_dict = frequencies[interval]
for word, freq in current_dict.iteritems():
output_file.write("%s,%s\n" % (word, freq))
i += 1
output_file.close()
DATABASE = 'mit.db'
STOPWORDS = get_stopwords()
connection = sql.connect(DATABASE)
cursor = connection.cursor()
all_tweets = get_tweets(cursor)
tweets_by_interval = get_tweets_by_intervals(all_tweets, 3600)
cleaned_tweets = clean_tweets(tweets_by_interval)
frequencies = get_frequency_table(cleaned_tweets)
sorted_freq = sorted(frequencies.iteritems(), key=operator.itemgetter(1))
interval_freq = get_interval_frequency_table(cleaned_tweets)
write_hourly_freq_to_csv(interval_freq, 'hourly_frequencies')
#write_freq_to_csv(sorted_freq, 'frequencies.txt')