/
analyze.py
278 lines (206 loc) · 9.41 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
"""
messenger-data-analysis parses and analyzes Facebook Messenger** data dumps, producing both quantitative statistics and textual analysis.
** Not associated with, authorized by, or endorsed by Facebook. The Messenger software and its related trademarks are owned by Facebook.
"""
import os
import math
from datetime import datetime
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import dates
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from unidecode import unidecode
# Module metadata: authorship, licensing, version, and release status.
__author__ = "Siraj Chokshi"
__copyright__ = "Copyright 2020, Siraj Chokshi"
__license__ = "NCSA/University of Illinois"
__version__ = "1.0.0"
__maintainer__ = "Siraj Chokshi"
__email__ = "sirajsc2@illinois.edu"
__status__ = "Prototype"
class Chat:
    """A named conversation and the senders collected for it."""

    def __init__(self, name):
        # Senders are attached one at a time through add().
        self.senders = []
        self.name = name

    def add(self, sender):
        """Append *sender* to the end of this chat's sender list."""
        self.senders += [sender]
class Message:
    """A single message: its text, its raw timestamp, and a datetime view.

    ``timestamp`` is divided by 1000 and floored to whole seconds before
    being converted with ``datetime.fromtimestamp``.
    """

    def __init__(self, contents, timestamp):
        whole_seconds = timestamp // 1000.0
        self.contents, self.timestamp = contents, timestamp
        self.datetime = datetime.fromtimestamp(whole_seconds)
class Sender:
    """A chat participant together with the messages they sent."""

    def __init__(self, name):
        # Display name of the participant.
        self.name = name
        # Message objects sent by this participant.
        self.messages = []
        # Participation counter; starts at -1 and is summed by merge().
        self.participated = -1

    def add_fb_message(self, message):
        """Wrap a raw message dict in a Message and store it.

        Reads ``message["content"]`` and ``message["timestamp_ms"]``; a
        missing key raises KeyError, which callers may catch to skip the
        entry.
        """
        msg = Message(message["content"], message["timestamp_ms"])
        self.messages.append(msg)

    def sort_messages(self):
        """Order the stored messages by timestamp, newest first."""
        self.messages = sorted(self.messages, key=lambda x: x.timestamp, reverse=True)

    def merge(self, sender_b):
        """Fold another record of the same sender into this one.

        The other sender's messages are appended to this sender's messages
        (previously they replaced them, discarding this sender's history)
        and the participation counters are added together. The result is
        not re-sorted; call sort_messages() afterwards if order matters.

        Raises:
            ValueError: if the two senders have different names.
        """
        if self.name != sender_b.name:
            raise ValueError(
                "cannot merge sender {!r} into {!r}: names differ".format(
                    sender_b.name, self.name
                )
            )
        # extend() copies the references, so the two senders never end up
        # sharing one list object.
        self.messages.extend(sender_b.messages)
        self.participated += sender_b.participated
# A conversation file needs more than this many messages to be parsed.
CONVERSATION_THRESHOLD = 20

# Every Chat built while reading the inbox.
chats = list()

# part_freq: participant name -> participation tally.
# message_freq: sender name -> number of messages sent.
part_freq, message_freq = dict(), dict()
# Walk every conversation folder under ./inbox, tally participants and
# senders, and build one Chat per two-person conversation that has enough
# messages.
for folder in os.listdir('inbox'):
    # skip hidden entries
    if folder.startswith('.'):
        continue
    folder_path = os.path.join('inbox', folder)
    # a stray regular file in the inbox would make os.listdir() raise
    if not os.path.isdir(folder_path):
        continue
    current_chat = Chat("tmp")
    # raw message dicts of this chat, grouped by sender name
    users = {}
    for data in os.listdir(folder_path):
        if not data.endswith('.json'):
            continue
        with open(os.path.join(folder_path, data), encoding='utf-8') as f:
            # load in JSON data from each file
            json_data = json.load(f)
        # name of the conversation
        current_chat.name = json_data['title']
        # list of messages for this chat
        messages = json_data['messages']
        # list of participants (by full name) for this chat
        participants = json_data['participants']
        # determines if this file belongs to a group chat by seeing how many participants it has
        # NOTE: this does NOT account for group chats where individuals have been removed leaving 2 others
        is_group = len(participants) >= 3
        for participant in participants:
            current = participant['name']
            # The tally is zero-based: the first sighting records 0 and each
            # later one adds 1. Only the relative ordering is used later on.
            if current in part_freq:
                part_freq[current] += 1
            else:
                part_freq[current] = 0
        if is_group or len(messages) <= CONVERSATION_THRESHOLD:
            continue
        for message in messages:
            if message['type'] != 'Generic':
                continue
            current = message['sender_name']
            # Update the two tallies independently. message_freq spans all
            # chats while users is per chat, so a shared try/except used to
            # reset the global count to 1 whenever a known sender showed up
            # in a new chat.
            message_freq[current] = message_freq.get(current, 0) + 1
            users.setdefault(current, []).append(message)
    if not users:
        continue
    for key, raw_messages in users.items():
        current_sender = Sender(key)
        for message in raw_messages:
            try:
                current_sender.add_fb_message(message)
            except (KeyError, TypeError, ValueError, OverflowError, OSError):
                # best effort: skip entries that have no text content or an
                # unusable timestamp instead of aborting the whole import
                continue
        current_sender.sort_messages()
        current_chat.add(current_sender)
    chats.append(current_chat)
# Participants ordered by tally, highest first. The top entry is treated as
# the host (the owner of the data dump) when picking the "main" speaker.
frequent_participants = sorted(part_freq.items(), key=lambda x: x[1], reverse=True)

# The tokenizer and stop-word set do not depend on the query, so build them
# once instead of on every search.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
stop_words.update([
    'like', 'u', 'ur', 'im', 'also', 'gonna', 'cuz', 'really', 'rly',
    'actually', 'tho', 'y', 'ye', 'yea', 'yeah', 'yee', 'no', 'na', 'nah',
    'nahh', 'would', 'should', 'bc', 'dont', 'go', 'get', 'much', 'kinda',
    'want', 'one', 'r', 'c', 'cool', 'think', 'good', 'w', 'good', 'got',
    'thats', 'could', 'pretty', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '0', 'know', 'right', 'lot', 'lol', 'thought',
])
stop_words.update(['itas', 'iam', 'donat', 'thatas', 'iall', 'didnat', 'canat', 'havenat'])

# Interactive loop: look up a conversation by sender name, print the message
# share, and plot a message timeline plus each sender's most frequent words.
while True:
    UINDEX = -1
    query_term = input("\u001b[37;1mSearch for a user/conversation (or type '!q' to exit):\u001b[0m\n")
    if query_term == '!q':
        break

    # Case-insensitive substring match on sender names; the last matching
    # chat wins.
    needle = query_term.lower()
    for i, chat in enumerate(chats):
        if any(needle in sender.name.lower() for sender in chat.senders):
            UINDEX = i

    if UINDEX == -1:
        print("\nNo Results :(\n")
        continue

    senders = chats[UINDEX].senders
    # The analysis below compares two speakers; a chat where only one person
    # sent countable messages would otherwise fail with an IndexError.
    if len(senders) < 2:
        print("\nNot enough senders in this conversation to compare :(\n")
        continue

    # The "main" speaker is whichever of the first two senders is not the host.
    if frequent_participants[0][0] == senders[0].name:
        main_speaker_index, other_speaker_index = 1, 0
    else:
        main_speaker_index, other_speaker_index = 0, 1
    main_speaker = senders[main_speaker_index]
    other_speaker = senders[other_speaker_index]

    # Percentage of chat
    speaker_one = main_speaker.name
    total_messages = len(main_speaker.messages) + len(other_speaker.messages)
    if total_messages == 0:
        # every raw entry was skipped while loading; nothing to divide or plot
        print("\nNo readable messages in this conversation :(\n")
        continue
    speaker_one_percentage = len(main_speaker.messages) / total_messages * 100
    print()
    print("\u001b[37;1m{} sent {:.2f}% of {} messages\u001b[0m".format(speaker_one, speaker_one_percentage, total_messages))

    # Messages against Time plot
    time_one = [message.datetime for message in main_speaker.messages]
    time_two = [message.datetime for message in other_speaker.messages]
    # Span both speakers so neither one's messages fall outside the histogram
    # range (it used to be derived from the main speaker alone).
    all_times = time_one + time_two
    first = min(all_times)
    last = max(all_times)
    # roughly one bin per month of calendar-year span, never fewer than 12
    bins = max((last.year - first.year) * 12, 12)

    fig, ax = plt.subplots(2, 2, figsize=(12, 8))
    plt.subplots_adjust(left=0.08, bottom=0.1, right=0.92, top=0.96, wspace=0.18, hspace=0.4)
    ax[0][0].hist(time_one, bins=bins, color='r', alpha=0.6, range=(first, last), label=main_speaker.name)
    ax[0][0].hist(time_two, bins=bins, color='y', alpha=0.6, range=(first, last), label=other_speaker.name)
    ax[0][0].set_title("Messages vs Time")
    ax[0][0].tick_params(labelsize=8)
    ax[0][0].set_ylabel("Messages")
    ax[0][0].set_xlabel("Time")
    plt.sca(ax[0, 0])
    plt.xticks(rotation=45)
    ax[0][0].legend()

    # Most frequent words, one bar chart per sender (first two senders only,
    # matching the two bottom subplots).
    for counter, sender in enumerate(senders[:2]):
        # Join with spaces so the last word of one message does not fuse
        # with the first word of the next and distort the counts.
        all_messages = " ".join(msg.contents for msg in sender.messages)
        all_messages = unidecode(all_messages.lower())
        tokens = tokenizer.tokenize(all_messages)
        filtered_tokens = [w for w in tokens if w not in stop_words]
        text = nltk.Text(filtered_tokens)
        fdist = nltk.FreqDist(text)
        most_common = fdist.most_common(10)
        print(sender.name + ": ", most_common)

        x = [val[0] for val in most_common]
        height = [val[1] for val in most_common]
        x2 = list(range(0, len(most_common)))
        ax[1][counter].bar(x2, height)
        ax[1][counter].set_title(sender.name + "'s Most Frequent Words")
        ax[1][counter].set_ylabel("Occurances")
        ax[1][counter].set_xlabel("Token")
        # print each count just above its bar, nudged left by its digit width
        for index, value in enumerate(height):
            ax[1][counter].text(index - len(str(value)) / 8, value + 0.1, str(value), color="black", fontweight='bold')
        plt.sca(ax[1, counter])
        if height:
            # max() of an empty list raises; leave default limits when a
            # sender has no countable tokens
            plt.ylim(0, max(height) * 1.2)
        plt.xticks(x2, x, rotation=45)
    print()
    plt.show()
# def clean_content(split_list):
# for item in split_list:
# if ()
# previous_amount = -1
# for user in frequent_participants:
# if user[1] > 0:
# if user[1] == previous_amount:
# print(user[0], end=", ")
# else:
# print("\n{}\t{}".format(user[1],user[0]), end=", ")
# previous_amount = user[1]
# frequent_senders = sorted(message_freq.items(), key=lambda x: x[1], reverse=True)
# for user in frequent_senders:
# if user[1] > 1:
# print("{}\t{}".format(user[1],user[0]))