-
Notifications
You must be signed in to change notification settings - Fork 2
/
json_parse.py
103 lines (84 loc) · 2.87 KB
/
json_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import sqlite3
import json
from datetime import datetime, timedelta
from tqdm import tqdm
import pickle as pkl
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from userinfo import YOUR_NAME, START_DAY, END_DAY
# Message data that this parser ignores:
# reactions, downloaded files, audio files, plans.
def parse_file(f):
    """Parse one message-thread JSON export into a summary tuple.

    Args:
        f: Path to the thread's JSON file (e.g. ``.../message_1.json``).

    Returns:
        A 7-tuple ``(thread_id, title, people_count, date_times, texts,
        user, group)``:

        * ``thread_id`` -- the value of ``data['thread_path']``.
        * ``title`` -- ``str`` of the thread title, or of the thread path
          when the export has no ``'title'`` key.
        * ``people_count`` -- dict mapping sender name to message count.
        * ``date_times`` -- one naive local-time ``datetime`` per message.
        * ``texts`` -- one string per message (``""`` when it has no text).
        * ``user`` -- the sender name of each message, in message order.
        * ``group`` -- ``False`` only when a ``'participants'`` list is
          present and has at most two entries; ``True`` otherwise.

    Raises:
        KeyError: If ``'thread_path'`` or ``'messages'`` is missing, or a
            message has neither ``'timestamp_ms'`` nor ``'timestamp'``.
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default locale encoding when decoding the file.
    with open(f, encoding="utf-8") as json_file:
        data = json.load(json_file)

    thread_id = data['thread_path']
    # No title seems to happen when someone deletes their account.
    title = data.get('title', thread_id)

    # A thread without a participants list (talking to a bot) is treated as
    # a group chat; otherwise it is a group when more than two people are in it.
    group = 'participants' not in data or len(data['participants']) > 2

    people_count = {}
    date_times = []
    texts = []
    user = []
    for msg in data['messages']:
        # The sender name is absent when the user left the group or deleted
        # their account.
        usr = msg.get('sender_name', "NO SENDER NAME")
        people_count[usr] = people_count.get(usr, 0) + 1

        # Messages without text (photo- or sticker-only) get an empty string.
        # Photos and other attachments are not part of the returned tuple.
        texts.append(msg.get('content', ""))

        # Timestamps are in milliseconds under 'timestamp_ms', with a
        # seconds-based 'timestamp' key as the fallback.
        if 'timestamp_ms' in msg:
            timestamp = int(int(msg['timestamp_ms']) / 1000)
        else:
            timestamp = int(msg['timestamp'])
        date_times.append(datetime.fromtimestamp(timestamp))
        user.append(usr)

    return (thread_id, str(title), people_count, date_times, texts, user, group)
def list_files(paths):
    """Return the absolute paths of every entry directly inside ``paths``.

    Args:
        paths: Iterable of directory paths (absolute, or relative to the
            current working directory).

    Returns:
        A list of strings, one per directory entry, in the order the
        directories were given. Each entry is the directory's resolved
        absolute path joined with the entry name. Entries are not filtered,
        so both files and sub-directories are included.

    Raises:
        OSError: If one of the directories does not exist or cannot be read.
    """
    files = []
    for path in paths:
        # Resolve the directory without changing into it, so the process-wide
        # working directory is never modified (and cannot be left pointing at
        # the wrong place if listing fails).
        prefix = os.path.realpath(path)
        files.extend(os.path.join(prefix, name) for name in os.listdir(path))
    return files
# Directories whose entries are scanned for message threads; each thread is
# expected to be a folder containing a "message_1.json" file.
paths = ["facebook/messages/archived_threads", "facebook/messages/inbox"]
files = list_files(paths)
# print("START")
fails = []    # thread paths whose message_1.json could not be parsed
success = []  # tuples returned by parse_file, one per parsed thread
for f in tqdm(files):
    # list_files returns full paths, so compare only the final component
    # when skipping generic non-thread files.
    if os.path.basename(f) == ".DS_Store":
        continue
    filename = os.path.join(f, "message_1.json")
    try:
        success.append(parse_file(filename))
    except Exception as e:
        # Best effort: report the failure, remember the thread, keep going.
        print("Failed to parse. Exception: ", repr(e), " ", "/", filename)
        fails.append(f)
# Use context managers so the pickle files are flushed and closed.
with open("messages.pkl", "wb") as out_file:
    pkl.dump(success, out_file)
print("Dumped messages.pkl")
with open("fails.pkl", "wb") as out_file:
    pkl.dump(fails, out_file)
print("Dumped fails.pkl")