/
subset_data.py
108 lines (92 loc) · 3.32 KB
/
subset_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
This file deals with the partitioning of the raw Reddit comments data set
100% of this code was written by our team
"""
import json
import os
from string import ascii_lowercase
# Name of the raw Reddit comments dump (one JSON comment per line).
DATA_FILE_NAME = 'RC_2016-10'
# Directory that holds the raw dump; the dump file itself lives inside it
# at data/RC_2016-10/RC_2016-10 (see partition_subreddits).
DATA_FILE_PATH = os.path.join('data', DATA_FILE_NAME)
# Directory where the per-first-letter partition files are written.
OUTPUT_PATH = os.path.join(DATA_FILE_PATH, 'by_subreddit')
"""
Convert the large data file into a bunch of smaller files partitioned
by letter the subreddit starts with. This makes subreddit lookup
much more efficient
Takes maybe 30min
"""
def partition_subreddits():
data_file = open(os.path.join(DATA_FILE_PATH, DATA_FILE_NAME), 'r')
chunk_size = 1000000
counter = 0
total_counter = 0
comments = []
for line in data_file:
counter += 1
total_counter += 1
line = line.rstrip()
comment = json.loads(line)
comments.append(comment)
if (counter > chunk_size):
write_to_files(comments)
print('Processed chunk of {}/{} comments'.format(counter, total_counter))
counter = 0
comments = []
elif (counter % 50000 == 0):
print('Processed {} comments...'.format(counter))
if (counter > 0):
write_to_files(comments)
print('Processed chunk of {}/{} comments'.format(counter, total_counter))
"""
Helper to write comments to files by alphabetical order
of subreddit
"""
def write_to_files(comments):
alphabet = '0123456789' + ascii_lowercase
comments.sort(key=lambda x: x['subreddit'].lower())
letter_index = 0
letter_file = open(os.path.join(OUTPUT_PATH, alphabet[letter_index]), 'a')
for comment in comments:
# For some reason there is a subreddit _en_nvr
if (comment['subreddit'][0] == '_'):
continue;
if (comment['subreddit'][0].lower() != alphabet[letter_index]):
while (comment['subreddit'][0].lower() != alphabet[letter_index]):
letter_index += 1
letter_file.close()
letter_file = open(os.path.join(OUTPUT_PATH, alphabet[letter_index]), 'a')
letter_file.write(json.dumps(comment) + '\n')
letter_file.close()
"""
Sort the resulting files to make our lives easier later
"""
def sort_files():
file_names = [file for file in os.listdir(OUTPUT_PATH) if os.path.isfile(os.path.join(OUTPUT_PATH, file))]
for file_name in file_names:
print("Starting {}".format(file_name))
comments = []
counter = 0
file = open(os.path.join(OUTPUT_PATH, file_name), 'r')
for line in file:
counter += 1
line = line.rstrip()
comments.append(json.loads(line))
if (counter % 50000 == 0):
print(str(counter) + ' for ' + str(file_name))
file.close()
sorted_file = open(os.path.join(OUTPUT_PATH, file_name + '.sorted'), 'w')
print('Sorting')
comments.sort(key=lambda x: x['subreddit'])
print('Sorted')
for index in range(len(comments)):
comments[index] = json.dumps(comments[index])
print('Jsonified')
for comment in comments:
sorted_file.write(comment + '\n')
print("Finished {}".format(file_name))
sorted_file.close()
if __name__ == "__main__":
print(OUTPUT_PATH)
print(DATA_FILE_NAME)
print(DATA_FILE_PATH)
partition_subreddits()
sort_files()