/
file_io_utils.py
60 lines (45 loc) · 1.48 KB
/
file_io_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
'''
Liam Wang: 111407491
Oswaldo Crespo: 107700568
Varun Goel: 109991128
Ziang Wang: 112077534
'''
'''
A utility module that houses all the code used for writing the collected data
to a JSON or CSV file.
This was required for building the training datasets for our model.
'''
import sys
import json
def get_output_filename():
    '''
    Build the output paths for the county-level and personal-level data.

    The default file names can be overridden from the command line:
    sys.argv[1] replaces the county-level file name and sys.argv[2] replaces
    the personal-level file name. Both are placed under the 'twitter/' folder.
    When only one argument is supplied, the county-level path is overridden
    and the personal-level path keeps its default.

    Returns:
        dict with the keys 'county_lv' and 'personal_lv', each mapped to a
        file path string.
    '''
    folder = 'twitter/'
    output_files = {
        'county_lv': 'twitter/county_level_intermediate.json',
        'personal_lv': 'twitter/personal_level_intermediate.csv',
    }
    try:
        output_files['county_lv'] = '{}{}'.format(folder, sys.argv[1])
        output_files['personal_lv'] = '{}{}'.format(folder, sys.argv[2])
    except IndexError:
        # A missing command-line argument is the only expected failure here;
        # a bare except would also swallow KeyboardInterrupt and real bugs.
        print('Write files to default directory')
    return output_files
def transfer2intermediate(line, header):
    '''
    Project one record onto an ordered list of its values.

    Args:
        line: object indexable by the entries of header (e.g. a dict).
        header: iterable of keys that fixes the order of the output values.

    Returns:
        list containing line[key] for every key in header, in header order.
        A key that line cannot resolve propagates the lookup error
        (KeyError for a dict).
    '''
    return [line[key] for key in header]
def to_csv(line):
    '''
    Serialize one row of values into a single delimited string.

    Every value is converted with str() and the results are joined with a
    tab character. Note that, despite the function name, the delimiter is a
    tab rather than a comma, and no quoting or escaping is applied.

    Args:
        line: iterable of values making up one row.

    Returns:
        str with the row's values separated by tabs ('' for an empty row).
    '''
    return '\t'.join(map(str, line))
def write_output_file(rdd, filepath):
    '''
    Write rdd to filepath, choosing the writer from the path's suffix.

    A path ending in 'json' is handed to write_json_file and a path ending in
    'csv' is handed to write_csv_file. Any other suffix is ignored and
    nothing is written.

    Args:
        rdd: the dataset to write; passed through unchanged to the writer.
        filepath: destination path string whose suffix selects the format.
    '''
    if filepath.endswith('json'):
        write_json_file(rdd, filepath)
    elif filepath.endswith('csv'):
        write_csv_file(rdd, filepath)
def write_json_file(rdd, filepath):
    '''
    Write every record of rdd to filepath as one JSON document per line.

    Each record is serialized with json.dumps and the serialized records are
    separated by a newline; no trailing newline is written. An empty rdd
    produces an empty file.

    Args:
        rdd: dataset exposing map() and collect() (presumably a PySpark RDD
            of JSON-serializable records).
        filepath: path of the local file to create or overwrite.
    '''
    # Collect the serialized records and join them once. Reducing with
    # pairwise string concatenation re-copies the growing string on every
    # step and fails on an empty dataset, which has nothing to reduce.
    serialized = rdd.map(json.dumps).collect()
    with open(filepath, 'w') as f:
        f.write('\n'.join(serialized))
def write_csv_file(rdd, filepath):
    '''
    Save rdd to filepath as delimited text, one record per line.

    The keys of the first record fix the column order. Every record is
    reduced to its values in that order by transfer2intermediate and turned
    into a line by to_csv. The column names themselves are not written.

    Args:
        rdd: dataset exposing take(), map() and saveAsTextFile() (presumably
            a PySpark RDD of dict-like records that share the same keys).
        filepath: output location handed to saveAsTextFile.

    Raises:
        Exception: any failure of the save other than an already existing
            output path is re-raised unchanged.
    '''
    first_rows = rdd.take(1)
    # An empty dataset has no first record to derive the column order from;
    # fall back to no columns instead of failing with IndexError.
    header = list(first_rows[0]) if first_rows else []
    lines = rdd.map(lambda x: transfer2intermediate(x, header)).map(to_csv)
    try:
        lines.saveAsTextFile(filepath)
    except Exception as err:
        # Only an existing output path is tolerated. Presumably Spark reports
        # it as a Hadoop FileAlreadyExistsException inside the error text --
        # TODO confirm against the deployed Spark version. Anything else
        # (e.g. a record missing a key) is a real failure and must surface.
        if 'FileAlreadyExistsException' not in str(err):
            raise
        print('Exception: File Already Exists')