# process_data.py
import csv
import requests
import collections, re
from datetime import datetime
from lxml import html
from multiprocessing import Manager, cpu_count, Lock, Pool
# This script fetches news articles (from NYTimes or any other news site) by URL and extracts their body text.
# The extracted text is then used to generate a bag of words.
def main(ticker):
    """Build a bag-of-words CSV for the dates listed in a ticker's equity file.

    Reads ``result/<ticker>.csv`` and registers the value in the second column
    of every row as a date of interest. Then scans ``articles.csv`` (first row
    is a header; URL in the first column, date in the fourth) and calls
    ``get_sumbag`` for each article whose date was registered. Finally writes
    ``result/<ticker>_result.csv`` with one row per registered date and one
    column per collected word, and prints the elapsed time.

    Args:
        ticker: Label used in the input/output file names and written to the
            ``ticker`` column of every output row.

    Raises:
        IOError/OSError: If one of the input files cannot be opened.
    """
    start = datetime.now()
    pool = Pool(processes=cpu_count())
    try:
        manager = Manager()
        words = manager.list()
        bags = manager.dict()

        # NOTE(review): no header row is skipped for this file - confirm the
        # equity CSV really has none.
        with open('result/%s.csv' % ticker, 'r') as ms_equity:
            for row in csv.reader(ms_equity):
                if len(row) < 2:
                    # Blank or truncated line: there is no date column to read.
                    continue
                bags[(ticker, row[1])] = dict()

        with open('articles.csv', 'r') as ms_articles:
            reader = csv.reader(ms_articles)
            next(reader, None)  # skip the header row
            for row in reader:
                if len(row) < 4:
                    continue
                url = row[0]
                date = row[3]
                if (ticker, date) in bags:
                    # NOTE: Pool.apply blocks until the call returns, so the
                    # articles are processed one at a time. get_sumbag does a
                    # read-then-write on `bags`, so switching to apply_async
                    # needs that update to be made atomic first.
                    pool.apply(func=get_sumbag, args=(ticker, url, bags, words, date, ))
    finally:
        # Always release the worker processes, even when a file is missing or
        # a worker raised.
        pool.close()
        pool.join()

    id_columns = ['ticker', 'date']
    # Sorted so the column order is reproducible between runs. A counted word
    # equal to an identifier column name is left out: it would duplicate that
    # column and its count would be overwritten by the identifier anyway.
    vocabulary = sorted(set(words) - set(id_columns))

    # Write to CSV file. newline='' lets the csv module control line endings
    # (otherwise every row is followed by a blank line on Windows).
    with open('result/%s_result.csv' % ticker, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=id_columns + vocabulary)
        writer.writeheader()
        for (row_ticker, row_date), bag in bags.items():
            # Words that only occur in other articles are written as 0.
            record = {word: bag.get(word, 0) for word in vocabulary}
            record['ticker'] = row_ticker
            record['date'] = row_date
            writer.writerow(record)

    print('Total Operation took: %s' % str(datetime.now() - start))
def get_sumbag(ticker, url, bags, words, date):
    """Fetch one article and record its bag of words under ``(ticker, date)``.

    Downloads ``url``, takes the text nodes of every
    ``<p class="story-body-text story-content">`` element, lower-cases them and
    counts the word tokens. Tokens listed in ``stopwords.txt`` (one per line)
    and tokens that occur only once are discarded. The surviving words are
    appended to ``words`` and their counts are stored in ``bags``; if the date
    already holds counts from an earlier article, the counts are added
    together.

    Args:
        ticker: Ticker label; first half of the key into ``bags``.
        url: Address of the article to download.
        bags: Mapping of ``(ticker, date)`` to a word-count dict. The entry for
            this ticker/date must already exist; it is replaced, not mutated,
            so a ``multiprocessing`` manager dict sees the update.
        words: List (or manager list proxy) that receives every kept word;
            duplicates are not removed here.
        date: Date string; second half of the key into ``bags``.

    Raises:
        KeyError: If ``(ticker, date)`` is not present in ``bags``.
        requests.exceptions.RequestException: If the download fails or times
            out.
    """
    # A timeout keeps one unresponsive server from hanging the worker forever.
    response = requests.get(url, timeout=30)
    tree = html.fromstring(response.content)
    paragraphs = tree.xpath('//p[@class="story-body-text story-content"]/text()')

    # Generate bag of words
    counts = collections.Counter()
    for text in paragraphs:
        counts.update(re.findall(r'\w+', text.lower()))

    # Remove articles & common be-verbs. strip() instead of line[:-1] so the
    # last entry is not truncated when the file has no trailing newline; a set
    # gives constant-time membership tests.
    with open('stopwords.txt', 'r') as stop_words:
        removals = {line.strip() for line in stop_words}

    bag = {word: count for word, count in counts.items()
           if count > 1 and word not in removals}

    # Pass a real list: a manager proxy pickles its arguments, and a dict keys
    # view cannot be pickled on Python 3.
    words.extend(list(bag))

    # Merge with counts from earlier articles on the same date. This
    # read-then-write is not atomic across processes; it is safe only while the
    # caller dispatches the articles one at a time.
    key = (ticker, date)
    existing = bags[key]
    if existing:
        merged = collections.Counter(existing)
        merged.update(bag)
        bags[key] = dict(merged)
    else:
        bags[key] = bag

    print('Sumbag Length: %i' % len(bag))
if __name__ == '__main__':
    # The ticker is hard-coded; find and replace it to process another symbol.
    main('US MSFT')