/
main.py
126 lines (86 loc) · 3.23 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from selenium import webdriver
import pandas as pd
import crawler
import collections
import random
import os
import sys
import time
import pickle
from mlogger import med_logger
# --- Crawl configuration and module-level state (shared with main() below) ---
exit_flag = False                  # set True when the whole crawl should stop for good
objective_number = 3000            # stop once this many pages have been collected
# One row of scraped metadata per successfully crawled page.
df = pd.DataFrame( columns=['url', 'length', 'claps', 'claps_people', 'tags'] )
# URLs already crawled successfully; used to avoid re-queueing duplicates.
searched_links = set()

# BUGFIX: sys.argv[1] alone raises IndexError when NO argument is given,
# which is exactly the case this message exists for. Check the length first.
if len( sys.argv ) < 2 or not sys.argv[1]:
    print( 'please provide the number of starting point' )
    sys.exit(1)
if sys.argv[1] == 'load':
    # Resume mode: restore the pending-URL queue from a previous run's pickle.
    # 'load' requires a second argument: the numeric id of the saved queue.
    if len( sys.argv ) < 3:
        print( 'please provide the id of the saved queue to load' )
        sys.exit(1)
    file_name = 'queue_pages_{}.pic'.format( sys.argv[2] )
    with open( file_name, 'rb' ) as fr:
        link_queue = pickle.load( fr )
else:
    # Fresh run: seed the queue with the starting URL selected by index argv[1].
    link_queue = collections.deque()
    start_url = [ "https://thebolditalic.com/thank-you-for-your-undivided-attention-ad39d713dc4a" ]
    med_logger.info( 'starting points : {}'.format( '\n'.join( start_url ) ) )
    link_queue.append( start_url[ int( sys.argv[1] ) ] )
def main():
    """Crawl pages breadth-first from link_queue until a stop condition.

    Stop conditions: the queue is exhausted, objective_number pages have
    been collected overall, 60 pages were collected in this session (lets
    the outer loop restart with a fresh driver), or the crawler signals an
    'OSError'.

    Mutates module globals: pops from link_queue, appends newly discovered
    links to it, records crawled URLs in searched_links, appends one row
    per page to df, and sets exit_flag when crawling should end for good.
    """
    global exit_flag, link_queue, searched_links, df
    counter = 0  # pages collected in this invocation only
    # driver = webdriver.Chrome( 'C:/Program Files (x86)/Google/chromedriver.exe' )
    # NOTE(review): PhantomJS support was removed from Selenium >= 3.8.1;
    # consider headless Chrome/Firefox if the selenium dependency is upgraded.
    driver = webdriver.PhantomJS( 'phantomjs-2.1.1/bin/phantomjs.exe', service_args=['--load-images=no'] )
    try:
        while True:
            if link_queue:
                medium_url = link_queue.popleft()
            else:
                # running out of links
                the_s = 'running out of links'
                print( the_s )
                med_logger.info( the_s )
                exit_flag = True
                break
            try:
                page_data, next_links = crawler.get_data_from_url( driver, medium_url )
            except Exception as err:
                the_s = 'error calling get_data_from_url : {}'.format( err )
                print( the_s )
                med_logger.info( the_s )
                # BUGFIX: without this continue, page_data/next_links are
                # unbound below and the isinstance check raises NameError.
                continue
            if isinstance( page_data, list ):
                # only add to record when succeed
                searched_links.add( medium_url )
                counter += 1
                inx = df.count()[0]  # next free integer row label
                df.loc[ inx ] = page_data
                for l in next_links:
                    if l not in searched_links:
                        link_queue.append( l )
            elif isinstance( page_data, str ):
                if page_data == 'OSError':
                    # Crawler reports an unrecoverable driver error; bail out
                    # so the outer loop can restart with a fresh driver.
                    break
            else:
                pass
            print( len( searched_links ) )
            if counter >= 60:
                # Per-session cap: let the outer loop recycle the driver.
                break
            if len( searched_links ) >= objective_number:
                exit_flag = True
                break
    except KeyboardInterrupt:
        pass
    finally:
        # BUGFIX: quit the driver even when an unexpected exception escapes,
        # so PhantomJS processes do not leak across outer-loop restarts.
        driver.quit()
if __name__ == '__main__':
    # Keep restarting main() (each call gets a fresh driver) until it sets
    # exit_flag — e.g. after the queue empties, the objective is reached,
    # or a fatal driver error.
    while True:
        try:
            main()
        except Exception as err:
            med_logger.error( 'Error in the main loop :\n{}\n{}\n'.format( type(err), err ) )
        if exit_flag:
            # Persist results under a random id so concurrent/successive runs
            # do not clobber each other: pages_<id>.csv holds the data,
            # queue_pages_<id>.pic the pending links (loadable via the
            # 'load' command-line mode), searched_pages_<id>.pic the
            # already-crawled set.
            file_name = 'pages_{}'.format( int( random.uniform( 0, 1000000 ) ) )
            df.to_csv( file_name + '.csv', index = False )
            with open( 'queue_{}.pic'.format( file_name ), 'wb' ) as fw:
                pickle.dump( link_queue, fw )
            # Completes the old TODO: also persist searched_links so a
            # resumed run can skip pages it has already collected.
            with open( 'searched_{}.pic'.format( file_name ), 'wb' ) as fw:
                pickle.dump( searched_links, fw )
            break