This repository has been archived by the owner on Sep 5, 2022. It is now read-only.
/
webctl.py
executable file
·155 lines (126 loc) · 3.7 KB
/
webctl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/python
# -*- coding: utf-8 -*-
#================================================
#copyright:GPL
from __future__ import with_statement
"""
"""
__author__ = "xurenlu"
__version__ = "0.1"
__copyright__ = "Copyright (c) 2008 xurenlu"
__license__ = "LGPL"
import subprocess
import sys
sys.path.append('/usr/lib/python2.5/site-packages/')
sys.path.append('/usr/lib/python2.6/dist-packages/')
sys.path.append("/var/lib/python-support/python2.5/")
sys.path.append("/var/lib/python-support/python2.6/")
sys.path.append("/usr/share/pyshared/")
sys.path.append("/usr/lib/pyshared/python2.5/")
sys.path.append("/usr/lib/pyshared/python2.6/")
import sys, atexit
import signal,time,re
import imp
import getopt
import hyer.document
import hyer.event
import hyer.dbwriter
import hyer.pcolor
import hyer.spider
import hyer.misc
import hyer.config
def sig_exit():
    """Announce shutdown on stdout and terminate the process.

    Prints the current time prefixed with ``[end time]:``, then a banner
    rendered by ``hyer.pcolor.pcolorstr`` with the ``PHIGHLIGHT``, ``PRED``
    and ``PBLACK`` constants (presumably highlight / red / black -- confirm
    against hyer.pcolor), and finally calls ``sys.exit()`` with no status.
    """
    colors = hyer.pcolor
    print("[end time]:" + str(time.time()))
    banner = colors.pcolorstr(
        "CAUGHT SIG_EXIT signal,exiting...",
        colors.PHIGHLIGHT,
        colors.PRED,
        colors.PBLACK
    )
    print(banner)
    # sys.exit() raises SystemExit, so nothing after this call would run.
    sys.exit()
def handler(signum, frame):
    """Signal handler: shut the crawler down through ``sig_exit()``.

    Every signal this handler is installed for is treated the same way.

    signum -- number of the delivered signal (not inspected)
    frame  -- interrupted stack frame supplied by the signal module (unused)

    Does not return normally: ``sig_exit()`` ends in ``sys.exit()``, which
    raises ``SystemExit``.
    """
    # The previous version followed this call with checks for signals 3, 2
    # and 9 that each called sig_exit() again, plus a final "return None".
    # None of that could ever execute because sig_exit() exits first, and
    # signal 9 (SIGKILL) cannot be delivered to a handler in the first place,
    # so the dead branches are dropped.
    sig_exit()
def at_exit():
    """Exit hook: print the end time, the elapsed run time and a separator.

    Reads the module-level ``start_time`` to compute the ``[cost time]``
    value as ``time.time() - start_time``.

    NOTE(review): no ``atexit.register(at_exit)`` call is visible in this
    file -- confirm how this hook is meant to be invoked.
    """
    finished_at = time.time()
    print("[end time]:" + str(finished_at))
    # Elapsed time is computed only after the end time has been printed,
    # matching the original statement order.
    elapsed = finished_at - start_time
    print("[cost time]:" + str(elapsed))
    print("\n=========================\n")
def usage():
    """Print the hyer version, author and homepage, then exit with status 0.

    The values come from ``hyer.__version__``, ``hyer.__author__`` and
    ``hyer.__homepage__``. Each is printed after its label, separated by a
    single space, between blocks of blank lines.
    """
    labelled_attrs = (
        ("Hyer crawler version ", "__version__"),
        ("Author:", "__author__"),
        ("Homepage:", "__homepage__"),
    )
    print("\n")
    for label, attr_name in labelled_attrs:
        # Look each attribute up just before printing it, so lines are
        # emitted one by one exactly as separate print statements would.
        print("%s %s" % (label, getattr(hyer, attr_name)))
    print("\n\n\n")
    sys.exit(0)
# Route interrupt, terminate and quit signals through handler(), which ends
# the process via sig_exit().
signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)
signal.signal(signal.SIGQUIT, handler)  # previously the bare number 3
# Ignore SIGCHLD: the main process does not need to reap exited children or
# reclaim their resources, and ignoring the signal avoids zombie processes.
signal.signal(signal.SIGCHLD, signal.SIG_IGN)

# Python 2 only: sys.setdefaultencoding is removed from the sys namespace
# once the site module has run, so reload(sys) is needed to get it back
# before making UTF-8 the implicit str<->unicode codec.
reload(sys)
sys.setdefaultencoding("utf-8")

# Wall-clock start of the run; at_exit() subtracts it to report the cost time.
start_time = time.time()

# Command line: "-c <value>" is stored in config["conf"], "-h" shows usage().
try:
    optlist, args = getopt.getopt(sys.argv[1:], "c:h")
except getopt.GetoptError:
    # An unknown option or a missing "-c" argument used to surface as a raw
    # traceback. Report it on stderr and exit with a non-zero status instead.
    # sys.exc_info() is used rather than "except ... as" so the syntax stays
    # valid on the Python 2.5 installations this script adds paths for.
    sys.stderr.write("%s: %s\n" % (sys.argv[0], sys.exc_info()[1]))
    sys.exit(2)
config = {"conf": None, "help": False}
for (c, v) in optlist:
    if c == "-c":
        config["conf"] = v
    if c == "-h":
        config["help"] = True
if config["help"]:
    usage()
# NOTE(review): config["conf"] is stored but never read by any code visible
# in this file -- confirm whether a config-file loader is still missing.

# Crawl settings handed to hyer.spider.spider().
conf = {
    "db_path": "./tmp/",
    "feed": "http://www.xinhuanet.com/newscenter/index.htm",
    #"feed":"http://localhost/htests/",
    "max_in_minute": 60,
    "agent": "Mozilla/Firefox",
    "same_domain_regexps": [re.compile("http://www.xinhuanet.com/")],
    #"same_domain_regexps":[re.compile("http://localhost/htests/")],
    # NOTE(review): hyer.urldb is not among the imports at the top of this
    # file; this lookup relies on another hyer import having loaded it --
    # confirm.
    "url_db": hyer.urldb.Urldb_mysql({"host": "localhost", "user": "root", "pass": "", "db": "hyer2"}),
    "task": "profiletest",
    "leave_domain": False,
    "document": hyer.document.SimpleHTMLDocument
}
# NOTE(review): start() builds its own spider from conf, and no code visible
# here uses this module-level instance -- confirm whether it is still needed.
spider = hyer.spider.spider(conf)

# Connection and target-table settings for the MySQL document writer.
writerconf = {
    "host": "localhost",
    "user": "root",
    "pass": "",
    "db": "hyer2",
    "table": "xinhuall",
    "fields": ["url", "content", "tags", "charset"]
}
# The only visible reference to wdb is a commented-out wdb.run(doc) call in
# handle_new_doc().
wdb = hyer.dbwriter.MySQLWriter(writerconf)
def handle_new_doc(doc):
    """Callback for a newly crawled document.

    Prints a progress line, prints a notice when ``doc["charset"]`` is not
    exactly ``"UTF-8"``, and copies ``doc["URI"]`` into ``doc["url"]``.

    doc -- mapping with at least the keys "charset" and "URI"; it is
           modified in place and nothing is returned.
    """
    print("handle new doc:")
    charset = doc["charset"]
    if charset != "UTF-8":
        # Four items joined by single spaces, as a comma-separated print
        # statement would emit them.
        notice_parts = ("[notice] charset not utf8 but ", charset, ":", doc["URI"])
        print("%s %s %s %s" % notice_parts)
    doc["url"] = doc["URI"]
    # Disabled steps, kept for reference:
    #doc["tags"]=hyer.misc.gettags(doc["content"])
    #doc["update_time"]=9200
    #wdb.run(doc)
# Matches URIs that end in "content_<digits>.html". It is compiled once
# instead of on every call, and written as a raw string so that "\." reaches
# the regex engine without passing through an invalid string-literal escape.
_CONTENT_URI_RE = re.compile(r".*content_[0-9]+\.html$")


def filter_new_doc(doc):
    """Set ``doc["update_time"]`` according to the shape of ``doc["URI"]``.

    URIs ending in ``content_<digits>.html`` get 31635000; every other URI
    gets 7200. (Presumably seconds, i.e. about a year versus two hours --
    confirm against how the spider consumes ``update_time``.)

    doc -- mapping with a "URI" string; it is modified in place.

    Returns the same ``doc`` object.
    """
    if _CONTENT_URI_RE.match(doc["URI"]):
        doc["update_time"] = 31635000
    else:
        doc["update_time"] = 7200
    return doc
def start():
    """Build a spider, hook up the document callbacks and run the crawl loop.

    Creates a ``hyer.spider.spider`` from the module-level ``conf``. It then
    registers ``handle_new_doc`` as a "new_document" event handler and
    ``filter_new_doc`` as a "new_document" filter through ``hyer.event``,
    and finally calls the spider's ``run_loop()``.
    """
    crawler = hyer.spider.spider(conf)
    events = hyer.event
    # Registration order is kept as-is: event handler first, then filter.
    events.add_event("new_document", handle_new_doc)
    events.add_filter("new_document", filter_new_doc)
    crawler.run_loop()
# Script entry point. The call is unguarded (no __name__ check), so the crawl
# starts as soon as this module is executed or imported.
start()