# search.py
import argparse
from time import sleep
from datetime import datetime, timedelta
import urllib.request
from bs4 import BeautifulSoup
import json
def parse_result(data, day, verbose):
    """Fetch the full post for every search hit contained in *data*.

    Args:
        data: Decoded search response; ``data["return"]`` must hold the HTML
            fragment listing the hits (elements with CSS class ``slink``).
        day: Date object (anything with ``strftime``) recorded on each post.
        verbose: When true, print one progress line per processed hit.

    Returns:
        A list of dicts, one per successfully fetched post, with the keys
        ``id``, ``date`` (``MM/DD/YYYY``), ``title``, ``html`` and ``text``.

    Raises:
        KeyError: If ``data`` has no ``"return"`` entry, a hit has no ``id``
            attribute, or a post response has no ``"post"`` entry.
        ValueError: If a post response body is not valid JSON.
    """
    # Local import: only needed for the exception type below, and keeps this
    # function self-contained with respect to the file's import block.
    import http.client

    parsed_data = []
    soup = BeautifulSoup(data["return"], 'lxml')
    elems = soup.select(".slink")
    for i, elem in enumerate(elems):
        # The element id carries an "id" marker around the numeric alert id.
        id_ = elem.attrs["id"].replace("id", "")
        req = urllib.request.Request(f"https://www.promedmail.org/ajax/getPost.php?alert_id={id_}")
        req.add_header("Referer", "http://www.promedmail.org")
        try:
            r = urllib.request.urlopen(req)
        except (OSError, http.client.HTTPException, ValueError) as err:
            # Best effort: a post that cannot be fetched is skipped.  The
            # exception list is explicit (network/HTTP/malformed-URL errors)
            # so that Ctrl-C and SystemExit are no longer swallowed.
            if verbose:
                print(f"{i}/{len(elems)} ID: {id_} skipped: {err}")
            continue
        # The context manager closes the response even if decoding fails.
        with r:
            post = json.loads(r.read().decode("utf-8"))
        post_html = post["post"]
        post_text = BeautifulSoup(post_html, "lxml").get_text()
        elem_dict = {
            "id": id_,
            "date": day.strftime("%m/%d/%Y"),
            "title": elem.text,
            "html": post_html,
            "text": post_text,
        }
        if verbose:
            print(f"{i}/{len(elems)} ID: {elem_dict['id']} Title: {elem_dict['title']}")
        parsed_data.append(elem_dict)
        # Pause between successful fetches to throttle requests to the server.
        sleep(0.5)
    return parsed_data
def search(search_from, search_to, verbose):
    """Run one search per day and gather the posts found.

    A query is issued for every day starting at ``search_from`` and stopping
    before ``search_to``; each query spans that day and the following one.

    Args:
        search_from: First day to query, as a ``MM/DD/YYYY`` string.
        search_to: Day at which querying stops, as a ``MM/DD/YYYY`` string.
        verbose: When true, print the date pair of each query and pass the
            flag on to ``parse_result``.

    Returns:
        A single list with the post dicts returned by ``parse_result`` for
        every queried day, in chronological query order.
    """
    date_format = "%m/%d/%Y"
    first_day = datetime.strptime(search_from, date_format)
    end_day = datetime.strptime(search_to, date_format)

    collected = []
    # Both bounds parse to midnight, so their difference is a whole number of
    # days; a zero or negative span gives an empty range and no requests.
    for offset in range((end_day - first_day).days):
        day = first_day + timedelta(days=offset)
        f = day.strftime(date_format)
        t = (day + timedelta(days=1)).strftime(date_format)
        if verbose:
            print(f, t)
        with urllib.request.urlopen(f"https://www.promedmail.org/ajax/runSearch.php?kwby1=summary&search=&date1={f}&date2={t}&feed_id=1") as res:
            payload = json.loads(res.read().decode("utf-8"))
        collected.extend(parse_result(payload, day, verbose))
    return collected
if __name__ == "__main__":
    # Command-line entry point: parse the date range and options, run the
    # search, and write the collected posts to a JSON file.
    parser = argparse.ArgumentParser(description='ProMed mail Searcher.')
    parser.add_argument(
        "from_day",
        metavar="FROM",
        type=str,
        help="The first day you want to search. The format is \"MM/DD/YYYY\"",
    )
    # NOTE(review): search() stops before issuing a query that starts on TO;
    # confirm that "last day" matches how the server treats its date2 bound.
    parser.add_argument(
        "to_day",
        metavar="TO",
        type=str,
        help="The last day you want to search. The format is \"MM/DD/YYYY\"",
    )
    parser.add_argument("--verbose", help="output verbose message", action="store_true")
    parser.add_argument("--output", type=str, help="output json file name.", default="data.json")
    args = parser.parse_args()

    result = search(args.from_day, args.to_day, args.verbose)

    # json.dump escapes non-ASCII characters by default, so the output file
    # contains plain ASCII regardless of the platform's default encoding.
    with open(args.output, "w") as f:
        json.dump(result, f, indent=2)