/
scraper.py
196 lines (152 loc) · 7.42 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python3
import csv
import wget
import os
import datetime
import config
import requests
from bs4 import BeautifulSoup
# Module-level state. `remove_list` and `last_update` are not referenced
# anywhere else in this file -- presumably they are read by importing modules.
remove_list = {}
last_update = 0
# Day of the month at the moment this module is imported.
clock = datetime.datetime.today().day

# database file directories: one path per line, in the order in which
# database_scrape() wrote them (municipality, national, NICE download).
with open('directories.txt', 'r') as directorylist:
    directories = [entry.strip('\n') for entry in directorylist]

# The first three recorded paths get dedicated names.
mun_data = directories[0]
NL_data = directories[1]
nice_data = directories[2]
def database_scrape():
    """Refresh the local data files and reload the in-memory tables.

    Steps, in order:

    1. Delete the files downloaded by the previous run; their paths are read
       from ``directories.txt``.
    2. If today's date (``dd-mm-YYYY``) is not yet present as the first field
       of any line in ``data/RIVM.csv`` and the local hour is 15 or later,
       scrape the ``csvData`` element of the RIVM page and append its rows,
       each prefixed with today's date.
    3. Download the CoronaWatchNL CSV files into ``./data`` and write the
       resulting paths to ``directories.txt`` (one per line).
    4. Call ``dataextract()`` to rebuild the tables.

    Errors from file access, the HTTP request, or the downloads propagate to
    the caller; only failures to delete an old download are ignored.
    """
    # Read the paths recorded by the previous run.
    with open('directories.txt', 'r') as directorylist:
        dirs = [entry.strip('\n') for entry in directorylist]

    # Remove the old downloads (if available). Each removal is attempted on
    # its own so that one missing file does not leave the others behind.
    for old_file in dirs:
        try:
            os.remove(old_file)
        except OSError:
            pass

    # list of raw repository files; append a URL here when a new repo is used
    page = [
        'https://raw.githubusercontent.com/J535D165/CoronaWatchNL/master/data/rivm_NL_covid19_hosp_municipality.csv',
        'https://raw.githubusercontent.com/J535D165/CoronaWatchNL/master/data/rivm_NL_covid19_national.csv',
        'https://raw.githubusercontent.com/J535D165/CoronaWatchNL/master/data/nice_ic_by_day.csv',
    ]

    # RIVM website scraping
    # Read the stored file and collect the date stamp (first field) of every
    # line to find out whether today's data has already been appended.
    now = datetime.datetime.now()
    today = now.strftime('%d-%m-%Y')
    with open('data/RIVM.csv', 'r') as db:
        stored_lines = db.readlines()
    stored_dates = {line.rstrip('\n').split(',', 1)[0] for line in stored_lines}

    # Original note: RIVM only updates its data at 14:00. The code waits until
    # the local hour is at least 15 before scraping.
    if today not in stored_dates and now.hour >= 15:
        rivm_db = requests.get('https://www.rivm.nl/coronavirus-kaart-van-nederland-per-gemeente')
        soup = BeautifulSoup(rivm_db.content, 'html.parser')
        results = soup.find(id="csvData")
        rows = results.get_text().rstrip('\n').replace(';', ',').lower().split('\n')
        # drop the leading blank line and the header before appending
        rows = rows[2:]
        if rows:
            # Append data to the csv file, each row prefixed with the date stamp.
            with open("data/RIVM.csv", 'a') as db:
                # The appended text never ends with a newline, so start on a
                # fresh line when the stored file does not end with one;
                # otherwise the first new row is glued onto the last old row.
                if stored_lines and not stored_lines[-1].endswith('\n'):
                    db.write('\n')
                db.write('\n'.join(f"{today},{row}" for row in rows))

    # download new files and record their paths in directories.txt (for deleting)
    directory = [wget.download(url, out='./data') for url in page]
    with open('directories.txt', 'w') as directorylist:
        directorylist.write('\n'.join(directory))

    dataextract()
def dataextract():
    """Rebuild ``config.municipalities`` and ``config.provinces`` from ``data/RIVM.csv``.

    Both lists are emptied first. Every CSV row (after an optional header)
    must start with a ``dd-mm-YYYY`` date and have at least seven columns.
    Each row adds one ``config.municipality`` entry and either updates the
    matching province entries or appends a new ``config.province`` entry.
    """
    config.municipalities.clear()
    config.provinces.clear()

    with open('data/RIVM.csv', 'r') as csvfile:
        # Let the sniffer inspect the first 1024 characters to decide whether
        # a header row is present, then rewind to the start of the file.
        sample = csvfile.read(1024)
        header_present = csv.Sniffer().has_header(sample)
        csvfile.seek(0)

        records = csv.reader(csvfile, delimiter=',')
        if header_present:
            next(records)

        for row in records:
            # Column 0 holds the date stamp as day-month-year.
            date_parts = row[0].split("-")
            row_date = datetime.date(int(date_parts[2]), int(date_parts[1]), int(date_parts[0]))

            # NOTE(review): the argument order follows config.municipality's
            # constructor, which is not visible here -- confirm the column
            # mapping against config.py.
            config.municipalities.append(
                config.municipality(row_date, row[2].lower(), row[1], '', row[4], row[6], row[3], '', row[5]))

            # Add this row's column 4 to every province entry that has the
            # same name (column 3, lower-cased) and the same date.
            province_key = row[3].lower()
            found = False
            for entry in config.provinces:
                if entry.name == province_key and entry.date == row_date:
                    found = True
                    entry.hospitalised += int(row[4])

            if not found:
                # NOTE(review): new entries get an empty name and column 5,
                # while matches above compare on column 3 and add column 4 --
                # looks inconsistent; confirm the intended columns.
                config.provinces.append(config.province(row_date, '', int(row[5])))
def returnmunicipality(municipality, days):
    """Build the reply text with hospitalisation figures for one place.

    Args:
        municipality: Name to look up. It is compared verbatim with the
            ``name`` of every entry in ``config.municipalities`` and
            ``config.provinces``.
        days: Offset into the matching entries once they are sorted by date,
            newest first; anything ``int()`` accepts. ``0`` reports only the
            newest entry. Note that this is an index into the stored entries,
            not a calendar-day difference.

    Returns:
        A message string: the comparison between the newest entry and the
        entry ``days`` positions back, the newest figure alone when ``days``
        is 0, a "No data available" message when that offset does not exist
        (or the figures are not numeric), or a "no municipality or province
        known" message when nothing matches the name.

    Raises:
        ValueError, TypeError: If ``days`` cannot be converted by ``int()``.
    """
    days = int(days)

    # Collect [date, name, hospitalised] for matching municipalities first,
    # then for matching provinces.
    entries = [[m.date, m.name, m.hospitalised]
               for m in config.municipalities if m.name == municipality]
    entries += [[p.date, p.name, p.hospitalised]
                for p in config.provinces if p.name == municipality]

    if not entries:
        return (f"There is no municipality or province known called {municipality}\n"
                f"For help type '!corona help'")

    # Newest first; the sort is stable, so equal dates keep their order.
    entries.sort(key=lambda entry: entry[0], reverse=True)
    latest = entries[0]

    if days == 0:
        return (f"{municipality.capitalize()}, {latest[0]}:\n"
                f"There has been {latest[2]} hospitalized in {latest[1].capitalize()} on {latest[0]}.")

    try:
        # A negative offset would silently index from the oldest end of the
        # list, so treat it like an offset that does not exist.
        if days < 0:
            raise IndexError(days)
        earlier = entries[days]
        difference = int(latest[2]) - int(earlier[2])
    except (IndexError, ValueError, TypeError):
        unit = "day" if days == 1 else "days"
        return f"No data available from {days} {unit} ago for {latest[1]}."

    trend = "more" if difference > 0 else "less"
    return (f"{municipality.capitalize()}, {days} days ago:\n"
            f"Today there have been {abs(difference)} {trend} people hospitalized as on {earlier[0]} in {earlier[1].capitalize()}.\n"
            f"Today there has been {latest[2]} people hospitalized.")
def listProv():
    """Return the known province names, one capitalised name per line.

    The data is reloaded with ``dataextract()`` first. Names are
    de-duplicated and listed in alphabetical order; every name, including the
    last one, is followed by a newline.

    Returns:
        The list as a single string; an empty string when there are no
        province entries.
    """
    dataextract()

    # The province table lives on the config module (dataextract() fills
    # config.provinces); this module has no `provinces` global of its own.
    # dict.fromkeys removes duplicates while keeping first-seen order.
    names = list(dict.fromkeys(entry.name for entry in config.provinces))

    # Sort on the whole name rather than on its first character, so empty
    # names cannot raise IndexError and equal initials are ordered too.
    names.sort(key=str)

    return "".join(f"{str(name).capitalize()}\n" for name in names)
# TODO: problem when the municipality does not exist but a day value is given: array (index) error