-
Notifications
You must be signed in to change notification settings - Fork 0
/
webScraper.py
65 lines (55 loc) · 1.98 KB
/
webScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from lxml import html
import requests
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
#from PyQt4 import *
from datetime import datetime
class Render(QWebPage):
    """QWebPage that loads one URL and blocks until the load has finished.

    Constructing an instance creates a QApplication, starts loading *url*
    in the page's main frame and runs the Qt event loop. The event loop is
    left from the ``loadFinished`` handler, after which the main frame is
    available as ``self.frame``.
    """

    def __init__(self, url):
        """Load *url* and return once ``loadFinished`` has fired."""
        # The QApplication is created before the QWebPage is initialised,
        # in the same order as the page is set up below.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        target = QUrl(url)
        main_frame = self.mainFrame()
        main_frame.load(target)
        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the frame and stop the event loop.

        *result* is the value delivered by the signal; it is not inspected.
        """
        self.frame = self.mainFrame()
        self.app.quit()
def main():
    """Scrape the event listing and return the rows whose event has food.

    The browse page is rendered through ``Render`` (its table is produced by
    JavaScript). For every event row, the linked detail page is fetched with
    ``requests`` and searched for a cell mentioning 'Food Policy'.

    Row layout, as used by this module:
        row[0]     start time
        row[1]     end time
        row[2][0]  event name (link; its ``href`` is the detail page)
        row[3][0]  location
        row[4]     group name

    Returns:
        list: the listing-table ``<tr>`` elements whose detail page has a
        ``<td class="bold w">`` cell containing 'Food Policy'. Each row is
        included at most once.
    """
    url = 'https://ems.cuit.columbia.edu/VirtualEMS/BrowseEvents.aspx'
    r = Render(url)  # Javascript rendering
    page_html = r.frame.toHtml()
    formatted_result = str(page_html.toAscii())
    listing = html.fromstring(formatted_result)
    rows = listing.xpath('//*[@id="ctl00_pc_ListViewGrid"]/tbody/tr')

    food = []
    # Events start at index 2 of the table rows.
    for row in rows[2:]:
        href = row[2][0].attrib.get('href')
        if href is None:
            # No detail link to follow, so the food policy cannot be checked.
            continue
        page = requests.get("https://ems.cuit.columbia.edu" + href)  # html scraping
        detail = html.fromstring(page.content)
        cells = detail.xpath('//td[@class="bold w"]')
        # A cell without text has ``.text is None``; treat it as empty.
        # ``any`` also guarantees a row is added once even when several
        # cells mention the food policy.
        if any('Food Policy' in (cell.text or '') for cell in cells):
            food.append(row)
    return food
def make_text(food):
    """Return ``[name, start time, location]`` for events not yet started.

    Args:
        food: iterable of event rows as returned by ``main``. For each row,
            ``row[0].text`` is the start time in ``%I:%M %p`` form,
            ``row[2][0].text`` the event name and ``row[3][0].text`` the
            location.

    Returns:
        list: one ``[name, start_time, location]`` list per event whose
        start time is later than the current local time of day. Rows with
        no start-time text are skipped.

    Raises:
        ValueError: if a start time is present but does not match
            ``%I:%M %p``.
    """
    time_format = '%I:%M %p'
    # Take one snapshot so every event is compared against the same instant.
    now = datetime.now().time()
    output = []
    for event in food:
        # Scraped cell text may be missing or padded with whitespace, which
        # strptime would reject.
        event_time = (event[0].text or '').strip()
        if not event_time:
            continue
        if datetime.strptime(event_time, time_format).time() > now:
            output.append([
                event[2][0].text,  # event name
                event_time,        # time
                event[3][0].text,  # place
            ])
    return output
# Build the report before opening the file: opening with 'w+' truncates it,
# so a failed scrape would otherwise wipe the previous output.
food_report = str(make_text(main()))
# The context manager closes the file even if the write raises.
with open('list of food.txt', 'w+') as f1:
    f1.write(food_report)