/
covid.py
360 lines (323 loc) · 14 KB
/
covid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
#!/bin/python3
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from datetime import datetime, timedelta
import re
import telegram_send
import sys
import requests
from requests import Session, Request
# Wrapper class to hold processed tweet data
# content: Tweet text
# time: Time of tweet
# attachments: List of attachments in the tweet
# phone_numbers: List of phone numbers parsed from the tweet content
class TweetData:
    """Value object holding the processed fields of a single tweet."""

    def __init__(self, content, time, attachments, phone_numbers):
        # Coerce defensively so downstream consumers can rely on the types.
        self.content = str(content)                # tweet body text
        self.time = str(time)                      # formatted timestamp string
        self.attachments = list(attachments)       # media URLs
        self.phone_numbers = list(phone_numbers)   # numbers parsed from the text
class TweetParser:
    """Turns a selenium tweet web element into a TweetData instance.

    All parsing is heuristic, driven by the '\n'-separated text Twitter
    renders, e.g.:
    'Jaideep Pandey\n@PandeyJaideep\n·\n1h\n#Lucknow ... Call:7947145417 ...\n4\n3\n2'
    """

    def parse_tweet(self, tweet):
        """Parse one tweet element; return TweetData, or None on any failure.

        NOTE(review): class names 'css-1dbjc4n'/'css-9pa8cd' are Twitter's
        generated CSS classes and may change with the site layout.
        """
        try:
            tweet_text = tweet.find_elements_by_class_name("css-1dbjc4n")
            # Split on '·' (separates the header from age + body), keep the
            # second part, then split on '\n'; [1:] drops the empty 0th item.
            text_cut = str((str(tweet_text[0].text).split("·"))[1]).split("\n")[1:]
            tweet_age = self.get_tweet_age(text_cut)
            text_cut = self.clean_tweet_content(text_cut[1:])
            tweet_content = self.prettify_content(self.get_tweet_text(text_cut))
            tweet_media = tweet.find_elements_by_class_name("css-9pa8cd")
            medias = self.get_tweet_media(tweet_media)
            return TweetData(tweet_content, self.twime_to_string(tweet_age), medias, self.extract_phone(tweet_content))
        except Exception as e:
            # Any layout surprise (missing media, unexpected text shape, ...)
            # drops the tweet rather than crashing the scrape loop.
            print(e)
            return None

    def clean_tweet_content(self, tweet_content):
        """Drop leading 'Replying to @a @b and @c' boilerplate lines."""
        tweet_content = list(tweet_content)
        reply_index = 0
        for word in tweet_content:
            word = str(word).strip()
            # Guard `word` being non-empty before indexing word[0]; the
            # original raised IndexError on blank lines, dropping the tweet.
            if word and (word[0] == '@' or word == "Replying to" or word == "and"
                         or ("and" in word and "others" in word)):
                reply_index += 1
            else:
                break
        return list(tweet_content[reply_index:])

    def prettify_content(self, content):
        """Post-process tweet text: trim and collapse double spaces."""
        content = str(content).strip()
        content = content.replace("  ", " ")
        return content

    # TODO : Check if this is working or not
    def is_tweet_valid(self, content):
        """Return False if the tweet contains any phrase listed (one per line)
        in the local 'exclusions' file; case-insensitive."""
        # Context manager guarantees the file is closed even if reading fails.
        with open("exclusions", "r") as excl:
            not_have = excl.readlines()
        content = str(content).lower()
        for sent in not_have:
            if sent.rstrip() in content:
                return False
        return True

    def get_tweet_text(self, tweet_content):
        """Join the body lines, stripping the trailing engagement counters
        (comments / retweets / likes) and a 'Show this thread' footer.

        The counters may or may not exist; integers < 50 at the tail are
        heuristically treated as counters (assumes the tweet is <1m old,
        i.e. counts are still small).
        """
        tweet_content = list(tweet_content)
        last_counter = 0
        if str(tweet_content[-1]).lower() == "show this thread":
            last_counter += 1
        for i in range(1 + last_counter, 4 + last_counter):
            try:
                if int(tweet_content[-i]) < 50:
                    last_counter += 1
            except (ValueError, IndexError):
                # Non-numeric line, or the list is shorter than i: not a counter.
                pass
        tweet_text = tweet_content
        if last_counter != 0:
            tweet_text = tweet_content[0:-last_counter]
        return str(" ".join(tweet_text))

    def get_tweet_age(self, tweet_content):
        """The 0th element of the split text is the tweet's age ('1h', 'Apr 24')."""
        return str(list(tweet_content)[0])

    def get_tweet_media(self, tweet_media):
        """Collect src URLs of genuine media elements (skips avatars/emoji)."""
        medias = []
        for media in list(tweet_media):
            media_src = media.get_attribute("src")
            if self.is_media_valid(media_src):
                medias.append(media_src)
        return medias

    def is_media_valid(self, url):
        """A media URL is valid unless it is a profile image, emoji or hashflag."""
        unwanted = ["/profile_images/", "/emoji/", "profile_image", "hashflags"]
        for unw in unwanted:
            if unw in url:
                return False
        return True

    # https://dev.to/samcodesign/phone-number-email-extractor-with-python-12g2
    def extract_phone(self, content):
        """Extract 10-digit (optionally 12-digit with area code) phone numbers
        from the tweet text; returns a list of digit strings."""
        phoneRegex = re.compile(r'''(
            (\d{2}|\(\d{2}\))? # area code
            (\s|-|\.)? # separator
            (\d{5}) # first 5 digits
            (\s|-|\.|) # separator
            (\d{5}) # last 5 digits
            )''', re.VERBOSE)
        matches = []
        for groups in phoneRegex.findall(content):
            phoneNum = ''.join([groups[1], groups[3], groups[5]])
            matches.append(str(phoneNum))
        return matches

    # twime = twitter time
    def twime_to_string(self, twitter_time):
        """Convert a Twitter age ('45s', '5m', '3h' or 'Apr 24') to a
        'YYYY-MM-DD HH:MM:SS' string (relative ages measured from now)."""
        twitter_time = str(twitter_time)
        tweet_time = None
        if len(twitter_time) <= 3:
            # Relative age: subtract the delta from the current time.
            last_char = twitter_time[-1]
            diff = None
            if last_char == 's':
                diff = timedelta(seconds=int(twitter_time[0:-1]))
            elif last_char == 'm':
                diff = timedelta(minutes=int(twitter_time[0:-1]))
            elif last_char == 'h':
                diff = timedelta(hours=int(twitter_time[0:-1]))
            tweet_time = datetime.now() - diff
        else:
            # Absolute age like 'Apr 24' (month first). Fix: the original
            # format "%d %b, %Y" expected the day first and could never
            # match, so every dated tweet raised and was silently dropped.
            # TODO: year is hard-coded to 2021.
            tweet_time = datetime.strptime(twitter_time + ", 2021", "%b %d, %Y")
        return str(tweet_time.strftime("%Y-%m-%d %H:%M:%S"))
class Main:
    """Drives a selenium webdriver over a rotating set of Twitter search
    links, detecting new top tweets and pushing them to Telegram and the
    covidsupport.xyz API."""

    # Class-level defaults. LATEST_TWEET deliberately stays on the class so
    # the hourly re-construction of Main (see the script loop) keeps its
    # de-duplication memory across instances.
    LINKS = []
    LATEST_TWEET = []
    CURRENT = -1
    TAGS = []
    CONFIG = ""
    API_URL = "https://covidsupport.xyz/api/tweets"
    WEBDRIVER_TYPE = "chrome"
    HEADLESS = False
    driver = None
    timeline = None
    parser = TweetParser()
    tweets = None

    def __init__(self, links, tags, config, webdriver_type, headless):
        self.LINKS = list(links)
        # Fix: the original appended len(links) fresh None slots to the shared
        # class list on EVERY construction, growing it without bound across
        # hourly restarts. Only top it up to the required length.
        while len(Main.LATEST_TWEET) < len(self.LINKS):
            Main.LATEST_TWEET.append(None)
        self.TAGS = list(tags)
        self.CONFIG = str(config)
        self.WEBDRIVER_TYPE = webdriver_type
        self.HEADLESS = headless

    # Set up options and create the webdriver ("chrome" or "firefox").
    def setup_webdriver(self):
        if self.WEBDRIVER_TYPE == "chrome":
            option = webdriver.ChromeOptions()
            # For ChromeDriver version 79.0.3945.16 or over: hide automation.
            option.add_argument('--disable-blink-features=AutomationControlled')
            option.headless = self.HEADLESS
            self.driver = webdriver.Chrome(options=option)
        elif self.WEBDRIVER_TYPE == "firefox":
            option = webdriver.FirefoxOptions()
            option.headless = self.HEADLESS
            self.driver = webdriver.Firefox(options=option)
        else:
            # Fail fast instead of leaving self.driver as None, which would
            # surface later as an opaque AttributeError.
            raise ValueError("Unsupported webdriver type: " + str(self.WEBDRIVER_TYPE))

    def rotate_link(self):
        """Advance CURRENT to the next link index, wrapping around."""
        self.CURRENT += 1
        if self.CURRENT == len(self.LINKS):
            self.CURRENT = 0

    def launch_webdriver(self):
        """Rotate to the next search link and open it in the browser."""
        self.rotate_link()
        self.driver.get(self.LINKS[self.CURRENT])

    def move_page(self):
        # Nudge the viewport down then back up to make Twitter render tweets.
        self.driver.execute_script("window.scrollTo(0, window.scrollY + 250)")
        for _ in range(0, 10):
            self.driver.execute_script("window.scrollTo(0, window.scrollY - 100)")

    def find_timeline(self):
        """Block until the search timeline has loaded, then store its list element."""
        timeline_parent = None
        while timeline_parent is None:
            try:
                timeline_parent = self.driver.find_element_by_xpath("//div[@aria-label='Timeline: Search timeline']")
            except Exception:
                # Element not present yet; retry without spinning the CPU.
                time.sleep(0.5)
        # Wait for the timeline to actually contain text.
        while len(str(timeline_parent.text)) == 0:
            self.move_page()
            time.sleep(1)
        # First and only child of timeline_parent is the actual timeline list element.
        self.timeline = (timeline_parent.find_elements_by_xpath("./child::*"))[0]

    def push_to_telegram(self, parsed_tweet):
        """Format the tweet (text, time, attachments, phones, tag) and send it
        via telegram_send, using CONFIG as the config name when set."""
        attachment_text = ""
        for attach in parsed_tweet.attachments:
            attachment_text += str(attach) + "\n"
        phone_text = ""
        for phn in parsed_tweet.phone_numbers:
            phone_text += str(phn) + "\n"
        text = parsed_tweet.content + "\n\n" + parsed_tweet.time
        if attachment_text != "":
            text += "\nAttachments: \n" + attachment_text
        if phone_text != "":
            text += "\nPhone Numbers: \n" + phone_text
        if self.TAGS[self.CURRENT] != "":
            text += "\n #" + self.TAGS[self.CURRENT]
        if self.CONFIG != "":
            telegram_send.send(conf=str(self.CONFIG).lower(), messages=[text])
        else:
            telegram_send.send(messages=[text])

    def upload_to_db(self, parsed_tweet):
        """POST the tweet to the covidsupport API.

        NOTE(review): attachments/contacts are appended to the URL manually
        and unencoded — the server apparently expects literal JSON-ish lists;
        letting requests encode them would change what the server receives.
        """
        data = {
            "content": parsed_tweet.content,
            "resource": self.TAGS[self.CURRENT],
            "location": self.CONFIG,
            "tweeted_time": parsed_tweet.time,
        }
        prepared = Request('POST', self.API_URL, params=data).prepare()
        manual_url = prepared.url
        manual_url += "&attachments=" + str(parsed_tweet.attachments).replace("'", "\"")
        manual_url += "&contacts=" + str(parsed_tweet.phone_numbers).replace("'", "\"")
        print("Pushing to server...")
        print(manual_url)
        requests.post(manual_url)

    def scrape(self):
        """Load the next link and collect the root elements of its tweets."""
        self.launch_webdriver()
        self.find_timeline()
        # Children of the timeline element are the root elements of tweets.
        self.tweets = self.timeline.find_elements_by_xpath("./child::*")

    def check_new(self):
        """Scrape once; if the top tweet is new and valid, upload and broadcast it."""
        self.scrape()
        top_tweet_parsed = self.parser.parse_tweet(self.tweets[0])
        if top_tweet_parsed is None:
            # Fix: parse_tweet returns None on failure; the original then
            # crashed with AttributeError on .content.
            return
        if self.LATEST_TWEET[self.CURRENT] != top_tweet_parsed.content and self.parser.is_tweet_valid(top_tweet_parsed.content):
            # New tweet: log it, persist it, broadcast it, remember it.
            print(top_tweet_parsed.content)
            print(top_tweet_parsed.time)
            print(top_tweet_parsed.attachments)
            print(top_tweet_parsed.phone_numbers)
            self.upload_to_db(top_tweet_parsed)
            self.push_to_telegram(top_tweet_parsed)
            self.LATEST_TWEET[self.CURRENT] = top_tweet_parsed.content
            print("------------------------")

    def start(self):
        """Run forever, checking for new tweets every 10 seconds."""
        self.setup_webdriver()
        while True:
            self.check_new()
            time.sleep(10)

    def stop(self):
        """Shut the browser down."""
        self.driver.quit()
def generate_link_group(location):
    """Build the five live-search Twitter URLs (beds, ICU, oxygen/ventilator,
    medicine, plasma) for the given location expression (e.g. "delhi OR noida").

    Every query shares the same 'verified (<location>) <resource> <exclusions>'
    shape; only the resource term differs, so the common parts are built once.
    """
    base = "https://twitter.com/search?q=verified+(" + str(location) + ")+"
    # Negative terms filter out requests ("need", "required", ...) and
    # explicitly-unverified posts; '&f=live' selects the Latest tab.
    exclusions = ("+-%22not+verified%22+-%22unverified%22+-%22needed%22+-%22need%22"
                  "+-%22needs%22+-%22required%22+-%22require%22+-%22requires%22"
                  "+-%22requirement%22+-%22requirements%22&f=live")
    resource_terms = [
        "%28bed+OR+beds%29",
        "%28icu%29",
        "%28oxygen+OR+ventilator+OR+ventilators%29",
        "%28fabiflu+OR+remdesivir+OR+favipiravir+OR+tocilizumab%29",
        "%28plasma%29",
    ]
    return [base + term + exclusions for term in resource_terms]
# Resource categories, 1:1 with the links produced by generate_link_group:
# Beds, ICU, Oxygen/Ventilator, Medicine, Plasma
tags = [
    "Beds",
    "ICU",
    "Oxygen",
    "Medicine",
    "Plasma"
]
if len(sys.argv) < 2:
    print("Please provide location as argument!")
    sys.exit(1)
# Argument format: "config" or "config=city 1,city 2"
raw_arg = str(sys.argv[1])
location = raw_arg.split("=")
# First part is the telegram_send config name.
config = location[0]
if len(location) > 1:
    # Multiple comma-separated cities become a Twitter OR-query.
    city = location[1].replace(",", " OR ")
else:
    city = config
print("Using config: " + config)
print("Location provded: " + city)
links = generate_link_group(city)
while True:
    # Recreate the scraper every hour to work around browser/session decay.
    scraper = Main(links, tags, config, "firefox", True)
    scraper.setup_webdriver()
    total_time = 3600  # Reset Timer
    while total_time > 0:
        time.sleep(5)
        total_time -= 5
        try:
            scraper.check_new()
        except Exception:
            # Fix: the original concatenated the list-valued `location` to a
            # string here, so the error handler itself raised TypeError and
            # killed the supervisor loop. Report the raw CLI argument instead.
            # (Narrowed from a bare except so Ctrl-C still stops the script.)
            telegram_send.send(conf="error", messages=["Script errored for " + raw_arg])
    scraper.stop()