/
find_other_news_sources.py
67 lines (57 loc) · 2.02 KB
/
find_other_news_sources.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# __author__ = 'sree'
import urllib2
from lxml import html
import requests
from goose import Goose
g = Goose()
def get_page_tree(url=None):
page = requests.get(url=url, verify=False)
return html.fromstring(page.content)
def get_title(url=None):
try:
article = g.extract(url=link)
return article.title
except Exception as e:
try:
tree = get_page_tree(url=url)
return tree.xpath('//title//text()')[0].strip().split(' -')[0]
except Exception as b:
return None
def find_other_news_sources(url=None, title=None):
# Google forwards the url using <google_domain>/url?q=<actual_link>. This might change over time
forwarding_identifier = '/url?q='
if not title:
try:
title = get_title(url=url)
except Exception as e:
return None
try:
# parent_url_exclude = '-site:' + url
#google_news_search_url = 'http://www.google.com/search?q=' + urllib2.quote(title) + parent_url_exclude + '&tbm=nws'
google_news_search_url = 'http://www.google.co.in/search?q=' + urllib2.quote(title) + '&tbm=nws'
google_news_search_tree = get_page_tree(url=google_news_search_url)
other_news_sources_links = [a_link.replace(forwarding_identifier, '').split('&')[0] for a_link in
google_news_search_tree.xpath('//a//@href') if forwarding_identifier in a_link]
except Exception as e:
return None
new_links = []
count = 0
for a_link in other_news_sources_links:
try:
article = g.extract(a_link)
title = article.title
templist = [title, a_link]
new_links.append(templist)
count = count + 1
# print count
except Exception as e:
print "1Unknown ERROR\n"
print type(e)
print e.args
print e
# print submission.id
print "\n"
continue
if count is 4:
break
return new_links