-
Notifications
You must be signed in to change notification settings - Fork 0
/
webcrawler.py
42 lines (28 loc) · 1021 Bytes
/
webcrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from urllib.parse import urljoin, urlsplit

import lxml
import requests as r
from bs4 import BeautifulSoup as bs

from keywords import Keyword
from links import Links
class Webcrawler():
    """Recursively follow the anchors of a page, counting accepted links.

    A link is "accepted" when ``Links.process_links`` returns a truthy value
    for it; every accepted link increments ``count`` and is crawled in turn.
    """

    # Seconds to wait on a request; without a timeout ``requests`` can block
    # indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 10

    # Once ``count`` exceeds this value the current page stops being scanned.
    MAX_LINKS = 100

    def __init__(self):
        # Number of links accepted so far across all recursive crawl() calls.
        self.count = 0

    @staticmethod
    def _resolve_link(page_url, href):
        """Return *href* as an absolute http(s) URL, or ``None`` to skip it.

        Args:
            page_url: URL of the page the anchor was found on.
            href: Raw ``href`` attribute value (may be ``None`` or empty).

        Returns:
            The absolute URL, or ``None`` when *href* is missing, malformed,
            or uses a scheme that cannot be fetched over HTTP (``mailto:``,
            ``javascript:``, ...).
        """
        if not href:
            return None
        try:
            # urljoin leaves absolute URLs (http *and* https) untouched and
            # resolves relative ones against the current page.
            absolute = urljoin(page_url, href)
            scheme = urlsplit(absolute).scheme
        except ValueError:
            # Raised by urllib for malformed URLs such as a broken IPv6 host.
            return None
        if scheme not in ("http", "https"):
            return None
        return absolute

    def crawl(self, homepage):
        """Fetch *homepage* and recursively crawl the links it contains.

        Pages that do not answer with HTTP 200 are ignored. Scanning of the
        current page stops as soon as more than ``MAX_LINKS`` links have been
        accepted, or when an accepted link fails the same-site heuristic.

        Args:
            homepage: Absolute URL of the page to crawl.

        Raises:
            requests.RequestException: Propagated from ``requests.get``
                (connection failure, timeout, invalid URL).
        """
        url = homepage
        rep = r.get(url, timeout=self.REQUEST_TIMEOUT)
        if rep.status_code != 200:
            return

        soup = bs(rep.text, 'html.parser')
        for anchor in soup.find_all('a'):
            link = self._resolve_link(url, anchor.get('href'))
            if link is None:
                continue
            # NOTE(review): presumably process_links records the link and
            # rejects duplicates, which is what would bound the recursion --
            # confirm against the Links implementation.
            if not Links.process_links(link):
                continue

            self.count += 1
            if self.count > self.MAX_LINKS:
                break
            # NOTE(review): same-site heuristic kept as written. The slice
            # drops the first 7 characters ("http://" is 7 long) and the last
            # 9 of the current URL; the intent of the -9 is unclear and it
            # yields an empty string (which matches everything) for short
            # URLs. Comparing urlsplit(...).netloc is likely what was meant,
            # and `break` (rather than `continue`) abandons the rest of the
            # page on the first off-site link -- confirm before changing.
            if url[7:-9] not in link:
                break
            self.crawl(link)