/
pdf.py
45 lines (37 loc) · 1.08 KB
/
pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import argparse
import os
import re
import subprocess
import sys
from urllib.request import urlopen

import requests
# Absolute PDF links: a double-quoted http(s) URL ending in a literal ".pdf".
ABSOLUTE_PDF_RE = re.compile(r'"(https?://\S*?\.pdf)"')
# Site-relative PDF links: text after a double quote that contains no '.',
# '"' or '=' and is followed by a literal ".pdf". Excluding '.' keeps
# dotted-host absolute URLs (handled by the pattern above) out of this list.
LOCAL_PDF_RE = re.compile(r'"([^."=]*\.pdf)')


def parse_args(argv=None):
    """Parse the command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        A namespace with ``website_url`` (page to scan), ``file_path``
        (download directory handed to ``wget -P``) and ``relative_url``
        (optional base URL for relative links; the string ``'none'`` when
        omitted).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('website_url')
    parser.add_argument('file_path')
    parser.add_argument('relative_url', nargs='?', default='none')
    return parser.parse_args(argv)


def find_pdf_links(html):
    """Extract PDF links from an HTML string.

    Args:
        html: The page source as text.

    Returns:
        A ``(absolute_links, local_links)`` tuple of lists, each in document
        order. Absolute links start with ``http://`` or ``https://``; local
        links are relative paths that still need a base URL.
    """
    return ABSOLUTE_PDF_RE.findall(html), LOCAL_PDF_RE.findall(html)


def build_wget_command(url, dest_dir):
    """Return the wget argument list that saves *url* into *dest_dir*.

    An argument list (rather than a shell string) is used so that characters
    such as ``;``, ``&`` or spaces in a scraped link or in the destination
    path are passed to wget literally and never interpreted by a shell.
    """
    return ['wget', url, '-P', dest_dir]


def download(url, dest_dir):
    """Run wget for *url* and return its exit status.

    A non-zero status is returned, not raised, so one failed download does
    not stop the remaining ones.
    """
    return subprocess.run(build_wget_command(url, dest_dir), check=False).returncode


def main(argv=None):
    """Download every PDF linked from the page given on the command line.

    Absolute links are fetched first, then relative links, which are
    resolved against ``relative_url`` when it was supplied and against
    ``website_url`` otherwise.

    Raises:
        SystemExit: If the ``wget`` executable cannot be found.
    """
    args = parse_args(argv)
    url_link = args.website_url
    sys_path = args.file_path

    # Close the HTTP response deterministically once the body has been read.
    with urlopen(url_link) as website:
        # Links are ASCII, so undecodable bytes elsewhere in the page are
        # replaced instead of aborting the whole run.
        html = website.read().decode('utf-8', errors='replace')

    absolute_links, local_links = find_pdf_links(html)

    # 'none' is the argparse default meaning "no separate base URL given".
    base_url = url_link if args.relative_url == 'none' else args.relative_url
    targets = list(absolute_links)
    targets.extend('%s/%s' % (base_url, link) for link in local_links)

    try:
        for target in targets:
            download(target, sys_path)
    except FileNotFoundError as err:
        raise SystemExit('wget executable not found: %s' % err) from err


if __name__ == '__main__':
    main()