/
convert-faq-to-pdf
executable file
·87 lines (65 loc) · 2.22 KB
/
convert-faq-to-pdf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import pdfkit
import cssutils
import sys
import os
"""
Web-scraping script that downloads the HTML content of an AWS FAQ page and converts it to a PDF for offline access.
Pass the AWS service slug, i.e. the [SLUG] part of <https://aws.amazon.com/[SLUG]/faqs/>, to download the corresponding FAQ as a PDF.
Usage:
>> pipenv shell
>> pipenv install
>> chmod +x convert-faq-to-pdf
>> ./convert-faq-to-pdf [SLUG]
For example:
>> ./convert-faq-to-pdf dynamodb
or
>> ./convert-faq-to-pdf kinesis/data-streams
*For educational purposes only.*
"""
# URL pattern of an AWS FAQ page; {slug} is the service path segment.
FAQ_URL_TEMPLATE = "https://aws.amazon.com/{slug}/faqs/"

# Directory (relative to the current working directory) the PDF is written to.
OUTPUT_DIR = "pdf"

# Upper bound for the HTTP request so the script cannot hang forever.
REQUEST_TIMEOUT_SECONDS = 30

# CSS classes of <div> elements removed from the page before rendering.
# NOTE(review): judging by their names these are sidebars and expand/collapse
# controls that are useless in a static PDF - confirm against the live page.
UNWANTED_DIV_CLASSES = (
    "lb-none-pad",
    "lb-sticky-sidebar",
    "lb-expand-trigger",
    "lb-collapse-trigger",
    "vbb",
)

# Characters that are not allowed in file names on common file systems.
FORBIDDEN_FILE_NAME_CHARS = '<>:"/\\|?*'

# Options passed through to wkhtmltopdf by pdfkit.
# NOTE(review): the 'custom-header' and 'cookie' entries look like placeholder
# values - confirm whether they are actually needed.
PDF_OPTIONS = {
    'page-size': 'Letter',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
    'encoding': "UTF-8",
    'custom-header': [
        ('Accept-Encoding', 'gzip')
    ],
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    'no-outline': None
}


def _fetch_html(url):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-success HTTP status (e.g. 404 for an unknown slug).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Without this check an error page would be parsed as if it were the FAQ.
    response.raise_for_status()
    return response.text


def _extract_faq(html_content):
    """Return ``(title, html)`` for the FAQ contained in *html_content*.

    ``title`` is the stripped text of the page's first <h1>; ``html`` is the
    page's inline style sheets followed by the <h1> and <main> elements.

    Raises:
        ValueError: if the expected page structure is not present.
    """
    soup = BeautifulSoup(html_content, "lxml")
    page = soup.find(id="aws-page-content")
    if page is None:
        raise ValueError('no element with id "aws-page-content" found')

    # Re-run the search for every class so elements already removed together
    # with an enclosing <div> are not visited again.
    for css_class in UNWANTED_DIV_CLASSES:
        for div in page.find_all("div", {"class": css_class}):
            div.decompose()

    if page.h1 is None or page.main is None:
        raise ValueError("page content has no <h1> or <main> element")

    # pdfkit's ``css`` argument expects paths to CSS files, so the inline
    # style sheets are embedded directly in the HTML instead. Tags without
    # text content (probably external sheets) are skipped.
    styles = "".join(
        f'<style type="text/css">{tag.string}</style>'
        for tag in soup.find_all('style', type='text/css')
        if tag.string
    )

    title = page.h1.text.strip()  # remove leading and trailing whitespace
    return title, styles + str(page.h1) + str(page.main)


def _safe_file_name(text):
    """Return *text* with characters that are invalid in file names replaced by ``_``."""
    cleaned = "".join(
        "_" if ch in FORBIDDEN_FILE_NAME_CHARS or ord(ch) < 32 else ch
        for ch in text
    )
    return cleaned.strip()


def main(argv=None):
    """Convert the AWS FAQ page for the given slug to ``pdf/<page title>.pdf``.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument is the service slug.

    Returns:
        Process exit code: 0 on success, 1 if the page could not be
        downloaded or parsed, 2 if no slug was supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    # Tolerate surrounding whitespace and slashes, e.g. "/dynamodb/".
    service_name = args[0].strip().strip("/") if args else ""
    if not service_name:
        print(
            "Usage: convert-faq-to-pdf [SLUG]  "
            "(e.g. dynamodb or kinesis/data-streams)",
            file=sys.stderr,
        )
        return 2

    url = FAQ_URL_TEMPLATE.format(slug=service_name)
    try:
        html_content = _fetch_html(url)
    except requests.RequestException as exc:
        print(f"Could not download {url}: {exc}", file=sys.stderr)
        return 1

    try:
        title, final_html = _extract_faq(html_content)
    except ValueError as exc:
        print(f"Could not extract the FAQ from {url}: {exc}", file=sys.stderr)
        return 1

    # Fall back to the slug when the heading is empty or consists only of
    # characters that cannot be used in a file name.
    file_name = _safe_file_name(title) or _safe_file_name(service_name)

    # wkhtmltopdf does not create missing output directories.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    pdfkit.from_string(
        final_html,
        os.path.join(OUTPUT_DIR, f'{file_name}.pdf'),
        options=PDF_OPTIONS,
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())