#!/usr/bin/env python3
# convert-faq-to-pdf

import requests
from bs4 import BeautifulSoup
import pdfkit
import cssutils
import sys
import os

"""
Python web-scraping program that fetches the HTML content of an AWS FAQ page and converts it to PDF for offline access.

Pass the AWS service name, i.e. the slug in <https://aws.amazon.com/[SLUG]/faqs/>, to download the corresponding FAQ as a PDF.

Usage:
>> pipenv shell
>> pipenv install
>> chmod +x convert-faq-to-pdf
>> ./convert-faq-to-pdf [SLUG]

For example:
>> ./convert-faq-to-pdf dynamodb
or
>> ./convert-faq-to-pdf kinesis/data-streams

*For educational purposes only.*

"""

# Directory (relative to the current working directory) that receives the PDF.
OUTPUT_DIR = "pdf"

# Seconds to wait for the AWS web server before giving up on the request.
REQUEST_TIMEOUT_SECONDS = 30

# <div> elements carrying any of these classes are dropped before rendering.
# NOTE(review): presumably navigation / sidebar / expand-collapse page chrome
# that is not part of the FAQ text itself -- confirm against the live markup.
REMOVED_DIV_CLASSES = (
    "lb-none-pad",
    "lb-sticky-sidebar",
    "lb-expand-trigger",
    "lb-collapse-trigger",
    "vbb",
)

# Characters replaced with "_" when the page heading is used as a file name.
FORBIDDEN_FILE_NAME_CHARS = '\\/:*?"<>|'

# Options forwarded to pdfkit.
PDF_OPTIONS = {
    'page-size': 'Letter',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
    'encoding': "UTF-8",
    'custom-header' : [
        ('Accept-Encoding', 'gzip')
    ],
    # NOTE(review): these look like placeholder cookie values -- confirm
    # whether they are needed at all.
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    'no-outline': None
}


def build_faq_url(service_name):
    """Return the FAQ page URL for an AWS service slug.

    Surrounding whitespace and leading/trailing slashes are ignored, so
    "dynamodb", "/dynamodb/" and " dynamodb " all give the same URL.

    Raises:
        ValueError: if nothing is left of the slug after trimming.
    """
    slug = service_name.strip().strip("/")
    if not slug:
        raise ValueError("service slug must not be empty")
    return f"https://aws.amazon.com/{slug}/faqs/"


def safe_file_name(title, fallback="faq"):
    """Turn a page heading into a name that is safe to use as a file name.

    Runs of whitespace (including newlines) collapse to a single space,
    path separators and other characters in FORBIDDEN_FILE_NAME_CHARS
    become "_", and leading/trailing spaces and dots are removed.
    Returns *fallback* when nothing usable remains.
    """
    collapsed = " ".join(title.split())
    cleaned = "".join(
        "_" if ch in FORBIDDEN_FILE_NAME_CHARS else ch for ch in collapsed
    )
    cleaned = cleaned.strip(" .")
    return cleaned or fallback


def build_html(heading_html, body_html, css_chunks):
    """Concatenate heading and body markup, preceded by the inline CSS.

    The CSS chunks are joined into one <style> element placed in front of
    the markup. With no chunks the result is just heading + body.
    """
    style_block = ""
    if css_chunks:
        style_block = "<style>\n" + "\n".join(css_chunks) + "\n</style>\n"
    return style_block + heading_html + body_html


def main(argv=None):
    """Download one AWS FAQ page and write it to OUTPUT_DIR as a PDF.

    Args:
        argv: command-line arguments without the program name; defaults to
            sys.argv[1:]. Exactly one argument (the service slug) is expected.

    Returns:
        Process exit status: 0 on success, 1 when the page could not be
        fetched or lacks the expected content, 2 on a usage error.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    usage = "usage: convert-faq-to-pdf SLUG   (e.g. dynamodb or kinesis/data-streams)"
    if len(args) != 1:
        print(usage, file=sys.stderr)
        return 2
    try:
        url = build_faq_url(args[0])
    except ValueError:
        print(usage, file=sys.stderr)
        return 2

    # Fetch the raw HTML; fail on HTTP errors instead of parsing an error page.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"could not download {url}: {err}", file=sys.stderr)
        return 1

    soup = BeautifulSoup(response.text, "lxml")

    aws_page_content = soup.find(id="aws-page-content")
    if aws_page_content is None or aws_page_content.main is None:
        print(f"no FAQ content found at {url}", file=sys.stderr)
        return 1

    # Remove the unwanted <div> elements before rendering.
    for css_class in REMOVED_DIV_CLASSES:
        for div in aws_page_content.find_all("div", {"class": css_class}):
            div.decompose()

    # Collect the text of the inline style sheets. pdfkit's `css` argument
    # expects paths to CSS files, so the CSS is embedded in the HTML instead.
    css_chunks = [
        styletag.string
        for styletag in soup.find_all('style', type='text/css')
        if styletag.string  # an empty tag is probably an external sheet
    ]

    heading = aws_page_content.h1
    if heading is None:
        # No heading on the page: name the file after the slug instead.
        heading_html = ""
        title = args[0]
    else:
        heading_html = str(heading)
        title = heading.get_text()

    final_html = build_html(heading_html, str(aws_page_content.main), css_chunks)

    # wkhtmltopdf will not create the target directory itself.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR, f'{safe_file_name(title)}.pdf')
    pdfkit.from_string(final_html, output_path, options=PDF_OPTIONS)
    return 0


if __name__ == "__main__":
    sys.exit(main())