Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python3 #8

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
67 changes: 35 additions & 32 deletions confluence_dumper.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# confluence-dumper, a Python project to export spaces, pages and attachments
Expand All @@ -14,7 +15,6 @@
Confluence-dumper is a Python project to export spaces, pages and attachments
"""

from __future__ import print_function
import sys
import codecs

Expand Down Expand Up @@ -124,7 +124,7 @@ def handle_html_references(html_content, page_duplicate_file_names, page_file_ma
except XMLSyntaxError:
print('%sWARNING: Could not parse HTML content of last page. Original content will be downloaded as it is.'
% ('\t'*(depth+1)))
return html_content
return html_content.decode("utf-8")

# Fix links to other Confluence pages
# Example: /display/TES/pictest1
Expand Down Expand Up @@ -179,7 +179,7 @@ def handle_html_references(html_content, page_duplicate_file_names, page_file_ma
if not 'alt' in img_element.attrib.keys():
img_element.attrib['alt'] = relative_file_path

return html.tostring(html_tree)
return html.tostring(html_tree).decode("utf-8")


def download_file(clean_url, download_folder, downloaded_file_name, depth=0, error_output=True):
Expand All @@ -202,7 +202,7 @@ def download_file(clean_url, download_folder, downloaded_file_name, depth=0, err
utils.http_download_binary_file(absolute_download_url, downloaded_file_path,
auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)

except utils.ConfluenceException as e:
if error_output:
Expand Down Expand Up @@ -233,22 +233,23 @@ def download_attachment(download_url, download_folder, attachment_id, attachment
downloaded_file_path = download_file(download_url, download_folder, downloaded_file_name, depth=depth)

# Download the thumbnail as well if the attachment is an image
clean_thumbnail_url = clean_url.replace('/attachments/', '/thumbnails/', 1)
downloaded_thumbnail_file_name = derive_downloaded_file_name(clean_thumbnail_url)
downloaded_thumbnail_file_name = provide_unique_file_name(attachment_duplicate_file_names, attachment_file_matching,
downloaded_thumbnail_file_name)
if utils.is_file_format(downloaded_thumbnail_file_name, settings.CONFLUENCE_THUMBNAIL_FORMATS):
# TODO: Confluence creates thumbnails always as PNGs but does not change the file extension to .png.
download_file(clean_thumbnail_url, download_folder, downloaded_thumbnail_file_name, depth=depth,
error_output=False)

# Download the image preview as well if Confluence generated one for the attachment
if utils.is_file_format(downloaded_file_name, settings.CONFLUENCE_GENERATED_PREVIEW_FORMATS):
clean_preview_url = '/rest/documentConversion/latest/conversion/thumbnail/%s/1' % attachment_id
downloaded_preview_file_name = derive_downloaded_file_name(clean_preview_url)
downloaded_preview_file_name = provide_unique_file_name(attachment_duplicate_file_names,
attachment_file_matching, downloaded_preview_file_name)
download_file(clean_preview_url, download_folder, downloaded_preview_file_name, depth=depth, error_output=False)
if settings.GRAB_THUMBNAILS:
clean_thumbnail_url = clean_url.replace('/attachments/', '/thumbnails/', 1)
downloaded_thumbnail_file_name = derive_downloaded_file_name(clean_thumbnail_url)
downloaded_thumbnail_file_name = provide_unique_file_name(attachment_duplicate_file_names, attachment_file_matching,
downloaded_thumbnail_file_name)
if utils.is_file_format(downloaded_thumbnail_file_name, settings.CONFLUENCE_THUMBNAIL_FORMATS):
# TODO: Confluence creates thumbnails always as PNGs but does not change the file extension to .png.
download_file(clean_thumbnail_url, download_folder, downloaded_thumbnail_file_name, depth=depth,
error_output=False)

# Download the image preview as well if Confluence generated one for the attachment
if utils.is_file_format(downloaded_file_name, settings.CONFLUENCE_GENERATED_PREVIEW_FORMATS):
clean_preview_url = '/rest/documentConversion/latest/conversion/thumbnail/%s/1' % attachment_id
downloaded_preview_file_name = derive_downloaded_file_name(clean_preview_url)
downloaded_preview_file_name = provide_unique_file_name(attachment_duplicate_file_names,
attachment_file_matching, downloaded_preview_file_name)
download_file(clean_preview_url, download_folder, downloaded_preview_file_name, depth=depth, error_output=False)

return {'file_name': downloaded_file_name, 'file_path': downloaded_file_path}

Expand Down Expand Up @@ -301,7 +302,7 @@ def fetch_page_recursively(page_id, folder_path, download_folder, html_template,
try:
response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
page_content = response['body']['view']['value']

page_title = response['title']
Expand All @@ -321,15 +322,17 @@ def fetch_page_recursively(page_id, folder_path, download_folder, html_template,
while page_url:
response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
counter += len(response['results'])
for attachment in response['results']:
download_url = attachment['_links']['download']
attachment_id = attachment['id'][3:]
attachment_info = download_attachment(download_url, download_folder, attachment_id,
attachment_duplicate_file_names, attachment_file_matching,
depth=depth+1)
path_collection['child_attachments'].append(attachment_info)

if settings.GRAB_ATTACHMENTS:
for attachment in response['results']:
download_url = attachment['_links']['download']
attachment_id = attachment['id'][3:]
attachment_info = download_attachment(download_url, download_folder, attachment_id,
attachment_duplicate_file_names, attachment_file_matching,
depth=depth+1)
path_collection['child_attachments'].append(attachment_info)

if 'next' in response['_links'].keys():
page_url = response['_links']['next']
Expand Down Expand Up @@ -359,7 +362,7 @@ def fetch_page_recursively(page_id, folder_path, download_folder, html_template,
while page_url:
response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
counter += len(response['results'])
for child_page in response['results']:
paths = fetch_page_recursively(child_page['id'], folder_path, download_folder, html_template,
Expand Down Expand Up @@ -445,7 +448,7 @@ def main():
while page_url:
response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
for space in response['results']:
spaces_to_export.append(space['key'])

Expand Down Expand Up @@ -476,7 +479,7 @@ def main():
response = utils.http_get(space_url, auth=settings.HTTP_AUTHENTICATION,
headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
space_name = response['name']

print('SPACE (%d/%d): %s (%s)' % (space_counter, len(spaces_to_export), space_name, space))
Expand Down
9 changes: 9 additions & 0 deletions settings.sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
# Example for HTTP Basic Authentication: ('johndoe', 'sup3rs3cur3pw')
HTTP_AUTHENTICATION = ('johndoe', 'sup3rs3cur3pw')

# Alternatively, authenticate using a cookie
HTTP_COOKIES = None

# Verify x.509 certificate of confluence http server
VERIFY_PEER_CERTIFICATE = True

Expand All @@ -44,3 +47,9 @@

# The following message is displayed for page forwardings
HTML_FORWARD_MESSAGE = '<a href="%s">If you are not automatically forwarded to %s, please click here!</a>'

# Download attachments
GRAB_ATTACHMENTS = True

# Download thumbnails
GRAB_THUMBNAILS = True
20 changes: 12 additions & 8 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,20 @@
import requests
import shutil
import re
import urllib3
import urllib

# SUPPRESS WARNINGS
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class ConfluenceException(Exception):
    """Exception raised for Confluence export issues.

    Thin wrapper around ``Exception``; the message passed to the
    constructor is forwarded unchanged, so ``str(exc)`` yields it.
    """
    def __init__(self, message):
        # Python-3 zero-argument super() — consistent with the rest of
        # this Python 3 migration (py3 shebang, dict.items(), urllib.parse).
        super().__init__(message)


def http_get(request_url, auth=None, headers=None, verify_peer_certificate=True, proxies=None):
def http_get(request_url, auth=None, headers=None, verify_peer_certificate=True, proxies=None, cookies=None):
""" Requests a HTTP url and returns a requested JSON response.

:param request_url: HTTP URL to request.
Expand All @@ -33,7 +37,7 @@ def http_get(request_url, auth=None, headers=None, verify_peer_certificate=True,
:returns: JSON response.
:raises: ConfluenceException in the case of the server does not answer HTTP code 200.
"""
response = requests.get(request_url, auth=auth, headers=headers, verify=verify_peer_certificate, proxies=proxies)
response = requests.get(request_url, auth=auth, headers=headers, verify=verify_peer_certificate, proxies=proxies, cookies=cookies)
if 200 == response.status_code:
return response.json()
else:
Expand All @@ -42,7 +46,7 @@ def http_get(request_url, auth=None, headers=None, verify_peer_certificate=True,


def http_download_binary_file(request_url, file_path, auth=None, headers=None, verify_peer_certificate=True,
proxies=None):
proxies=None, cookies=None):
""" Requests a HTTP url to save a file on the local filesystem.

:param request_url: Requested HTTP URL.
Expand All @@ -54,7 +58,7 @@ def http_download_binary_file(request_url, file_path, auth=None, headers=None, v
:raises: ConfluenceException in the case of the server does not answer with HTTP code 200.
"""
response = requests.get(request_url, stream=True, auth=auth, headers=headers, verify=verify_peer_certificate,
proxies=proxies)
proxies=proxies, cookies=cookies)
if 200 == response.status_code:
with open(file_path, 'wb') as downloaded_file:
response.raw.decode_content = True
Expand All @@ -75,7 +79,7 @@ def write_2_file(path, content):
"""
try:
with open(path, 'w') as the_file:
the_file.write(content.encode('utf8'))
the_file.write(content)
except:
print("File could not be written")

Expand All @@ -97,7 +101,7 @@ def write_html_2_file(path, title, content, html_template, additional_headers=No
# Note: One backslash has to be escaped with two avoid that backslashes are interpreted as escape chars
replacements = {'title': title, 'content': content, 'additional_headers': additional_html_headers}

for placeholder, replacement in replacements.iteritems():
for placeholder, replacement in replacements.items():
regex_placeholder = r'{%\s*' + placeholder + r'\s*%\}'
try:
html_content = re.sub(regex_placeholder, replacement.replace('\\', '\\\\'), html_content,
Expand All @@ -124,7 +128,7 @@ def decode_url(encoded_url):
:param encoded_url: Encoded URL.
:returns: Decoded URL.
"""
return urllib.unquote(encoded_url.encode('utf8')).decode('utf8')
return urllib.parse.unquote(encoded_url.encode('utf8'))


def encode_url(decoded_url):
Expand All @@ -133,7 +137,7 @@ def encode_url(decoded_url):
:param decoded_url: Decoded URL.
:returns: Encoded URL.
"""
return urllib.quote(decoded_url.encode('utf8')).encode('utf8')
return urllib.parse.quote(decoded_url.encode('utf8'))


def is_file_format(file_name, file_extensions):
Expand Down