Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python3 #8

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
67 changes: 35 additions & 32 deletions confluence_dumper.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# confluence-dumper, a Python project to export spaces, pages and attachments
Expand All @@ -14,7 +15,6 @@
Confluence-dumper is a Python project to export spaces, pages and attachments
"""

from __future__ import print_function
import sys
import codecs

Expand Down Expand Up @@ -124,7 +124,7 @@ def handle_html_references(html_content, page_duplicate_file_names, page_file_ma
except XMLSyntaxError:
print('%sWARNING: Could not parse HTML content of last page. Original content will be downloaded as it is.'
% ('\t'*(depth+1)))
return html_content
return html_content.decode("utf-8")

# Fix links to other Confluence pages
# Example: /display/TES/pictest1
Expand Down Expand Up @@ -179,7 +179,7 @@ def handle_html_references(html_content, page_duplicate_file_names, page_file_ma
if not 'alt' in img_element.attrib.keys():
img_element.attrib['alt'] = relative_file_path

return html.tostring(html_tree)
return html.tostring(html_tree).decode("utf-8")


def download_file(clean_url, download_folder, downloaded_file_name, depth=0, error_output=True):
Expand All @@ -202,7 +202,7 @@ def download_file(clean_url, download_folder, downloaded_file_name, depth=0, err
utils.http_download_binary_file(absolute_download_url, downloaded_file_path,
auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)

except utils.ConfluenceException as e:
if error_output:
Expand Down Expand Up @@ -233,22 +233,23 @@ def download_attachment(download_url, download_folder, attachment_id, attachment
downloaded_file_path = download_file(download_url, download_folder, downloaded_file_name, depth=depth)

# Download the thumbnail as well if the attachment is an image
clean_thumbnail_url = clean_url.replace('/attachments/', '/thumbnails/', 1)
downloaded_thumbnail_file_name = derive_downloaded_file_name(clean_thumbnail_url)
downloaded_thumbnail_file_name = provide_unique_file_name(attachment_duplicate_file_names, attachment_file_matching,
downloaded_thumbnail_file_name)
if utils.is_file_format(downloaded_thumbnail_file_name, settings.CONFLUENCE_THUMBNAIL_FORMATS):
# TODO: Confluence creates thumbnails always as PNGs but does not change the file extension to .png.
download_file(clean_thumbnail_url, download_folder, downloaded_thumbnail_file_name, depth=depth,
error_output=False)

# Download the image preview as well if Confluence generated one for the attachment
if utils.is_file_format(downloaded_file_name, settings.CONFLUENCE_GENERATED_PREVIEW_FORMATS):
clean_preview_url = '/rest/documentConversion/latest/conversion/thumbnail/%s/1' % attachment_id
downloaded_preview_file_name = derive_downloaded_file_name(clean_preview_url)
downloaded_preview_file_name = provide_unique_file_name(attachment_duplicate_file_names,
attachment_file_matching, downloaded_preview_file_name)
download_file(clean_preview_url, download_folder, downloaded_preview_file_name, depth=depth, error_output=False)
if settings.GRAB_THUMBNAILS:
clean_thumbnail_url = clean_url.replace('/attachments/', '/thumbnails/', 1)
downloaded_thumbnail_file_name = derive_downloaded_file_name(clean_thumbnail_url)
downloaded_thumbnail_file_name = provide_unique_file_name(attachment_duplicate_file_names, attachment_file_matching,
downloaded_thumbnail_file_name)
if utils.is_file_format(downloaded_thumbnail_file_name, settings.CONFLUENCE_THUMBNAIL_FORMATS):
# TODO: Confluence creates thumbnails always as PNGs but does not change the file extension to .png.
download_file(clean_thumbnail_url, download_folder, downloaded_thumbnail_file_name, depth=depth,
error_output=False)

# Download the image preview as well if Confluence generated one for the attachment
if utils.is_file_format(downloaded_file_name, settings.CONFLUENCE_GENERATED_PREVIEW_FORMATS):
clean_preview_url = '/rest/documentConversion/latest/conversion/thumbnail/%s/1' % attachment_id
downloaded_preview_file_name = derive_downloaded_file_name(clean_preview_url)
downloaded_preview_file_name = provide_unique_file_name(attachment_duplicate_file_names,
attachment_file_matching, downloaded_preview_file_name)
download_file(clean_preview_url, download_folder, downloaded_preview_file_name, depth=depth, error_output=False)

return {'file_name': downloaded_file_name, 'file_path': downloaded_file_path}

Expand Down Expand Up @@ -301,7 +302,7 @@ def fetch_page_recursively(page_id, folder_path, download_folder, html_template,
try:
response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
page_content = response['body']['view']['value']

page_title = response['title']
Expand All @@ -321,15 +322,17 @@ def fetch_page_recursively(page_id, folder_path, download_folder, html_template,
while page_url:
response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
counter += len(response['results'])
for attachment in response['results']:
download_url = attachment['_links']['download']
attachment_id = attachment['id'][3:]
attachment_info = download_attachment(download_url, download_folder, attachment_id,
attachment_duplicate_file_names, attachment_file_matching,
depth=depth+1)
path_collection['child_attachments'].append(attachment_info)

if settings.GRAB_ATTACHMENTS:
for attachment in response['results']:
download_url = attachment['_links']['download']
attachment_id = attachment['id'][3:]
attachment_info = download_attachment(download_url, download_folder, attachment_id,
attachment_duplicate_file_names, attachment_file_matching,
depth=depth+1)
path_collection['child_attachments'].append(attachment_info)

if 'next' in response['_links'].keys():
page_url = response['_links']['next']
Expand Down Expand Up @@ -359,7 +362,7 @@ def fetch_page_recursively(page_id, folder_path, download_folder, html_template,
while page_url:
response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
counter += len(response['results'])
for child_page in response['results']:
paths = fetch_page_recursively(child_page['id'], folder_path, download_folder, html_template,
Expand Down Expand Up @@ -445,7 +448,7 @@ def main():
while page_url:
response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
for space in response['results']:
spaces_to_export.append(space['key'])

Expand Down Expand Up @@ -476,7 +479,7 @@ def main():
response = utils.http_get(space_url, auth=settings.HTTP_AUTHENTICATION,
headers=settings.HTTP_CUSTOM_HEADERS,
verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
proxies=settings.HTTP_PROXIES)
proxies=settings.HTTP_PROXIES, cookies=settings.HTTP_COOKIES)
space_name = response['name']

print('SPACE (%d/%d): %s (%s)' % (space_counter, len(spaces_to_export), space_name, space))
Expand Down
9 changes: 9 additions & 0 deletions settings.sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
# Example for HTTP Basic Authentication: ('johndoe', 'sup3rs3cur3pw')
HTTP_AUTHENTICATION = ('johndoe', 'sup3rs3cur3pw')

# Alternatively, authenticate using a cookie
HTTP_COOKIES = None

# Verify x.509 certificate of confluence http server
VERIFY_PEER_CERTIFICATE = True

Expand All @@ -44,3 +47,9 @@

# The following message is displayed for page forwardings
HTML_FORWARD_MESSAGE = '<a href="%s">If you are not automatically forwarded to %s, please click here!</a>'

# Download attachments
GRAB_ATTACHMENTS = True

# Download thumbnails
GRAB_THUMBNAILS = True
20 changes: 12 additions & 8 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,20 @@
import requests
import shutil
import re
import urllib3
import urllib

# SUPPRESS WARNINGS
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class ConfluenceException(Exception):
    """Exception raised for Confluence export issues.

    Thin wrapper around ``Exception``; the message passed to the
    constructor is forwarded unchanged, so ``str(exc)`` yields it.
    """
    def __init__(self, message):
        # Python-3 zero-argument super() — consistent with the rest of
        # this Python 3 migration (py3 shebang, dict.items(), urllib.parse).
        super().__init__(message)


def http_get(request_url, auth=None, headers=None, verify_peer_certificate=True, proxies=None):
def http_get(request_url, auth=None, headers=None, verify_peer_certificate=True, proxies=None, cookies=None):
""" Requests a HTTP url and returns a requested JSON response.

:param request_url: HTTP URL to request.
Expand All @@ -33,7 +37,7 @@ def http_get(request_url, auth=None, headers=None, verify_peer_certificate=True,
:returns: JSON response.
:raises: ConfluenceException in the case of the server does not answer HTTP code 200.
"""
response = requests.get(request_url, auth=auth, headers=headers, verify=verify_peer_certificate, proxies=proxies)
response = requests.get(request_url, auth=auth, headers=headers, verify=verify_peer_certificate, proxies=proxies, cookies=cookies)
if 200 == response.status_code:
return response.json()
else:
Expand All @@ -42,7 +46,7 @@ def http_get(request_url, auth=None, headers=None, verify_peer_certificate=True,


def http_download_binary_file(request_url, file_path, auth=None, headers=None, verify_peer_certificate=True,
proxies=None):
proxies=None, cookies=None):
""" Requests a HTTP url to save a file on the local filesystem.

:param request_url: Requested HTTP URL.
Expand All @@ -54,7 +58,7 @@ def http_download_binary_file(request_url, file_path, auth=None, headers=None, v
:raises: ConfluenceException in the case of the server does not answer with HTTP code 200.
"""
response = requests.get(request_url, stream=True, auth=auth, headers=headers, verify=verify_peer_certificate,
proxies=proxies)
proxies=proxies, cookies=cookies)
if 200 == response.status_code:
with open(file_path, 'wb') as downloaded_file:
response.raw.decode_content = True
Expand All @@ -75,7 +79,7 @@ def write_2_file(path, content):
"""
try:
with open(path, 'w') as the_file:
the_file.write(content.encode('utf8'))
the_file.write(content)
except:
print("File could not be written")

Expand All @@ -97,7 +101,7 @@ def write_html_2_file(path, title, content, html_template, additional_headers=No
# Note: One backslash has to be escaped with two avoid that backslashes are interpreted as escape chars
replacements = {'title': title, 'content': content, 'additional_headers': additional_html_headers}

for placeholder, replacement in replacements.iteritems():
for placeholder, replacement in replacements.items():
regex_placeholder = r'{%\s*' + placeholder + r'\s*%\}'
try:
html_content = re.sub(regex_placeholder, replacement.replace('\\', '\\\\'), html_content,
Expand All @@ -124,7 +128,7 @@ def decode_url(encoded_url):
:param encoded_url: Encoded URL.
:returns: Decoded URL.
"""
return urllib.unquote(encoded_url.encode('utf8')).decode('utf8')
return urllib.parse.unquote(encoded_url.encode('utf8'))


def encode_url(decoded_url):
Expand All @@ -133,7 +137,7 @@ def encode_url(decoded_url):
:param decoded_url: Decoded URL.
:returns: Encoded URL.
"""
return urllib.quote(decoded_url.encode('utf8')).encode('utf8')
return urllib.parse.quote(decoded_url.encode('utf8'))


def is_file_format(file_name, file_extensions):
Expand Down