From b5e368ac7e208b2bddd8ae52e350f9d43edecc6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dan=20Michael=20O=2E=20Hegg=C3=B8?= Date: Mon, 9 Apr 2018 00:02:19 +0200 Subject: [PATCH] [#189] Add chunked file uploads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use chunked file uploading for files larger than 1 MB as long as we’re on MediaWiki >= 1.20. --- mwclient/client.py | 110 ++++++++++++++++++++++++++++++++++++++------- mwclient/util.py | 9 ++++ 2 files changed, 103 insertions(+), 16 deletions(-) diff --git a/mwclient/client.py b/mwclient/client.py index 85c50584..54a37c9d 100644 --- a/mwclient/client.py +++ b/mwclient/client.py @@ -17,7 +17,7 @@ import mwclient.errors as errors import mwclient.listing as listing from mwclient.sleep import Sleepers -from mwclient.util import parse_timestamp +from mwclient.util import parse_timestamp, read_in_chunks try: import gzip @@ -119,6 +119,9 @@ def __init__(self, host, path='/w/', ext='.php', pool=None, retry_timeout=30, # Initialization status self.initialized = False + # Upload chunk size in bytes + self.chunk_size = 1048576 + if do_init: try: self.site_init() @@ -612,20 +615,34 @@ def upload(self, file=None, filename=None, description='', ignore=False, if not image.can('upload'): raise errors.InsufficientPermission(filename) - predata = {} - if comment is None: - predata['comment'] = description + comment = description + text = None else: - predata['comment'] = comment - predata['text'] = description + comment = comment + text = description + + if file is not None: + if not hasattr(file, 'read'): + file = open(file, 'rb') + + content_size = file.seek(0, 2) + file.seek(0) + + if self.version[:2] >= (1, 20) and content_size > self.chunk_size: + return self.chunk_upload(file, filename, ignore, comment, text) + + predata = { + 'action': 'upload', + 'format': 'json', + 'filename': filename, + 'comment': comment, + 'text': text, + 'token': image.get_token('edit'), + } if ignore: 
predata['ignorewarnings'] = 'true' - predata['token'] = image.get_token('edit') - predata['action'] = 'upload' - predata['format'] = 'json' - predata['filename'] = filename if url: predata['url'] = url @@ -645,11 +662,7 @@ def upload(self, file=None, filename=None, description='', ignore=False, # Since the filename in Content-Disposition is not interpreted, # we can send some ascii-only dummy name rather than the real # filename, which might contain non-ascii. - file = ('fake-filename', file) - # End of workaround - # ---------------------------------------------------------------- - - files = {'file': file} + files = {'file': ('fake-filename', file)} sleeper = self.sleepers.make() while True: @@ -658,7 +671,72 @@ def upload(self, file=None, filename=None, description='', ignore=False, if not info: info = {} if self.handle_api_result(info, kwargs=predata, sleeper=sleeper): - return info.get('upload', {}) + response = info.get('upload', {}) + break + if file is not None: + file.close() + return response + + def chunk_upload(self, file, filename, ignorewarnings, comment, text): + """Upload a file to the site in chunks. + + This method is called by `Site.upload` if you are connecting to a newer + MediaWiki installation, so it's normally not necessary to call this + method directly. + + Args: + file (file-like object): File object or stream to upload. + filename, ignorewarnings, comment, text: Upload parameters. 
+ """ + image = self.Images[filename] + + content_size = file.seek(0, 2) + file.seek(0) + + params = { + 'action': 'upload', + 'format': 'json', + 'stash': 1, + 'offset': 0, + 'filename': filename, + 'filesize': content_size, + 'token': image.get_token('edit'), + } + if ignorewarnings: + params['ignorewarnings'] = 'true' + + sleeper = self.sleepers.make() + offset = 0 + for chunk in read_in_chunks(file, self.chunk_size): + while True: + data = self.raw_call('api', params, files={'chunk': chunk}) + info = json.loads(data) + if self.handle_api_result(info, kwargs=params, sleeper=sleeper): + response = info.get('upload', {}) + break + + offset += chunk.tell() + chunk.close() + log.debug('%s: Uploaded %d of %d bytes', filename, offset, content_size) + params['filekey'] = response['filekey'] + if response['result'] == 'Continue': + params['offset'] = response['offset'] + elif response['result'] == 'Success': + file.close() + break + else: + # Some kind of error or warning occurred. In any case, we do not + # get the parameters we need to continue, so we should return + # the response now. + file.close() + return response + + del params['action'] + del params['stash'] + del params['offset'] + params['comment'] = comment + params['text'] = text + return self.post('upload', **params) def parse(self, text=None, title=None, page=None, prop=None, redirects=False, mobileformat=False): diff --git a/mwclient/util.py b/mwclient/util.py index bcd7b60c..8855c36d 100644 --- a/mwclient/util.py +++ b/mwclient/util.py @@ -1,7 +1,16 @@ import time +import io def parse_timestamp(t): if t is None or t == '0000-00-00T00:00:00Z': return (0, 0, 0, 0, 0, 0, 0, 0, 0) return time.strptime(t, '%Y-%m-%dT%H:%M:%SZ') + + +def read_in_chunks(stream, chunk_size): + while True: + data = stream.read(chunk_size) + if not data: + break + yield io.BytesIO(data)