Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly handling unsupported content #359

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
54 changes: 52 additions & 2 deletions splash/browser_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from splash.qtutils import (OPERATION_QT_CONSTANTS, WrappedSignal, qt2py,
qurl2ascii, to_qurl)
from splash.render_options import validate_size_str
from splash.qwebpage import SplashQWebPage, SplashQWebView
from splash.qwebpage import SplashQWebPage, SplashQWebView, RenderErrorInfo
from splash.exceptions import JsError, OneShotCallbackError, ScriptError
from splash.utils import to_bytes
from splash.jsutils import (
Expand Down Expand Up @@ -71,6 +71,9 @@ def __init__(self, network_manager, splash_proxy_factory, verbosity,
self._callback_proxies_to_cancel = weakref.WeakSet()
self._js_console = None
self._autoload_scripts = []
self._is_unsupported_content = False
self._unsupported_content_reply = None
self._load_finished_after_unsupported_content_ready = False

self.logger = _BrowserTabLogger(uid=self._uid, verbosity=verbosity)
self._init_webpage(verbosity, network_manager, splash_proxy_factory,
Expand Down Expand Up @@ -140,6 +143,8 @@ def _setup_webpage_events(self):
self.web_page.mainFrame().loadFinished.connect(self._on_load_finished)
self.web_page.mainFrame().urlChanged.connect(self._on_url_changed)
self.web_page.mainFrame().javaScriptWindowObjectCleared.connect(self._on_javascript_window_object_cleared)
self.web_page.setForwardUnsupportedContent(True)
self.web_page.unsupportedContent.connect(self._on_unsupported_content)
self.logger.add_web_page(self.web_page)

def return_result(self, result):
Expand Down Expand Up @@ -379,6 +384,15 @@ def _on_load_finished(self, ok):
This callback is called for all web_page.mainFrame()
loadFinished events.
"""
if self._is_unsupported_content:
if self._unsupported_content_reply.isRunning():
# XXX: We'll come back later when download finishes
self.logger.log(
'Still receving unsupported content', min_level=3)
return
else:
self._load_finished_after_unsupported_content_ready = True
self.logger.log('Unsupported content received', min_level=3)
if self.web_page.maybe_redirect(ok):
self.logger.log("Redirect or other non-fatal error detected", min_level=2)
return
Expand Down Expand Up @@ -426,7 +440,11 @@ def _on_content_ready(self, ok, callback, errback, callback_id):
"""
This method is called when a QWebPage finishes loading its contents.
"""
if self.web_page.maybe_redirect(ok):
if self._is_unsupported_content:
if self._unsupported_content_reply.isRunning():
# XXX: We'll come back later when download finishes
return
elif self.web_page.maybe_redirect(ok):
# XXX: It assumes loadFinished will be called again because
# redirect happens. If redirect is detected improperly,
# loadFinished won't be called again, and Splash will return
Expand All @@ -438,6 +456,16 @@ def _on_content_ready(self, ok, callback, errback, callback_id):

if self.web_page.is_ok(ok):
callback()
elif self._is_unsupported_content:
# XXX: Error downloading unsupported content.
# `self.web_page.error_info` shall be `None` now
error_info = RenderErrorInfo(
'Network',
int(self._unsupported_content_reply.error()),
six.text_type(self._unsupported_content_reply.errorString()),
six.text_type(self._unsupported_content_reply.url().url())
)
errback(error_info)
elif self.web_page.error_loading(ok):
# XXX: maybe return a meaningful error page instead of generic
# error message?
Expand Down Expand Up @@ -512,6 +540,28 @@ def _on_url_changed(self, url):
self.web_page.har.store_redirect(six.text_type(url.toString()))
self._cancel_timers(self._timers_to_cancel_on_redirect)

def _on_unsupported_content_finished(self):
self.logger.log('Unsupported content finished', min_level=3)
if not self._load_finished_after_unsupported_content_ready:
# XXX: The unsupported content reply might have finished before the
# original loadFinished signal emits. In such cases we do not want
# the same signal twice.
if not self._unsupported_content_reply.error():
self.web_page.mainFrame().loadFinished.emit(True)
else:
self.web_page.mainFrame().loadFinished.emit(False)

def _on_unsupported_content(self, reply):
self.logger.log('Unsupported content detected', min_level=3)
self._is_unsupported_content = True
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this attribute is set per browser tab and browser tab can be used to download multiple resources, if you set this attribute here will it affect other requests going through same webpage? e.g. script doing

function main (splash)
    splash:go("url_with_unsupported_content") # this will set _is_unsupported_content to True
    splash:go("url with supported content") # will this go through?
end

other thing worth noting is that there could be multiple resources downloaded when rendering so e.g.

splash:go("some page")

may issue 10 requests, first 6 will have supported content, but 7th will have unsupported content, what will happen with resources 7-10? will they be downloaded all right?

If only one among many resources is unsupported (e.g. there are many stylesheets and only one is corrupted) what response will user get?

In mockserver there is child resource called "subresources" you could probably add another similar test resource similar that would try to fetch one resource with unsupported content while rendering.

self._unsupported_content_reply = reply
if reply.isFinished():
# Already finished. The content might be very short.
self.logger.log('Unsupported content already finished', min_level=3)
self._on_unsupported_content_finished()
else:
reply.finished.connect(self._on_unsupported_content_finished)

def run_js_file(self, filename, handle_errors=True):
"""
Load JS library from file ``filename`` to the current frame.
Expand Down
24 changes: 24 additions & 0 deletions splash/tests/mockserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,28 @@ def render_GET(self, request):
return b"ok"


class RawBytes(Resource):

def render_GET(self, request):
body_length = int(request.args.get(b'length', [1024])[0])
body = b'\x00' * body_length
claim_length = int(request.args.get(b'claim_length', [body_length])[0])
content = b'\n'.join([
b'HTTP/1.1 200 OK',
('Content-Length: %d' % claim_length).encode('utf8'),
b'',
body,
])
request.channel.transport.write(content)
if b'delayed_abort' in request.args:
reactor.callLater(1, request.channel.transport.abortConnection)
elif b'abort' in request.args:
request.channel.transport.abortConnection()
else:
request.channel.transport.loseConnection()
return NOT_DONE_YET


class Index(Resource):
isLeaf = True

Expand Down Expand Up @@ -820,6 +842,8 @@ def __init__(self, http_port, https_port, proxy_port):
self.putChild(b"bad-content-type", InvalidContentTypeResource())
self.putChild(b"bad-content-type2", InvalidContentTypeResource2())

self.putChild(b"raw-bytes", RawBytes())

self.putChild(b"jsredirect", JsRedirect())
self.putChild(b"jsredirect-to", JsRedirectTo())
self.putChild(b"jsredirect-slowimage", JsRedirectSlowImage())
Expand Down
15 changes: 15 additions & 0 deletions splash/tests/test_render.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,21 @@ def test_invalid_wait(self):
'wait': wait})
self.assertStatusCode(r, 400)

def test_unsupported_content(self):
cases = [
# Short body (Can be received together with the headers)
("raw-bytes?length=16", 200),
# Short body with error
("raw-bytes?length=16&claim_length=32&abort=1", 502),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this something webkit cannot handle? why? is it because of different content-length and actual length of response? Why do you need to abort() here? cant you just return this invalid response?

I read in docs that

This signal is emitted when WebKit cannot handle a link the user navigated to or a web server's response includes a "Content-Disposition" header with the 'attachment' directive.

so perhaps also worth adding test for content-disposition?

I also wonder about 502 status code, this is Bad Gateway so if there is some load balancer in front of splash, e.g. nginx, user may confuse 502 from splash with 502 from some load balancer. Is 502 best choice for status code? Are there any alternatives?

# Long body (May not be received together with the headers)
("raw-bytes?length=1000000", 200),
# Long body with error
("raw-bytes?length=100&claim_length=200&delayed_abort=1", 502),
]
for url, http_status in cases:
r = self.request({"url": self.mockurl(url)})
self.assertStatusCode(r, http_status)

@pytest.mark.skipif(
not qt_551_plus(),
reason="resource_timeout doesn't work in Qt5 < 5.5.1. See issue #269 for details."
Expand Down