Skip to content

Commit

Permalink
[utils] Rework URL path munging for ., .. components
Browse files Browse the repository at this point in the history
* move processing to YoutubeDLHandler
* also process `Location` header for redirect
* use tests from yt-dlp/yt-dlp#7662
  • Loading branch information
dirkf committed Jul 29, 2023
1 parent e7926ae commit abef534
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 41 deletions.
14 changes: 14 additions & 0 deletions test/test_http.py
Expand Up @@ -180,6 +180,12 @@ def gzip_compress(p):
respond()
elif self.path == '/%c7%9f':
respond()
elif self.path == '/redirect_dotsegments':
self.send_response(301)
# redirect to /headers but with dot segments before
self.send_header('Location', '/a/b/./../../headers')
self.send_header('Content-Length', '0')
self.end_headers()
elif self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
Expand Down Expand Up @@ -489,6 +495,14 @@ def test_unsupported_encoding(self):
self.assertEqual(res.headers.get('Content-Encoding'), 'unsupported')
self.assertEqual(res.read(), b'raw')

def test_remove_dot_segments(self):
with FakeYDL() as ydl:
res = ydl.urlopen(sanitized_Request(self._test_url('a/b/./../../headers')))
self.assertEqual(compat_urllib_parse.urlparse(res.geturl()).path, '/headers')

res = ydl.urlopen(sanitized_Request(self._test_url('redirect_dotsegments')))
self.assertEqual(compat_urllib_parse.urlparse(res.geturl()).path, '/headers')


def _build_proxy_handler(name):
class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
Expand Down
29 changes: 28 additions & 1 deletion test/test_utils.py
Expand Up @@ -64,6 +64,7 @@
parse_age_limit,
parse_duration,
parse_filesize,
parse_codecs,
parse_count,
parse_iso8601,
parse_resolution,
Expand Down Expand Up @@ -114,7 +115,7 @@
cli_option,
cli_valueless_option,
cli_bool_option,
parse_codecs,
YoutubeDLHandler,
)
from youtube_dl.compat import (
compat_chr,
Expand Down Expand Up @@ -905,6 +906,32 @@ def test_escape_url(self):
)
self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')

def test_remove_dot_segments(self):

def remove_dot_segments(p):
q = '' if p.startswith('/') else '/'
p = 'http://example.com' + q + p
p = compat_urlparse.urlsplit(YoutubeDLHandler._fix_path(p)).path
return p[1:] if q else p

self.assertEqual(remove_dot_segments('/a/b/c/./../../g'), '/a/g')
self.assertEqual(remove_dot_segments('mid/content=5/../6'), 'mid/6')
self.assertEqual(remove_dot_segments('/ad/../cd'), '/cd')
self.assertEqual(remove_dot_segments('/ad/../cd/'), '/cd/')
self.assertEqual(remove_dot_segments('/..'), '/')
self.assertEqual(remove_dot_segments('/./'), '/')
self.assertEqual(remove_dot_segments('/./a'), '/a')
self.assertEqual(remove_dot_segments('/abc/./.././d/././e/.././f/./../../ghi'), '/ghi')
self.assertEqual(remove_dot_segments('/'), '/')
self.assertEqual(remove_dot_segments('/t'), '/t')
self.assertEqual(remove_dot_segments('t'), 't')
self.assertEqual(remove_dot_segments(''), '')
self.assertEqual(remove_dot_segments('/../a/b/c'), '/a/b/c')
self.assertEqual(remove_dot_segments('../a'), 'a')
self.assertEqual(remove_dot_segments('./a'), 'a')
self.assertEqual(remove_dot_segments('.'), '')
self.assertEqual(remove_dot_segments('////'), '////')

def test_js_to_json_vars_strings(self):
self.assertDictEqual(
json.loads(js_to_json(
Expand Down
23 changes: 0 additions & 23 deletions youtube_dl/YoutubeDL.py
Expand Up @@ -71,7 +71,6 @@
format_bytes,
formatSeconds,
GeoRestrictedError,
HEADRequest,
int_or_none,
ISO3166Utils,
join_nonempty,
Expand All @@ -88,7 +87,6 @@
preferredencoding,
prepend_extension,
process_communicate_or_kill,
PUTRequest,
register_socks_protocols,
render_table,
replace_extension,
Expand Down Expand Up @@ -2460,27 +2458,6 @@ def urlopen(self, req):
""" Start an HTTP download """
if isinstance(req, compat_basestring):
req = sanitized_Request(req)
# an embedded /../ sequence is not automatically handled by urllib2
# see https://github.com/yt-dlp/yt-dlp/issues/3355
url = req.get_full_url()
parts = url.partition('/../')
if parts[1]:
url = compat_urllib_parse.urljoin(parts[0] + parts[1][:1], parts[1][1:] + parts[2])
if url:
# worse, URL path may have initial /../ against RFCs: work-around
# by stripping such prefixes, like eg Firefox
parts = compat_urllib_parse.urlsplit(url)
path = parts.path
while path.startswith('/../'):
path = path[3:]
url = parts._replace(path=path).geturl()
# get a new Request with the munged URL
if url != req.get_full_url():
req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get(
req.get_method(), compat_urllib_request.Request)
req = req_type(
url, data=req.data, headers=dict(req.header_items()),
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
return self._opener.open(req, timeout=self._socket_timeout)

def print_debug_header(self):
Expand Down
70 changes: 53 additions & 17 deletions youtube_dl/utils.py
Expand Up @@ -2678,17 +2678,52 @@ def brotli(data):
def compress(data):
return data and ncompress.decompress(data)

    @staticmethod
    def _fix_path(url):
        """
        Resolve embedded `.` and `..` path segments in `url` and return
        the fixed URL (query/fragment/netloc are left untouched).

        An embedded /../ or /./ sequence is not automatically handled by
        urllib2: see https://github.com/yt-dlp/yt-dlp/issues/3355
        """
        parsed_url = compat_urllib_parse.urlsplit(url)
        path = parsed_url.path
        # temporarily append '/' so a trailing '/.' or '/..' segment is also
        # matched by the '/./' and '/../' searches below
        if not path.endswith('/'):
            path += '/'
        # locate the first dot segment ('/./' checked before '/../')
        parts = path.partition('/./')
        if not parts[1]:
            parts = path.partition('/../')
        if parts[1]:
            # delegate resolution of everything from the first dot segment
            # onward to urljoin(), relative to the path prefix before it;
            # drop the sentinel '/' added above unless the original path
            # really ended with '/'
            path = compat_urllib_parse.urljoin(
                parts[0] + parts[1][:1],
                parts[1][1:] + (parts[2] if parsed_url.path.endswith('/') else parts[2][:-1]))
            url = parsed_url._replace(path=path).geturl()
        if '/.' in url:
            # worse, URL path may have initial /../ against RFCs: work-around
            # by stripping such prefixes, like eg Firefox
            # NOTE(review): this branch restarts from the *original* path
            # (parsed_url.path), not the urljoin() result above — it appears
            # to rely on the first branch leaving '/.' only when the dot
            # segments are leading ones; confirm on old Pythons whose
            # urljoin() keeps leading '..' segments
            path = parsed_url.path + '/'
            while path.startswith('/.'):
                if path.startswith('/../'):
                    path = path[3:]
                elif path.startswith('/./'):
                    path = path[2:]
                else:
                    break
            # remove the sentinel '/' appended above
            path = path[:-1]
            # restore a leading '/' if the stripping loop consumed it
            if not path.startswith('/') and parsed_url.path.startswith('/'):
                path = '/' + path
            url = parsed_url._replace(path=path).geturl()
        return url

def http_request(self, req):
# According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
# always respected by websites, some tend to give out URLs with non percent-encoded
url = req.get_full_url()
# resolve embedded . and ..
url_fixed = self._fix_path(url)
# According to RFC 3986, URLs can not contain non-ASCII characters; however this is not
# always respected by websites: some tend to give out URLs with non percent-encoded
# non-ASCII characters (see telemb.py, ard.py [#3412])
# urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
# To work around aforementioned issue we will replace request's original URL with
# percent-encoded one
# Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
# the code of this workaround has been moved here from YoutubeDL.urlopen()
url = req.get_full_url()
url_escaped = escape_url(url)
url_escaped = escape_url(url_fixed)

# Substitute URL if any change after escaping
if url != url_escaped:
Expand All @@ -2702,10 +2737,13 @@ def http_request(self, req):

req.headers = handle_youtubedl_headers(req.headers)

if sys.version_info < (2, 7) and '#' in req.get_full_url():
# Python 2.6 is brain-dead when it comes to fragments
req._Request__original = req._Request__original.partition('#')[0]
req._Request__r_type = req._Request__r_type.partition('#')[0]
if sys.version_info < (2, 7):
# avoid possible race where __r_type may be unset
req.get_type()
if '#' in req.get_full_url():
# Python 2.6 is brain-dead when it comes to fragments
req._Request__original = req._Request__original.partition('#')[0]
req._Request__r_type = req._Request__r_type.partition('#')[0]

# Use the totally undocumented AbstractHTTPHandler per
# https://github.com/yt-dlp/yt-dlp/pull/4158
Expand Down Expand Up @@ -2775,10 +2813,13 @@ def encodings(hdrs):
if sys.version_info >= (3, 0):
location = location.encode('iso-8859-1')
location = location.decode('utf-8')
location_escaped = escape_url(location)
# resolve embedded . and ..
location_fixed = self._fix_path(location)
location_escaped = escape_url(location_fixed)
if location != location_escaped:
del resp.headers['Location']
if sys.version_info < (3, 0):
# if sys.version_info < (3, 0):
if not isinstance(location_escaped, str):
location_escaped = location_escaped.encode('utf-8')
resp.headers['Location'] = location_escaped
return resp
Expand Down Expand Up @@ -4248,13 +4289,8 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
req_headers.update(headers)
req_data = data if data is not None else req.data
req_url = update_url_query(url or req.get_full_url(), query)
req_get_method = req.get_method()
if req_get_method == 'HEAD':
req_type = HEADRequest
elif req_get_method == 'PUT':
req_type = PUTRequest
else:
req_type = compat_urllib_request.Request
req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get(
req.get_method(), compat_urllib_request.Request)
new_req = req_type(
req_url, data=req_data, headers=req_headers,
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
Expand Down

2 comments on commit abef534

@MysticMerlin
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't find/see anywhere else that I can ask a simple question which may have a simple answer.
I hope I'm not breaking any laws on here by asking the following.
There are problems with the youtube-dl/yt-dlp software, the main one being 'ERROR: Unable to extract params'; I believe it is being worked on and, like a lot of people, I'm waiting for a fix.
Obviously, until a fix has been done I can't do anything, so I just wanted to get an update on how things are going.
The two people that could give me & other people an update on the progress are dirkf & Vangelis66 but I can't see how to message them other than maybe here. Thanks.

@dirkf
Copy link
Contributor Author

@dirkf dirkf commented on abef534 Jul 31, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not the right place. Follow the relevant issue, or create one, subject to #30839.

Please sign in to comment.