$NetBSD: patch-Lib_test_test__urlparse.py,v 1.3 2023/05/29 23:33:48 gutteridge Exp $ Fix CVE-2021-23336: Add `separator` argument to parse_qs; warn with default Via Fedora: https://src.fedoraproject.org/rpms/python2.7/blob/rawhide/f/00359-CVE-2021-23336.patch Fix CVE-2022-0391: urlparse does not sanitize URLs containing ASCII newline and tabs Via Fedora: https://src.fedoraproject.org/rpms/python2.7/raw/40dd05e5d77dbfa81777c9f84b704bc2239bf710/f/00377-CVE-2022-0391.patch Fix CVE-2023-24329: Add more sanitizing to respect the "Remove any leading C0 control or space from input" rule Via Fedora: https://src.fedoraproject.org/rpms/python2.7/c/3f00cdccd59ef2955a7f4b4c42bb59c631cce4c1.patch --- Lib/test/test_urlparse.py.orig 2020-04-19 21:13:39.000000000 +0000 +++ Lib/test/test_urlparse.py @@ -3,6 +3,12 @@ import sys import unicodedata import unittest import urlparse +from test.support import EnvironmentVarGuard +from warnings import catch_warnings, filterwarnings +import tempfile +import contextlib +import os.path +import shutil RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" @@ -24,16 +30,29 @@ parse_qsl_test_cases = [ ("&a=b", [('a', 'b')]), ("a=a+b&b=b+c", [('a', 'a b'), ('b', 'b c')]), ("a=1&a=2", [('a', '1'), ('a', '2')]), +] + +parse_qsl_test_cases_semicolon = [ (";", []), (";;", []), (";a=b", [('a', 'b')]), ("a=a+b;b=b+c", [('a', 'a b'), ('b', 'b c')]), ("a=1;a=2", [('a', '1'), ('a', '2')]), - (b";", []), - (b";;", []), - (b";a=b", [(b'a', b'b')]), - (b"a=a+b;b=b+c", [(b'a', b'a b'), (b'b', b'b c')]), - (b"a=1;a=2", [(b'a', b'1'), (b'a', b'2')]), +] + +parse_qsl_test_cases_legacy = [ + ("a=1;a=2&a=3", [('a', '1'), ('a', '2'), ('a', '3')]), + ("a=1;b=2&c=3", [('a', '1'), ('b', '2'), ('c', '3')]), + ("a=1&b=2&c=3;", [('a', '1'), ('b', '2'), ('c', '3')]), +] + +parse_qsl_test_cases_warn = [ + (";a=b", [(';a', 'b')]), + ("a=a+b;b=b+c", [('a', 'a b;b=b c')]), + (b";a=b", [(b';a', b'b')]), + (b"a=a+b;b=b+c", [(b'a', b'a b;b=b c')]), + ("a=1;a=2&a=3", [('a', '1;a=2'), ('a', '3')]), + (b"a=1;a=2&a=3", [(b'a', b'1;a=2'), (b'a', b'3')]), ] parse_qs_test_cases = [ @@ -57,6 +76,9 @@ parse_qs_test_cases = [ (b"&a=b", {b'a': [b'b']}), (b"a=a+b&b=b+c", {b'a': [b'a b'], b'b': [b'b c']}), (b"a=1&a=2", {b'a': [b'1', b'2']}), +] + +parse_qs_test_cases_semicolon = [ (";", {}), (";;", {}), (";a=b", {'a': ['b']}), @@ -69,6 +91,24 @@ parse_qs_test_cases = [ (b"a=1;a=2", {b'a': [b'1', b'2']}), ] +parse_qs_test_cases_legacy = [ + ("a=1;a=2&a=3", {'a': ['1', '2', '3']}), + ("a=1;b=2&c=3", {'a': ['1'], 'b': ['2'], 'c': ['3']}), + ("a=1&b=2&c=3;", {'a': ['1'], 'b': ['2'], 'c': ['3']}), + (b"a=1;a=2&a=3", {b'a': [b'1', b'2', b'3']}), + (b"a=1;b=2&c=3", {b'a': [b'1'], b'b': [b'2'], b'c': [b'3']}), + (b"a=1&b=2&c=3;", {b'a': [b'1'], b'b': [b'2'], b'c': [b'3']}), +] + +parse_qs_test_cases_warn = [ + (";a=b", {';a': ['b']}), + ("a=a+b;b=b+c", {'a': ['a b;b=b c']}), + (b";a=b", {b';a': [b'b']}), + (b"a=a+b;b=b+c", {b'a':[ b'a b;b=b c']}), + ("a=1;a=2&a=3", {'a': ['1;a=2', '3']}), + (b"a=1;a=2&a=3", {b'a': [b'1;a=2', b'3']}), +] + class UrlParseTestCase(unittest.TestCase): def checkRoundtrips(self, url, parsed, split): @@ -141,6 +181,40 @@ class UrlParseTestCase(unittest.TestCase self.assertEqual(result, expect_without_blanks, "Error parsing %r" % orig) + def test_qs_default_warn(self): + for orig, expect in parse_qs_test_cases_warn: + with catch_warnings(record=True) as w: + filterwarnings(action='always', + category=urlparse._QueryStringSeparatorWarning) + result = urlparse.parse_qs(orig, keep_blank_values=True) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 1) + self.assertEqual(w[0].category, urlparse._QueryStringSeparatorWarning) + + def test_qsl_default_warn(self): + for orig, expect in parse_qsl_test_cases_warn: + with catch_warnings(record=True) as w: + filterwarnings(action='always', + category=urlparse._QueryStringSeparatorWarning) + result = urlparse.parse_qsl(orig, keep_blank_values=True) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 1) + self.assertEqual(w[0].category, urlparse._QueryStringSeparatorWarning) + + def test_default_qs_no_warnings(self): + for orig, expect in parse_qs_test_cases: + with catch_warnings(record=True) as w: + result = urlparse.parse_qs(orig, keep_blank_values=True) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + + def test_default_qsl_no_warnings(self): + for orig, expect in parse_qsl_test_cases: + with catch_warnings(record=True) as w: + result = urlparse.parse_qsl(orig, keep_blank_values=True) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + def test_roundtrips(self): testcases = [ ('file:///tmp/junk.txt', @@ -544,6 +618,112 @@ class UrlParseTestCase(unittest.TestCase self.assertEqual(p1.path, '863-1234') self.assertEqual(p1.params, 'phone-context=+1-914-555') + def test_urlsplit_remove_unsafe_bytes(self): + # Remove ASCII tabs and newlines from input, for http common case scenario. + url = "h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + p = urlparse.urlsplit(url) + self.assertEqual(p.scheme, "http") + self.assertEqual(p.netloc, "www.python.org") + self.assertEqual(p.path, "/javascript:alert('msg')/") + self.assertEqual(p.query, "query=something") + self.assertEqual(p.fragment, "fragment") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, "www.python.org") + self.assertEqual(p.port, None) + self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment") + + # Remove ASCII tabs and newlines from input as bytes, for http common case scenario. + url = b"h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + p = urlparse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"www.python.org") + self.assertEqual(p.path, b"/javascript:alert('msg')/") + self.assertEqual(p.query, b"query=something") + self.assertEqual(p.fragment, b"fragment") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, None) + self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/?query=something#fragment") + + # any scheme + url = "x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + p = urlparse.urlsplit(url) + self.assertEqual(p.geturl(), "x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment") + + # Remove ASCII tabs and newlines from input as bytes, any scheme. + url = b"x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + p = urlparse.urlsplit(url) + self.assertEqual(p.geturl(), b"x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment") + + # Unsafe bytes is not returned from urlparse cache. + # scheme is stored after parsing, sending an scheme with unsafe bytes *will not* return an unsafe scheme + url = "https://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + scheme = "htt\nps" + for _ in range(2): + p = urlparse.urlsplit(url, scheme=scheme) + self.assertEqual(p.scheme, "https") + self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment") + + def test_urlsplit_strip_url(self): + noise = "".join([chr(i) for i in range(0, 0x20 + 1)]) + base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" + + url = noise.decode("utf-8") + base_url + p = urlparse.urlsplit(url) + self.assertEqual(p.scheme, "http") + self.assertEqual(p.netloc, "User:Pass@www.python.org:080") + self.assertEqual(p.path, "/doc/") + self.assertEqual(p.query, "query=yes") + self.assertEqual(p.fragment, "frag") + self.assertEqual(p.username, "User") + self.assertEqual(p.password, "Pass") + self.assertEqual(p.hostname, "www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), base_url) + + url = noise + base_url.encode("utf-8") + p = urlparse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"User:Pass@www.python.org:080") + self.assertEqual(p.path, b"/doc/") + self.assertEqual(p.query, b"query=yes") + self.assertEqual(p.fragment, b"frag") + self.assertEqual(p.username, b"User") + self.assertEqual(p.password, b"Pass") + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), base_url.encode("utf-8")) + + # Test that trailing space is preserved as some applications rely on + # this within query strings. + query_spaces_url = "https://www.python.org:88/doc/?query= " + p = urlparse.urlsplit(noise.decode("utf-8") + query_spaces_url) + self.assertEqual(p.scheme, "https") + self.assertEqual(p.netloc, "www.python.org:88") + self.assertEqual(p.path, "/doc/") + self.assertEqual(p.query, "query= ") + self.assertEqual(p.port, 88) + self.assertEqual(p.geturl(), query_spaces_url) + + p = urlparse.urlsplit("www.pypi.org ") + # That "hostname" gets considered a "path" due to the + # trailing space and our existing logic... YUCK... + # and re-assembles via geturl aka unurlsplit into the original. + # django.core.validators.URLValidator (at least through v3.2) relies on + # this, for better or worse, to catch it in a ValidationError via its + # regular expressions. + # Here we test the basic round trip concept of such a trailing space. + self.assertEqual(urlparse.urlunsplit(p), "www.pypi.org ") + + # with scheme as cache-key + url = "//www.python.org/" + scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8") + for _ in range(2): + p = urlparse.urlsplit(url, scheme=scheme) + self.assertEqual(p.scheme, "https") + self.assertEqual(p.geturl(), "https://www.python.org/") def test_attributes_bad_port(self): """Check handling of non-integer ports.""" @@ -626,6 +806,132 @@ class UrlParseTestCase(unittest.TestCase self.assertEqual(urlparse.urlparse("http://www.python.org:80"), ('http','www.python.org:80','','','','')) + def test_parse_qs_separator_bytes(self): + expected = {b'a': [b'1'], b'b': [b'2']} + + result = urlparse.parse_qs(b'a=1;b=2', separator=b';') + self.assertEqual(result, expected) + result = urlparse.parse_qs(b'a=1;b=2', separator=';') + self.assertEqual(result, expected) + result = urlparse.parse_qs('a=1;b=2', separator=';') + self.assertEqual(result, {'a': ['1'], 'b': ['2']}) + + @contextlib.contextmanager + def _qsl_sep_config(self, sep): + """Context for the given parse_qsl default separator configured in config file""" + old_filename = urlparse._QS_SEPARATOR_CONFIG_FILENAME + urlparse._default_qs_separator = None + try: + tmpdirname = tempfile.mkdtemp() + filename = os.path.join(tmpdirname, 'conf.cfg') + with open(filename, 'w') as file: + file.write('[parse_qs]\n') + file.write('PYTHON_URLLIB_QS_SEPARATOR = {}'.format(sep)) + urlparse._QS_SEPARATOR_CONFIG_FILENAME = filename + yield + finally: + urlparse._QS_SEPARATOR_CONFIG_FILENAME = old_filename + urlparse._default_qs_separator = None + shutil.rmtree(tmpdirname) + + def test_parse_qs_separator_semicolon(self): + for orig, expect in parse_qs_test_cases_semicolon: + result = urlparse.parse_qs(orig, separator=';') + self.assertEqual(result, expect, "Error parsing %r" % orig) + with EnvironmentVarGuard() as environ, catch_warnings(record=True) as w: + environ['PYTHON_URLLIB_QS_SEPARATOR'] = ';' + result = urlparse.parse_qs(orig) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + with self._qsl_sep_config(';'), catch_warnings(record=True) as w: + result = urlparse.parse_qs(orig) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + + def test_parse_qsl_separator_semicolon(self): + for orig, expect in parse_qsl_test_cases_semicolon: + result = urlparse.parse_qsl(orig, separator=';') + self.assertEqual(result, expect, "Error parsing %r" % orig) + with EnvironmentVarGuard() as environ, catch_warnings(record=True) as w: + environ['PYTHON_URLLIB_QS_SEPARATOR'] = ';' + result = urlparse.parse_qsl(orig) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + with self._qsl_sep_config(';'), catch_warnings(record=True) as w: + result = urlparse.parse_qsl(orig) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + + def test_parse_qs_separator_legacy(self): + for orig, expect in parse_qs_test_cases_legacy: + with EnvironmentVarGuard() as environ, catch_warnings(record=True) as w: + environ['PYTHON_URLLIB_QS_SEPARATOR'] = 'legacy' + result = urlparse.parse_qs(orig) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + with self._qsl_sep_config('legacy'), catch_warnings(record=True) as w: + result = urlparse.parse_qs(orig) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + + def test_parse_qsl_separator_legacy(self): + for orig, expect in parse_qsl_test_cases_legacy: + with EnvironmentVarGuard() as environ, catch_warnings(record=True) as w: + environ['PYTHON_URLLIB_QS_SEPARATOR'] = 'legacy' + result = urlparse.parse_qsl(orig) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + with self._qsl_sep_config('legacy'), catch_warnings(record=True) as w: + result = urlparse.parse_qsl(orig) + self.assertEqual(result, expect, "Error parsing %r" % orig) + self.assertEqual(len(w), 0) + + def test_parse_qs_separator_bad_value_env_or_config(self): + for bad_sep in '', 'abc', 'safe', '&;', 'SEP': + with EnvironmentVarGuard() as environ, catch_warnings(record=True) as w: + environ['PYTHON_URLLIB_QS_SEPARATOR'] = bad_sep + with self.assertRaises(ValueError): + urlparse.parse_qsl('a=1;b=2') + with self._qsl_sep_config('bad_sep'), catch_warnings(record=True) as w: + with self.assertRaises(ValueError): + urlparse.parse_qsl('a=1;b=2') + + def test_parse_qs_separator_bad_value_arg(self): + for bad_sep in True, {}, '': + with self.assertRaises(ValueError): + urlparse.parse_qsl('a=1;b=2', separator=bad_sep) + + def test_parse_qs_separator_num_fields(self): + for qs, sep in ( + ('a&b&c', '&'), + ('a;b;c', ';'), + ('a&b;c', 'legacy'), + ): + with EnvironmentVarGuard() as environ, catch_warnings(record=True) as w: + if sep != 'legacy': + with self.assertRaises(ValueError): + urlparse.parse_qsl(qs, separator=sep, max_num_fields=2) + if sep: + environ['PYTHON_URLLIB_QS_SEPARATOR'] = sep + with self.assertRaises(ValueError): + urlparse.parse_qsl(qs, max_num_fields=2) + + def test_parse_qs_separator_priority(self): + # env variable trumps config file + with self._qsl_sep_config('~'), EnvironmentVarGuard() as environ: + environ['PYTHON_URLLIB_QS_SEPARATOR'] = '!' + result = urlparse.parse_qs('a=1!b=2~c=3') + self.assertEqual(result, {'a': ['1'], 'b': ['2~c=3']}) + # argument trumps config file + with self._qsl_sep_config('~'): + result = urlparse.parse_qs('a=1$b=2~c=3', separator='$') + self.assertEqual(result, {'a': ['1'], 'b': ['2~c=3']}) + # argument trumps env variable + with EnvironmentVarGuard() as environ: + environ['PYTHON_URLLIB_QS_SEPARATOR'] = '~' + result = urlparse.parse_qs('a=1$b=2~c=3', separator='$') + self.assertEqual(result, {'a': ['1'], 'b': ['2~c=3']}) + def test_urlsplit_normalization(self): # Certain characters should never occur in the netloc, # including under normalization.