$NetBSD: patch-Lib_urlparse.py,v 1.4 2023/05/29 23:33:48 gutteridge Exp $ Fix CVE-2021-23336: Add `separator` argument to parse_qs; warn with default Via Fedora: https://src.fedoraproject.org/rpms/python2.7/blob/rawhide/f/00359-CVE-2021-23336.patch Fix CVE-2022-0391: urlparse does not sanitize URLs containing ASCII newline and tabs Via Fedora: https://src.fedoraproject.org/rpms/python2.7/raw/40dd05e5d77dbfa81777c9f84b704bc2239bf710/f/00377-CVE-2022-0391.patch Fix CVE-2023-24329: Add more sanitizing to respect the "Remove any leading C0 control or space from input" rule Via Fedora: https://src.fedoraproject.org/rpms/python2.7/c/3f00cdccd59ef2955a7f4b4c42bb59c631cce4c1.patch --- Lib/urlparse.py.orig 2020-04-19 21:13:39.000000000 +0000 +++ Lib/urlparse.py @@ -26,9 +26,14 @@ scenarios for parsing, and for backward parsing quirks from older RFCs are retained. The testcases in test_urlparse.py provides a good indicator of parsing behavior. +The WHATWG URL Parser spec should also be considered. We are not compliant with +it either due to existing user code API behavior expectations (Hyrum's Law). +It serves as a useful guide when making changes. + """ import re +import os __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"] @@ -62,6 +67,13 @@ scheme_chars = ('abcdefghijklmnopqrstuvw '0123456789' '+-.') +# Leading and trailing C0 control and space to be stripped per WHATWG spec. 
+# == "".join([chr(i) for i in range(0, 0x20 + 1)]) +_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ' + +# Unsafe bytes to be removed per WHATWG spec +_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] + MAX_CACHE_SIZE = 20 _parse_cache = {} @@ -184,12 +196,21 @@ def _checknetloc(netloc): "under NFKC normalization" % netloc) +def _remove_unsafe_bytes_from_url(url): + for b in _UNSAFE_URL_BYTES_TO_REMOVE: + url = url.replace(b, "") + return url + def urlsplit(url, scheme='', allow_fragments=True): """Parse a URL into 5 components: <scheme>://<netloc>/<path>?<query>#<fragment> Return a 5-tuple: (scheme, netloc, path, query, fragment). Note that we don't break the components up in smaller bits (e.g. netloc is a single string) and we don't expand % escapes.""" + url = _remove_unsafe_bytes_from_url(url) + scheme = _remove_unsafe_bytes_from_url(scheme) + url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE) + scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) allow_fragments = bool(allow_fragments) key = url, scheme, allow_fragments, type(url), type(scheme) cached = _parse_cache.get(key, None) @@ -382,7 +403,8 @@ def unquote(s): append(item) return ''.join(res) -def parse_qs(qs, keep_blank_values=0, strict_parsing=0, max_num_fields=None): +def parse_qs(qs, keep_blank_values=0, strict_parsing=0, max_num_fields=None, + separator=None): """Parse a query given as a string argument.
Arguments: @@ -405,14 +427,23 @@ def parse_qs(qs, keep_blank_values=0, st """ dict = {} for name, value in parse_qsl(qs, keep_blank_values, strict_parsing, - max_num_fields): + max_num_fields, separator): if name in dict: dict[name].append(value) else: dict[name] = [value] return dict -def parse_qsl(qs, keep_blank_values=0, strict_parsing=0, max_num_fields=None): +class _QueryStringSeparatorWarning(RuntimeWarning): + """Warning for using default `separator` in parse_qs or parse_qsl""" + +# The default "separator" for parse_qsl can be specified in a config file. +# It's cached after first read. +_QS_SEPARATOR_CONFIG_FILENAME = '@PKG_SYSCONFDIR@/python/urllib.cfg' +_default_qs_separator = None + +def parse_qsl(qs, keep_blank_values=0, strict_parsing=0, max_num_fields=None, + separator=None): """Parse a query given as a string argument. Arguments: @@ -434,15 +465,72 @@ def parse_qsl(qs, keep_blank_values=0, s Returns a list, as G-d intended. """ + + if (not separator or (not isinstance(separator, (str, bytes)))) and separator is not None: + raise ValueError("Separator must be of type string or bytes.") + + # Used when both "&" and ";" act as separators. (Need a non-string value.) 
+ _legacy = object() + + if separator is None: + global _default_qs_separator + separator = _default_qs_separator + envvar_name = 'PYTHON_URLLIB_QS_SEPARATOR' + if separator is None: + # Set default separator from environment variable + separator = os.environ.get(envvar_name) + config_source = 'environment variable' + if separator is None: + # Set default separator from the configuration file + try: + file = open(_QS_SEPARATOR_CONFIG_FILENAME) + except EnvironmentError: + pass + else: + with file: + import ConfigParser + config = ConfigParser.ConfigParser() + config.readfp(file) + separator = config.get('parse_qs', envvar_name) + _default_qs_separator = separator + config_source = _QS_SEPARATOR_CONFIG_FILENAME + if separator is None: + # The default is '&', but warn if not specified explicitly + if ';' in qs: + from warnings import warn + warn("The default separator of urlparse.parse_qsl and " + + "parse_qs was changed to '&' to avoid a web cache " + + "poisoning issue (CVE-2021-23336). " + + "By default, semicolons no longer act as query field " + + "separators. " + + "See https://access.redhat.com/articles/5860431 for " + + "more details.", + _QueryStringSeparatorWarning, stacklevel=2) + separator = '&' + elif separator == 'legacy': + separator = _legacy + elif len(separator) != 1: + raise ValueError( + '{} (from {}) must contain '.format(envvar_name, config_source) + + '1 character, or "legacy". See ' + + 'https://access.redhat.com/articles/5860431 for more details.' + ) + # If max_num_fields is defined then check that the number of fields # is less than max_num_fields. This prevents a memory exhaustion DOS # attack via post bodies with many fields. 
if max_num_fields is not None: - num_fields = 1 + qs.count('&') + qs.count(';') + if separator is _legacy: + num_fields = 1 + qs.count('&') + qs.count(';') + else: + num_fields = 1 + qs.count(separator) if max_num_fields < num_fields: raise ValueError('Max number of fields exceeded') - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + if separator is _legacy: + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + else: + pairs = [s1 for s1 in qs.split(separator)] r = [] for name_value in pairs: if not name_value and not strict_parsing: