123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458 |
- import base64
- import calendar
- import datetime
- import re
- import unicodedata
- import warnings
- from binascii import Error as BinasciiError
- from email.utils import formatdate
- from urllib.parse import (
- ParseResult, SplitResult, _coerce_args, _splitnetloc, _splitparams, quote,
- quote_plus, scheme_chars, unquote, unquote_plus,
- urlencode as original_urlencode, uses_params,
- )
-
- from django.core.exceptions import TooManyFieldsSent
- from django.utils.datastructures import MultiValueDict
- from django.utils.deprecation import RemovedInDjango30Warning
- from django.utils.functional import keep_lazy_text
-
# Pattern for a single entity tag, strong or weak (based on RFC 7232,
# Appendix C). Group 1 captures the whole tag, including the optional weak
# prefix and the surrounding double quotes.
ETAG_MATCH = re.compile(r'''
    \A(      # start of string and capture group
    (?:W/)?  # optional weak indicator
    "        # opening quote
    [^"]*    # any sequence of non-quote characters
    "        # end quote
    )\Z      # end of string and capture group
''', re.X)

# Lower-case month abbreviations in calendar order.
MONTHS = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

# Named-group fragments from which the three HTTP date patterns are built.
__D = r'(?P<day>\d{2})'        # two-digit day
__D2 = r'(?P<day>[ \d]\d)'     # day whose first position may be a space
__M = r'(?P<mon>\w{3})'        # three-character month name
__Y = r'(?P<year>\d{4})'       # four-digit year
__Y2 = r'(?P<year>\d{2})'      # two-digit year
__T = r'(?P<hour>\d{2}):(?P<min>\d{2}):(?P<sec>\d{2})'

# e.g. "Sun, 06 Nov 1994 08:49:37 GMT"
RFC1123_DATE = re.compile(
    r'^\w{3}, %s %s %s %s GMT$' % (__D, __M, __Y, __T)
)
# e.g. "Sunday, 06-Nov-94 08:49:37 GMT"
RFC850_DATE = re.compile(
    r'^\w{6,9}, %s-%s-%s %s GMT$' % (__D, __M, __Y2, __T)
)
# e.g. "Sun Nov  6 08:49:37 1994"
ASCTIME_DATE = re.compile(
    r'^\w{3} %s %s %s %s$' % (__M, __D2, __T, __Y)
)

# Delimiter character sets named after RFC 3986.
RFC3986_GENDELIMS = ":/?#[]@"
RFC3986_SUBDELIMS = "!$&'()*+,;="

# Query-string fields are separated by either "&" or ";".
FIELDS_MATCH = re.compile('[&;]')
-
-
@keep_lazy_text
def urlquote(url, safe='/'):
    """
    Percent-encode ``url`` via urllib.parse.quote(), leaving the characters
    listed in ``safe`` unescaped.

    Legacy compatibility wrapper (it was used for unicode handling on
    Python 2).
    """
    quoted = quote(url, safe)
    return quoted
-
-
@keep_lazy_text
def urlquote_plus(url, safe=''):
    """
    Encode ``url`` via urllib.parse.quote_plus(), leaving the characters
    listed in ``safe`` unescaped.

    Legacy compatibility wrapper (it was used for unicode handling on
    Python 2).
    """
    quoted = quote_plus(url, safe)
    return quoted
-
-
@keep_lazy_text
def urlunquote(quoted_url):
    """
    Decode ``quoted_url`` via urllib.parse.unquote().

    Legacy compatibility wrapper (it was used for unicode handling on
    Python 2).
    """
    decoded = unquote(quoted_url)
    return decoded
-
-
@keep_lazy_text
def urlunquote_plus(quoted_url):
    """
    Decode ``quoted_url`` via urllib.parse.unquote_plus().

    Legacy compatibility wrapper (it was used for unicode handling on
    Python 2).
    """
    decoded = unquote_plus(quoted_url)
    return decoded
-
-
def urlencode(query, doseq=False):
    """
    Variant of urllib.parse.urlencode() that also accepts MultiValueDict
    instances and non-string values.

    ``query`` may be a MultiValueDict, any object exposing ``items()``, or an
    iterable of ``(key, value)`` pairs. ``doseq`` is forwarded unchanged to
    urllib.parse.urlencode().

    Raise TypeError if a value, or an element of an iterable value, is None.
    """
    none_message = (
        'Cannot encode None in a query string. Did you mean to pass an '
        'empty string or omit the value?'
    )

    if isinstance(query, MultiValueDict):
        pairs = query.lists()
    elif hasattr(query, 'items'):
        pairs = query.items()
    else:
        pairs = query

    prepared = []
    for key, value in pairs:
        if value is None:
            raise TypeError(none_message)
        if isinstance(value, (str, bytes)):
            # Strings and bytes are iterable but must be passed through whole.
            prepared.append((key, value))
            continue
        try:
            elements = iter(value)
        except TypeError:
            # Not iterable: hand the value over untouched.
            prepared.append((key, value))
            continue
        # Consume generators and iterators, even when doseq=True, to
        # work around https://bugs.python.org/issue31706.
        materialized = []
        for element in elements:
            if element is None:
                raise TypeError(none_message)
            materialized.append(
                element if isinstance(element, bytes) else str(element)
            )
        prepared.append((key, materialized))
    return original_urlencode(prepared, doseq)
-
-
def cookie_date(epoch_seconds=None):
    """
    Format the time to ensure compatibility with Netscape's cookie standard.

    ``epoch_seconds`` is a floating point number of seconds since the epoch,
    in UTC, such as the value returned by time.time(). None means the current
    time.

    Emit a RemovedInDjango30Warning on every call; http_date() is the
    replacement.

    Return a string in the format 'Wdy, DD-Mon-YYYY HH:MM:SS GMT'.
    """
    warnings.warn(
        'cookie_date() is deprecated in favor of http_date(), which follows '
        'the format of the latest RFC.',
        RemovedInDjango30Warning, stacklevel=2,
    )
    rfcdate = formatdate(epoch_seconds)
    # Slice the "Wdy, DD Mon YYYY HH:MM:SS ..." string into its three
    # space-separated chunks and rejoin them with dashes.
    weekday_and_day = rfcdate[:7]
    month = rfcdate[8:11]
    year_and_time = rfcdate[12:25]
    return '-'.join((weekday_and_day, month, year_and_time)) + ' GMT'
-
-
def http_date(epoch_seconds=None):
    """
    Format the time to match the RFC1123 date format as specified by HTTP
    RFC7231 section 7.1.1.1.

    ``epoch_seconds`` is a floating point number of seconds since the epoch,
    in UTC, such as the value returned by time.time(). None means the current
    time.

    Return a string in the format 'Wdy, DD Mon YYYY HH:MM:SS GMT'.
    """
    formatted = formatdate(timeval=epoch_seconds, usegmt=True)
    return formatted
-
-
def parse_http_date(date):
    """
    Parse a date format as specified by HTTP RFC7231 section 7.1.1.1.

    The three formats allowed by the RFC are accepted, even if only the first
    one is still in widespread use.

    Two-digit years (RFC850 format) are resolved relative to the current UTC
    year, as the RFC requires: a year that would lie more than 50 years in
    the future is taken to be the most recent past year with the same last
    two digits.

    Return an integer expressed in seconds since the epoch, in UTC.

    Raise ValueError if ``date`` matches none of the formats or if its fields
    don't form a valid date.
    """
    # email.utils.parsedate() does the job for RFC1123 dates; unfortunately
    # RFC7231 makes it mandatory to support RFC850 dates too. So we roll
    # our own RFC-compliant parsing.
    for regex in (RFC1123_DATE, RFC850_DATE, ASCTIME_DATE):
        m = regex.match(date)
        if m is not None:
            break
    else:
        raise ValueError("%r is not in a valid HTTP date format" % date)
    try:
        year = int(m.group('year'))
        if year < 100:
            current_year = datetime.datetime.now(datetime.timezone.utc).year
            year_in_century = current_year % 100
            current_century = current_year - year_in_century
            if year - year_in_century > 50:
                # More than 50 years ahead: interpret as the previous century.
                year += current_century - 100
            else:
                year += current_century
        month = MONTHS.index(m.group('mon').lower()) + 1
        day = int(m.group('day'))
        hour = int(m.group('hour'))
        minute = int(m.group('min'))
        second = int(m.group('sec'))
        result = datetime.datetime(year, month, day, hour, minute, second)
        return calendar.timegm(result.utctimetuple())
    except Exception as exc:
        # Unknown month name, out-of-range field, etc.
        raise ValueError("%r is not a valid date" % date) from exc
-
-
def parse_http_date_safe(date):
    """
    Lenient variant of parse_http_date(): return the parsed timestamp, or
    None when parsing raises any Exception.
    """
    try:
        timestamp = parse_http_date(date)
    except Exception:
        # Deliberately best-effort: invalid input is reported as None.
        return None
    return timestamp
-
-
- # Base 36 functions: useful for generating compact URLs
-
def base36_to_int(s):
    """
    Convert a base 36 string to an int.

    Raise ValueError if the input is longer than 13 characters or is not
    accepted by ``int(s, 36)``.
    """
    # To prevent overconsumption of server resources, only strings of at most
    # 13 base36 digits are converted (13 digits is sufficient to
    # base36-encode any 64-bit integer).
    if len(s) <= 13:
        return int(s, 36)
    raise ValueError("Base36 input too large")
-
-
def int_to_base36(i):
    """
    Convert a non-negative integer to its lower-case base36 string.

    Raise ValueError if ``i`` is negative.
    """
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    if i < 0:
        raise ValueError("Negative base36 conversion input.")
    if i < 36:
        # Single digit, which also covers zero.
        return digits[i]
    # Collect digits least-significant first, then flip them.
    collected = []
    while i != 0:
        i, remainder = divmod(i, 36)
        collected.append(digits[remainder])
    return ''.join(reversed(collected))
-
-
def urlsafe_base64_encode(s):
    """
    Encode a bytestring with the URL-safe base64 alphabet and return the
    result as a str with trailing newlines and equal signs stripped.
    """
    encoded = base64.urlsafe_b64encode(s)
    trimmed = encoded.rstrip(b'\n=')
    return trimmed.decode('ascii')
-
-
def urlsafe_base64_decode(s):
    """
    Decode a URL-safe base64 encoded string, adding back any trailing equal
    signs that might have been stripped.

    ``s`` is a str; the decoded payload is returned as bytes.

    Raise ValueError (chained to the underlying error) if the input cannot be
    decoded.
    """
    s = s.encode()
    # Restore exactly the padding needed to reach a multiple of four
    # characters (0-3 equal signs), rather than relying on the decoder
    # tolerating surplus padding.
    padded = s + b'=' * (-len(s) % 4)
    try:
        return base64.urlsafe_b64decode(padded)
    except (LookupError, BinasciiError) as e:
        raise ValueError(e) from e
-
-
def parse_etags(etag_str):
    """
    Parse a string of ETags given in an If-None-Match or If-Match header as
    defined by RFC 7232.

    Return ['*'] if the header is the wildcard; otherwise return the list of
    quoted ETags that match ETAG_MATCH, silently dropping the others.
    """
    if etag_str.strip() == '*':
        return ['*']
    # Check each comma-separated candidate on its own and keep the valid ones.
    valid_etags = []
    for candidate in etag_str.split(','):
        match = ETAG_MATCH.match(candidate.strip())
        if match:
            valid_etags.append(match.group(1))
    return valid_etags
-
-
def quote_etag(etag_str):
    """
    Return ``etag_str`` unchanged when it already matches ETAG_MATCH (a quoted
    strong or weak ETag); otherwise wrap it in double quotes, making it a
    strong ETag.
    """
    already_quoted = ETAG_MATCH.match(etag_str)
    return etag_str if already_quoted else '"%s"' % etag_str
-
-
def is_same_domain(host, pattern):
    """
    Return ``True`` if the host is either an exact match or a match
    to the wildcard pattern.

    Any pattern beginning with a period matches a domain and all of its
    subdomains (e.g. ``.example.com`` matches ``example.com`` and
    ``foo.example.com``). Anything else is an exact string match.

    Only ``pattern`` is lower-cased before comparing; ``host`` is used as
    given. An empty pattern never matches.
    """
    if not pattern:
        return False

    wanted = pattern.lower()
    if wanted[0] == '.':
        # Wildcard: accept any subdomain as well as the bare domain itself.
        if host.endswith(wanted) or host == wanted[1:]:
            return True
    return wanted == host
-
-
def is_safe_url(url, allowed_hosts, require_https=False):
    """
    Return ``True`` if the url is a safe redirection (i.e. it doesn't point to
    a different host and uses a safe scheme).

    Always return ``False`` on an empty url.

    ``allowed_hosts`` may be None (no host allowed), a single host string, or
    a collection of hosts.

    If ``require_https`` is ``True``, only 'https' will be considered a valid
    scheme, as opposed to 'http' and 'https' with the default, ``False``.
    """
    if url is not None:
        url = url.strip()
    if not url:
        return False

    if allowed_hosts is None:
        allowed_hosts = set()
    elif isinstance(allowed_hosts, str):
        allowed_hosts = {allowed_hosts}

    # Chrome treats \ completely as / in paths but it could be part of some
    # basic auth credentials so we need to check both URLs.
    variants = (url, url.replace('\\', '/'))
    return all(
        _is_safe_url(variant, allowed_hosts, require_https=require_https)
        for variant in variants
    )
-
-
# Copied from urllib.parse.urlparse() but uses fixed urlsplit() function.
def _urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = _urlsplit(
        url, scheme, allow_fragments
    )
    params = ''
    # Only schemes listed in uses_params carry ";params" on the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
-
-
- # Copied from urllib.parse.urlsplit() with
- # https://github.com/python/cpython/pull/661 applied.
- def _urlsplit(url, scheme='', allow_fragments=True):
- """Parse a URL into 5 components:
- <scheme>://<netloc>/<path>?<query>#<fragment>
- Return a 5-tuple: (scheme, netloc, path, query, fragment).
- Note that we don't break the components up in smaller bits
- (e.g. netloc is a single string) and we don't expand % escapes."""
- url, scheme, _coerce_result = _coerce_args(url, scheme)
- netloc = query = fragment = ''
- i = url.find(':')
- if i > 0:
- for c in url[:i]:
- if c not in scheme_chars:
- break
- else:
- scheme, url = url[:i].lower(), url[i + 1:]
-
- if url[:2] == '//':
- netloc, url = _splitnetloc(url, 2)
- if (('[' in netloc and ']' not in netloc) or
- (']' in netloc and '[' not in netloc)):
- raise ValueError("Invalid IPv6 URL")
- if allow_fragments and '#' in url:
- url, fragment = url.split('#', 1)
- if '?' in url:
- url, query = url.split('?', 1)
- v = SplitResult(scheme, netloc, url, query, fragment)
- return _coerce_result(v)
-
-
def _is_safe_url(url, allowed_hosts, require_https=False):
    """
    Return True if ``url`` stays on one of ``allowed_hosts`` (or has no host
    at all) and uses no scheme other than http/https (https only when
    ``require_https`` is true).
    """
    # Chrome considers any URL with more than two slashes to be absolute, but
    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
    if url.startswith('///'):
        return False
    try:
        parts = _urlparse(url)
    except ValueError:  # e.g. invalid IPv6 addresses
        return False
    netloc, scheme = parts.netloc, parts.scheme
    # Forbid URLs like http:///example.com - with a scheme, but without a
    # hostname. In that URL, example.com is not the hostname but a path
    # component. However, Chrome will still consider example.com to be the
    # hostname, so we must not allow this syntax.
    if scheme and not netloc:
        return False
    # Forbid URLs that start with control characters. Some browsers (like
    # Chrome) ignore quite a few control characters at the start of a
    # URL and might consider the URL as scheme relative.
    if unicodedata.category(url[0])[0] == 'C':
        return False
    # Consider URLs without a scheme (e.g. //example.com/p) to be http.
    if netloc and not scheme:
        scheme = 'http'
    if netloc and netloc not in allowed_hosts:
        return False
    if not scheme:
        return True
    valid_schemes = ['https'] if require_https else ['http', 'https']
    return scheme in valid_schemes
-
-
def limited_parse_qsl(qs, keep_blank_values=False, encoding='utf-8',
                      errors='replace', fields_limit=None):
    """
    Return a list of key/value tuples parsed from query string.

    Copied from urlparse with an additional "fields_limit" argument.
    Copyright (C) 2013 Python Software Foundation (see LICENSE.python).

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value indicates that blanks should be retained as blank
        strings. The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    fields_limit: maximum number of fields parsed or an exception
        is raised. None means no limit and is the default.
    """
    if fields_limit:
        # Splitting at most fields_limit times yields fields_limit + 1 pieces
        # exactly when the query holds more separators than allowed.
        pairs = FIELDS_MATCH.split(qs, fields_limit)
        if len(pairs) > fields_limit:
            raise TooManyFieldsSent(
                'The number of GET/POST parameters exceeded '
                'settings.DATA_UPLOAD_MAX_NUMBER_FIELDS.'
            )
    else:
        pairs = FIELDS_MATCH.split(qs)

    parsed = []
    for pair in pairs:
        if not pair:
            continue
        # A control-name with no equal sign partitions to an empty value, so
        # it is kept only when blank values are requested.
        raw_name, _, raw_value = pair.partition('=')
        if not (raw_value or keep_blank_values):
            continue
        name = unquote(
            raw_name.replace('+', ' '), encoding=encoding, errors=errors
        )
        value = unquote(
            raw_value.replace('+', ' '), encoding=encoding, errors=errors
        )
        parsed.append((name, value))
    return parsed
-
-
def escape_leading_slashes(url):
    """
    If ``url`` starts with two slashes, return it with the second slash
    percent-encoded as ``%2F`` so that browsers don't treat the path as
    schemaless and redirect to another host. Any other ``url`` is returned
    unchanged.
    """
    if not url.startswith('//'):
        return url
    return '/%2F{}'.format(url[2:])
|