"""Parse (absolute and relative) URLs.
urlparse module is based upon the following RFC specifications.
RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.
RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
and L.Masinter, December 1999.
RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.
RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it. The urlparse module is
currently not entirely compliant with this RFC due to defacto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The testcases in
test_urlparse.py provides a good indicator of parsing behavior.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
"urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
'wais', 'file', 'https', 'shttp', 'mms',
'prospero', 'rtsp', 'rtspu', '', 'sftp',
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
'imap', 'wais', 'file', 'mms', 'https', 'shttp',
'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
'mms', '', 'sftp', 'tel']
# These are not actually used anymore, but should stay for backwards
# compatibility. (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
'nntp', 'wais', 'https', 'shttp', 'snews',
# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
"""Clear the parse cache."""
class ResultMixin(object):
"""Shared methods for the parsed result objects."""
userinfo = netloc.rsplit("@", 1)[0]
userinfo = userinfo.split(":", 1)[0]
userinfo = netloc.rsplit("@", 1)[0]
return userinfo.split(":", 1)[1]
netloc = self.netloc.split('@')[-1]
if '[' in netloc and ']' in netloc:
return netloc.split(']')[0][1:].lower()
return netloc.split(':')[0].lower()
netloc = self.netloc.split('@')[-1].split(']')[-1]
port = netloc.split(':')[1]
from collections import namedtuple
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
def urlparse(url, scheme='', allow_fragments=True):
"""Parse a URL into 6 components:
<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
Note that we don't break the components up in smaller bits
(e.g. netloc is a single string) and we don't expand % escapes."""
tuple = urlsplit(url, scheme, allow_fragments)
scheme, netloc, url, query, fragment = tuple
if scheme in uses_params and ';' in url:
url, params = _splitparams(url)
return ParseResult(scheme, netloc, url, params, query, fragment)
i = url.find(';', url.rfind('/'))
return url[:i], url[i+1:]
def _splitnetloc(url, start=0):
delim = len(url) # position of end of domain part of url, default is end
for c in '/?#': # look for delimiters; the order is NOT important
wdelim = url.find(c, start) # find first of this delim
if wdelim >= 0: # if found
delim = min(delim, wdelim) # use earliest delim position
return url[start:delim], url[delim:] # return (domain, rest)
def _checknetloc(netloc):
if not netloc or not isinstance(netloc, unicode):
# looking for characters like \u2100 that expand to 'a/c'
# IDNA uses NFKC equivalence, so normalize for this check
n = netloc.replace(u'@', u'') # ignore characters already included
n = n.replace(u':', u'') # but not the surrounding text
netloc2 = unicodedata.normalize('NFKC', n)
raise ValueError("netloc %r contains invalid characters "
"under NFKC normalization"
def urlsplit(url, scheme='', allow_fragments=True):
"""Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>
Return a 5-tuple: (scheme, netloc, path, query, fragment).
Note that we don't break the components up in smaller bits
(e.g. netloc is a single string) and we don't expand % escapes."""
allow_fragments = bool(allow_fragments)
key = url, scheme, allow_fragments, type(url), type(scheme)
cached = _parse_cache.get(key, None)
if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
netloc = query = fragment = ''
if url[:i] == 'http': # optimize the common case
netloc, url = _splitnetloc(url, 2)
if (('[' in netloc and ']' not in netloc) or
(']' in netloc and '[' not in netloc)):
raise ValueError("Invalid IPv6 URL")
if allow_fragments and '#' in url:
url, fragment = url.split('#', 1)
url, query = url.split('?', 1)
v = SplitResult(scheme, netloc, url, query, fragment)
if c not in scheme_chars:
# make sure "url" is not actually a port number (in which case
# "scheme" is really part of the path)
if not rest or any(c not in '0123456789' for c in rest):
scheme, url = url[:i].lower(), rest
netloc, url = _splitnetloc(url, 2)
if (('[' in netloc and ']' not in netloc) or
(']' in netloc and '[' not in netloc)):
raise ValueError("Invalid IPv6 URL")
if allow_fragments and '#' in url:
url, fragment = url.split('#', 1)
url, query = url.split('?', 1)
v = SplitResult(scheme, netloc, url, query, fragment)
"""Put a parsed URL back together again. This may result in a
slightly different, but equivalent URL, if the URL that was parsed
originally had redundant delimiters, e.g. a ? with an empty query
(the draft states that these are equivalent)."""
scheme, netloc, url, params, query, fragment = data
url = "%s;%s" % (url, params)
return urlunsplit((scheme, netloc, url, query, fragment))
"""Combine the elements of a tuple as returned by urlsplit() into a
complete URL as a string. The data argument can be any five-item iterable.
This may result in a slightly different, but equivalent URL, if the URL that
was parsed originally had unnecessary delimiters (for example, a ? with an
empty query; the RFC states that these are equivalent)."""
scheme, netloc, url, query, fragment = data
if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
if url and url[:1] != '/': url = '/' + url
url = '//' + (netloc or '') + url
url = url + '#' + fragment
def urljoin(base, url, allow_fragments=True):
"""Join a base URL and a possibly relative URL to form an absolute
interpretation of the latter."""
bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
urlparse(base, '', allow_fragments)
scheme, netloc, path, params, query, fragment = \
urlparse(url, bscheme, allow_fragments)
if scheme != bscheme or scheme not in uses_relative:
if scheme in uses_netloc:
return urlunparse((scheme, netloc, path,
params, query, fragment))
return urlunparse((scheme, netloc, path,
params, query, fragment))
if not path and not params:
return urlunparse((scheme, netloc, path,
params, query, fragment))
segments = bpath.split('/')[:-1] + path.split('/')
# XXX The stuff below is bogus in various ways...
and segments[i-1] not in ('', '..')):
if segments == ['', '..']:
elif len(segments) >= 2 and segments[-1] == '..':
return urlunparse((scheme, netloc, '/'.join(segments),
params, query, fragment))
"""Removes any existing fragment from URL.
Returns a tuple of the defragmented URL and the fragment. If
the URL contained no fragments, the second element is the
s, n, p, a, q, frag = urlparse(url)
defrag = urlunparse((s, n, p, a, q, ''))
return isinstance(x, unicode)
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin). If you update this function,
# update it also in urllib. This code duplication does not existin in Python3.
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b,16)))
for a in _hexdig for b in _hexdig)
_asciire = re.compile('([\x00-\x7f]+)')
"""unquote('abc%20def') -> 'abc def'."""
for i in range(1, len(bits), 2):
append(unquote(str(bits[i])).decode('latin1'))
append(_hextochr[item[:2]])
def parse_qs(qs, keep_blank_values=0, strict_parsing=0, max_num_fields=None):
"""Parse a query given as a string argument.
qs: percent-encoded query string to be parsed
keep_blank_values: flag indicating whether blank values in
percent-encoded queries should be treated as blank strings.
A true value indicates that blanks should be retained as
blank strings. The default false value indicates that
blank values are to be ignored and treated as if they were
strict_parsing: flag indicating what to do with parsing errors.
If false (the default), errors are silently ignored.
If true, errors raise a ValueError exception.
max_num_fields: int. If set, then throws a ValueError if there
are more than n fields read by parse_qsl().
for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0, max_num_fields=None):
"""Parse a query given as a string argument.
qs: percent-encoded query string to be parsed
keep_blank_values: flag indicating whether blank values in
percent-encoded queries should be treated as blank strings. A
true value indicates that blanks should be retained as blank
strings. The default false value indicates that blank values
are to be ignored and treated as if they were not included.
strict_parsing: flag indicating what to do with parsing errors. If
false (the default), errors are silently ignored. If true,
errors raise a ValueError exception.
max_num_fields: int. If set, then throws a ValueError if there
are more than n fields read by parse_qsl().
Returns a list, as G-d intended.
# If max_num_fields is defined then check that the number of fields
# is less than max_num_fields. This prevents a memory exhaustion DOS
# attack via post bodies with many fields.
if max_num_fields is not None:
num_fields = 1 + qs.count('&') + qs.count(';')
if max_num_fields < num_fields:
raise ValueError('Max number of fields exceeded')
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
if not name_value and not strict_parsing:
nv = name_value.split('=', 1)
raise ValueError, "bad query field: %r" % (name_value,)
# Handle case of a control-name with no equal sign
if len(nv[1]) or keep_blank_values:
name = unquote(nv[0].replace('+', ' '))
value = unquote(nv[1].replace('+', ' '))