Edit File by line

"""Parse (absolute and relative) URLs.

[0] Fix | Delete

[1] Fix | Delete

urlparse module is based upon the following RFC specifications.

[2] Fix | Delete

[3] Fix | Delete

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding

[4] Fix | Delete

and L. Masinter, January 2005.

[5] Fix | Delete

[6] Fix | Delete

RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter

[7] Fix | Delete

and L.Masinter, December 1999.

[8] Fix | Delete

[9] Fix | Delete

RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.

[10] Fix | Delete

Berners-Lee, R. Fielding, and L. Masinter, August 1998.

[11] Fix | Delete

[12] Fix | Delete

RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.

[13] Fix | Delete

[14] Fix | Delete

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June

[15] Fix | Delete

1995.

[16] Fix | Delete

[17] Fix | Delete

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.

[18] Fix | Delete

McCahill, December 1994

[19] Fix | Delete

[20] Fix | Delete

RFC 3986 is considered the current standard and any future changes to

[21] Fix | Delete

urlparse module should conform with it. The urlparse module is

[22] Fix | Delete

currently not entirely compliant with this RFC due to de facto

[23] Fix | Delete

scenarios for parsing, and for backward compatibility purposes, some

[24] Fix | Delete

parsing quirks from older RFCs are retained. The testcases in

[25] Fix | Delete

test_urlparse.py provide a good indicator of parsing behavior.

[26] Fix | Delete

[27] Fix | Delete

The WHATWG URL Parser spec should also be considered. We are not compliant with

[28] Fix | Delete

it either due to existing user code API behavior expectations (Hyrum's Law).

[29] Fix | Delete

It serves as a useful guide when making changes.

[30] Fix | Delete

"""

[31] Fix | Delete

[32] Fix | Delete

import re

[33] Fix | Delete

import os

[34] Fix | Delete

import sys

[35] Fix | Delete

import collections

[36] Fix | Delete

[37] Fix | Delete

# Names exported by ``from <module> import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

[43] Fix | Delete

[44] Fix | Delete

# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '

# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}

[92] Fix | Delete

[93] Fix | Delete

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()

[97] Fix | Delete

[98] Fix | Delete

[99] Fix | Delete

# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    """Return *obj* unchanged (result coercion for str inputs)."""
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Return ``obj.encode(encoding, errors)`` (result coercion for non-str inputs)."""
    return obj.encode(encoding, errors)

def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Decode every item of *args*; falsy items become the empty str."""
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

def _coerce_args(*args):
    """Normalise *args* to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    ``_noop`` when the first argument is a str, ``_encode_result``
    otherwise.  Raises TypeError when a non-empty later argument
    disagrees with the first argument about being a str.
    """
    # Invokes decode if necessary to create str args
    # and returns the coerced inputs along with
    # an appropriate result coercion function
    #   - noop for str inputs
    #   - encoding function otherwise
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)

[134] Fix | Delete

[135] Fix | Delete

# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        # Encode each item of this tuple and rebuild it as the class stored
        # in ``_encoded_counterpart`` (attached to the class elsewhere).
        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))

[142] Fix | Delete

[143] Fix | Delete

[144] Fix | Delete

class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        # Decode each item of this tuple and rebuild it as the class stored
        # in ``_decoded_counterpart`` (attached to the class elsewhere).
        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))

[150] Fix | Delete

[151] Fix | Delete

[152] Fix | Delete

class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    # Subclasses supply ``_userinfo`` -> (username, password) and
    # ``_hostinfo`` -> (hostname, port); both work on str or bytes.

    @property
    def username(self):
        return self._userinfo[0]

    @property
    def password(self):
        return self._userinfo[1]

    @property
    def hostname(self):
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Return the port as an int, or None when no port is present.

        Raises ValueError when the port is not made only of ASCII digits
        or falls outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is not None:
            # int() alone also accepts signs, surrounding whitespace,
            # underscores and non-ASCII digits, none of which are valid in
            # a URL port, so insist on ASCII digits first.
            digits = '0123456789' if isinstance(port, str) else b'0123456789'
            if port.strip(digits):
                raise ValueError(
                    "Port could not be cast to integer value as %r" % (port,))
            port = int(port, 10)
            if not (0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port

[183] Fix | Delete

[184] Fix | Delete

[185] Fix | Delete

class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    # str flavour of the netloc helpers: splits ``self.netloc`` with str
    # delimiters.
    __slots__ = ()

    @property
    def _userinfo(self):
        # (username, password) taken from the text before the last '@';
        # either item is None when it is absent.
        netloc = self.netloc
        userinfo, have_info, _ = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        # (hostname, port) taken from the text after the last '@'.  A
        # bracketed host (IPv6 literal) is returned without its brackets;
        # port is None when missing or empty.
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port

[213] Fix | Delete

[214] Fix | Delete

[215] Fix | Delete

class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    # bytes flavour of the netloc helpers: splits ``self.netloc`` with
    # bytes delimiters.
    __slots__ = ()

    @property
    def _userinfo(self):
        # (username, password) taken from the bytes before the last b'@';
        # either item is None when it is absent.
        netloc = self.netloc
        userinfo, have_info, _ = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        # (hostname, port) taken from the bytes after the last b'@'.  A
        # bracketed host (IPv6 literal) is returned without its brackets;
        # port is None when missing or empty.
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port

[243] Fix | Delete

[244] Fix | Delete

[245] Fix | Delete

from collections import namedtuple

# Plain namedtuple bases; the public result classes combine these with
# the str/bytes mixins.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field documentation.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__

[312] Fix | Delete

[313] Fix | Delete

[314] Fix | Delete

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

[318] Fix | Delete

[319] Fix | Delete

# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()
    def geturl(self):
        # Re-attach the fragment only when there is one.
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url

[327] Fix | Delete

[328] Fix | Delete

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        # Reassemble the URL from this 5-tuple.
        return urlunsplit(self)

[332] Fix | Delete

[333] Fix | Delete

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        # Reassemble the URL from this 6-tuple.
        return urlunparse(self)

[337] Fix | Delete

[338] Fix | Delete

# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        # Re-attach the fragment only when there is one.
        if self.fragment:
            return self.url + b'#' + self.fragment
        else:
            return self.url

[346] Fix | Delete

[347] Fix | Delete

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        # Reassemble the URL from this 5-tuple.
        return urlunsplit(self)

[351] Fix | Delete

[352] Fix | Delete

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        # Reassemble the URL from this 6-tuple.
        return urlunparse(self)

[356] Fix | Delete

[357] Fix | Delete

# Set up the encode/decode result pairs
def _fix_result_transcoding():
    # Link each str result class to its bytes twin (and back) so that
    # encode()/decode() on the mixins know which class to build.
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

_fix_result_transcoding()
del _fix_result_transcoding

[370] Fix | Delete

[371] Fix | Delete

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    # Params are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)

[386] Fix | Delete

[387] Fix | Delete

def _splitparams(url):
    """Split *url* into ``(path, params)`` at the relevant ';'.

    When *url* contains a '/', only a ';' at or after the last '/' counts,
    so params belong to the last path segment.  Returns ``(url, '')`` when
    no such ';' exists.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    # Guard both branches: slicing with i == -1 would drop the last
    # character of the path and return the whole url as params.
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]

[395] Fix | Delete

[396] Fix | Delete

def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)``: *url* from *start* up to the first of
    '/', '?' or '#', and everything from that delimiter onward."""
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

[403] Fix | Delete

[404] Fix | Delete

def _checknetloc(netloc):
    """Reject a netloc that gains URL delimiters under NFKC normalization.

    Returns None when *netloc* is empty, pure ASCII, or unchanged by
    normalization.  Raises ValueError when the normalized text contains
    any of '/', '?', '#', '@' or ':'.
    """
    if not netloc or not any(ord(c) > 127 for c in netloc):
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    # ignore characters already included, but not the surrounding text
    n = netloc
    for existing in '@:#?':
        n = n.replace(existing, '')
    netloc2 = unicodedata.normalize('NFKC', n)
    if n == netloc2:
        return
    for c in '/?#@:':
        if c in netloc2:
            raise ValueError("netloc '" + netloc + "' contains invalid " +
                             "characters under NFKC normalization")

[421] Fix | Delete

[422] Fix | Delete

def _remove_unsafe_bytes_from_url(url):
    """Return *url* with every entry of _UNSAFE_URL_BYTES_TO_REMOVE deleted."""
    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(b, "")
    return url

[426] Fix | Delete

[427] Fix | Delete

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Raises ValueError for a netloc with unbalanced '[' / ']' or one that
    _checknetloc() rejects."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    url = _remove_unsafe_bytes_from_url(url)
    scheme = _remove_unsafe_bytes_from_url(scheme)
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
    scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key)
    if cached is not None:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        prefix, rest = url[:i], url[i+1:]
        if prefix == 'http':
            # Common case: always a scheme, even when only digits follow.
            scheme, url = prefix, rest
        elif all(c in scheme_chars for c in prefix):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = prefix.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)

[490] Fix | Delete

[491] Fix | Delete

def urlunparse(components):

[492] Fix | Delete

"""Put a parsed URL back together again. This may result in a

[493] Fix | Delete

slightly different, but equivalent URL, if the URL that was parsed

[494] Fix | Delete

originally had redundant delimiters, e.g. a ? with an empty query

[495] Fix | Delete

(the draft states that these are equivalent)."""

[496] Fix | Delete

scheme, netloc, url, params, query, fragment, _coerce_result = (

[497] Fix | Delete

_coerce_args(*components))

[498] Fix | Delete

if params:

[499] Fix | Delete

12 3