Edit File by line

"""Parse (absolute and relative) URLs.

[0] Fix | Delete

[1] Fix | Delete

urlparse module is based upon the following RFC specifications.

[2] Fix | Delete

[3] Fix | Delete

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding

[4] Fix | Delete

and L. Masinter, January 2005.

[5] Fix | Delete

[6] Fix | Delete

RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter

[7] Fix | Delete

and L.Masinter, December 1999.

[8] Fix | Delete

[9] Fix | Delete

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.

[10] Fix | Delete

Berners-Lee, R. Fielding, and L. Masinter, August 1998.

[11] Fix | Delete

[12] Fix | Delete

RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.

[13] Fix | Delete

[14] Fix | Delete

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June

[15] Fix | Delete

1995.

[16] Fix | Delete

[17] Fix | Delete

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.

[18] Fix | Delete

McCahill, December 1994

[19] Fix | Delete

[20] Fix | Delete

RFC 3986 is considered the current standard and any future changes to

[21] Fix | Delete

urlparse module should conform with it. The urlparse module is

[22] Fix | Delete

currently not entirely compliant with this RFC due to de facto

[23] Fix | Delete

scenarios for parsing, and for backward compatibility purposes, some

[24] Fix | Delete

parsing quirks from older RFCs are retained. The test cases in

[25] Fix | Delete

test_urlparse.py provide a good indicator of parsing behavior.

[26] Fix | Delete

[27] Fix | Delete

The WHATWG URL Parser spec should also be considered. We are not compliant with

[28] Fix | Delete

it either due to existing user code API behavior expectations (Hyrum's Law).

[29] Fix | Delete

It serves as a useful guide when making changes.

[30] Fix | Delete

"""

[31] Fix | Delete

[32] Fix | Delete

import re

[33] Fix | Delete

import os

[34] Fix | Delete

import sys

[35] Fix | Delete

import collections

[36] Fix | Delete

import warnings

[37] Fix | Delete

[38] Fix | Delete

# Names exported as this module's public API.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

[44] Fix | Delete

[45] Fix | Delete

# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by “urlsplit” and “urlparse”.

uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# urlparse() only splits ";params" off the path for these schemes.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
# (urlsplit() tests each character before the first ':' against this string).
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

[82] Fix | Delete

[83] Fix | Delete

# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '

# Unsafe bytes to be removed per WHATWG spec
# (urlsplit() deletes these from anywhere in the url and the scheme).
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoizes its results in _parse_cache and calls clear_cache()
# once the dict already holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}

[93] Fix | Delete

[94] Fix | Delete

def clear_cache():
    """Empty the module-level parse cache and the quoters cache in place."""
    # Parse cache first, then the quoters cache, matching the original order.
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()

[98] Fix | Delete

[99] Fix | Delete

[100] Fix | Delete

# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These are the default codec and error handler of the _encode_result()
# and _decode_args() helpers.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

[108] Fix | Delete

[109] Fix | Delete

def _noop(obj):

[110] Fix | Delete

return obj

[111] Fix | Delete

[112] Fix | Delete

def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Encode *obj* with the given codec and error handler.

    Defaults come from the module-level implicit encoding settings.
    """
    encoded = obj.encode(encoding, errors)
    return encoded

[115] Fix | Delete

[116] Fix | Delete

def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Decode every item of *args* and return the results as a tuple.

    Falsy items (empty bytes, '', None) become the empty str instead of
    being decoded.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)

[119] Fix | Delete

[120] Fix | Delete

def _coerce_args(*args):

[121] Fix | Delete

# Invokes decode if necessary to create str args

[122] Fix | Delete

# and returns the coerced inputs along with

[123] Fix | Delete

# an appropriate result coercion function

[124] Fix | Delete

# - noop for str inputs

[125] Fix | Delete

# - encoding function otherwise

[126] Fix | Delete

str_input = isinstance(args[0], str)

[127] Fix | Delete

for arg in args[1:]:

[128] Fix | Delete

# We special-case the empty string to support the

[129] Fix | Delete

# "scheme=''" default argument to some functions

[130] Fix | Delete

if arg and isinstance(arg, str) != str_input:

[131] Fix | Delete

raise TypeError("Cannot mix str and non-str arguments")

[132] Fix | Delete

if str_input:

[133] Fix | Delete

return args + (_noop,)

[134] Fix | Delete

return _decode_args(args) + (_encode_result,)

[135] Fix | Delete

[136] Fix | Delete

# Result objects are more helpful than simple tuples

[137] Fix | Delete

class _ResultMixinStr(object):

[138] Fix | Delete

"""Standard approach to encoding parsed results from str to bytes"""

[139] Fix | Delete

__slots__ = ()

[140] Fix | Delete

[141] Fix | Delete

def encode(self, encoding='ascii', errors='strict'):

[142] Fix | Delete

return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))

[143] Fix | Delete

[144] Fix | Delete

[145] Fix | Delete

class _ResultMixinBytes(object):

[146] Fix | Delete

"""Standard approach to decoding parsed results from bytes to str"""

[147] Fix | Delete

__slots__ = ()

[148] Fix | Delete

[149] Fix | Delete

def decode(self, encoding='ascii', errors='strict'):

[150] Fix | Delete

return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))

[151] Fix | Delete

[152] Fix | Delete

[153] Fix | Delete

class _NetlocResultMixinBase(object):

[154] Fix | Delete

"""Shared methods for the parsed result objects containing a netloc element"""

[155] Fix | Delete

__slots__ = ()

[156] Fix | Delete

[157] Fix | Delete

@property

[158] Fix | Delete

def username(self):

[159] Fix | Delete

return self._userinfo[0]

[160] Fix | Delete

[161] Fix | Delete

@property

[162] Fix | Delete

def password(self):

[163] Fix | Delete

return self._userinfo[1]

[164] Fix | Delete

[165] Fix | Delete

@property

[166] Fix | Delete

def hostname(self):

[167] Fix | Delete

hostname = self._hostinfo[0]

[168] Fix | Delete

if not hostname:

[169] Fix | Delete

return None

[170] Fix | Delete

# Scoped IPv6 address may have zone info, which must not be lowercased

[171] Fix | Delete

# like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys

[172] Fix | Delete

separator = '%' if isinstance(hostname, str) else b'%'

[173] Fix | Delete

hostname, percent, zone = hostname.partition(separator)

[174] Fix | Delete

return hostname.lower() + percent + zone

[175] Fix | Delete

[176] Fix | Delete

@property

[177] Fix | Delete

def port(self):

[178] Fix | Delete

port = self._hostinfo[1]

[179] Fix | Delete

if port is not None:

[180] Fix | Delete

try:

[181] Fix | Delete

port = int(port, 10)

[182] Fix | Delete

except ValueError:

[183] Fix | Delete

message = f'Port could not be cast to integer value as {port!r}'

[184] Fix | Delete

raise ValueError(message) from None

[185] Fix | Delete

if not ( 0 <= port <= 65535):

[186] Fix | Delete

raise ValueError("Port out of range 0-65535")

[187] Fix | Delete

return port

[188] Fix | Delete

[189] Fix | Delete

[190] Fix | Delete

class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc decomposition for results whose fields are str."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the text before the last '@'."""
        userinfo, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            # No '@' at all: there is no userinfo part.
            return None, None
        username, colon, password = userinfo.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the text after the last '@'."""
        hostinfo = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = hostinfo.partition('[')
        if open_bracket:
            # Bracketed host: the port, if any, follows "]:".
            hostname, _, tail = bracketed.partition(']')
            port = tail.partition(':')[2]
        else:
            hostname, _, port = hostinfo.partition(':')
        # An empty port is reported as None.
        return hostname, (port or None)

[218] Fix | Delete

[219] Fix | Delete

[220] Fix | Delete

class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc decomposition for results whose fields are bytes."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the bytes before the last b'@'."""
        userinfo, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            # No b'@' at all: there is no userinfo part.
            return None, None
        username, colon, password = userinfo.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the bytes after the last b'@'."""
        hostinfo = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = hostinfo.partition(b'[')
        if open_bracket:
            # Bracketed host: the port, if any, follows b"]:".
            hostname, _, tail = bracketed.partition(b']')
            port = tail.partition(b':')[2]
        else:
            hostname, _, port = hostinfo.partition(b':')
        # An empty port is reported as None.
        return hostname, (port or None)

[248] Fix | Delete

[249] Fix | Delete

[250] Fix | Delete

from collections import namedtuple

[251] Fix | Delete

[252] Fix | Delete

# Plain namedtuple bases for the result classes; their docstrings and the
# per-field docstrings are attached below.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field docstrings.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__

[317] Fix | Delete

[318] Fix | Delete

[319] Fix | Delete

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

[323] Fix | Delete

[324] Fix | Delete

# Structured result objects for string data

[325] Fix | Delete

class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result pairing a URL with its separated fragment."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with '#' and the fragment appended if non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment

[332] Fix | Delete

[333] Fix | Delete

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str 5-field split result with netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Return the URL produced by passing this tuple to urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt

[337] Fix | Delete

[338] Fix | Delete

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str 6-field parse result with netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Return the URL produced by passing this tuple to urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt

[342] Fix | Delete

[343] Fix | Delete

# Structured result objects for bytes data

[344] Fix | Delete

class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes result pairing a URL with its separated fragment."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with b'#' and the fragment appended if non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment

[351] Fix | Delete

[352] Fix | Delete

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes 5-field split result with netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Return the URL produced by passing this tuple to urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt

[356] Fix | Delete

[357] Fix | Delete

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes 6-field parse result with netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Return the URL produced by passing this tuple to urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt

[361] Fix | Delete

[362] Fix | Delete

# Set up the encode/decode result pairs

[363] Fix | Delete

def _fix_result_transcoding():
    """Link each str result class to its bytes twin, and vice versa.

    These links are what the encode()/decode() mixin methods look up.
    """
    str_and_bytes_classes = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for str_cls, bytes_cls in str_and_bytes_classes:
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# One-shot setup at import time; the helper is removed afterwards.
_fix_result_transcoding()
del _fix_result_transcoding

[375] Fix | Delete

[376] Fix | Delete

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up into smaller bits (e.g. netloc stays
    a single string) and % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Params are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)

[391] Fix | Delete

[392] Fix | Delete

def _splitparams(url):

[393] Fix | Delete

if '/' in url:

[394] Fix | Delete

i = url.find(';', url.rfind('/'))

[395] Fix | Delete

if i < 0:

[396] Fix | Delete

return url, ''

[397] Fix | Delete

else:

[398] Fix | Delete

i = url.find(';')

[399] Fix | Delete

return url[:i], url[i+1:]

[400] Fix | Delete

[401] Fix | Delete

def _splitnetloc(url, start=0):

[402] Fix | Delete

delim = len(url) # position of end of domain part of url, default is end

[403] Fix | Delete

for c in '/?#': # look for delimiters; the order is NOT important

[404] Fix | Delete

wdelim = url.find(c, start) # find first of this delim

[405] Fix | Delete

if wdelim >= 0: # if found

[406] Fix | Delete

delim = min(delim, wdelim) # use earliest delim position

[407] Fix | Delete

return url[start:delim], url[delim:] # return (domain, rest)

[408] Fix | Delete

[409] Fix | Delete

def _checknetloc(netloc):

[410] Fix | Delete

if not netloc or netloc.isascii():

[411] Fix | Delete

return

[412] Fix | Delete

# looking for characters like \u2100 that expand to 'a/c'

[413] Fix | Delete

# IDNA uses NFKC equivalence, so normalize for this check

[414] Fix | Delete

import unicodedata

[415] Fix | Delete

n = netloc.replace('@', '') # ignore characters already included

[416] Fix | Delete

n = n.replace(':', '') # but not the surrounding text

[417] Fix | Delete

n = n.replace('#', '')

[418] Fix | Delete

n = n.replace('?', '')

[419] Fix | Delete

netloc2 = unicodedata.normalize('NFKC', n)

[420] Fix | Delete

if n == netloc2:

[421] Fix | Delete

return

[422] Fix | Delete

for c in '/?#@:':

[423] Fix | Delete

if c in netloc2:

[424] Fix | Delete

raise ValueError("netloc '" + netloc + "' contains invalid " +

[425] Fix | Delete

"characters under NFKC normalization")

[426] Fix | Delete

[427] Fix | Delete

def _remove_unsafe_bytes_from_url(url):
    """Return *url* with every _UNSAFE_URL_BYTES_TO_REMOVE entry deleted."""
    cleaned = url
    for unsafe in _UNSAFE_URL_BYTES_TO_REMOVE:
        cleaned = cleaned.replace(unsafe, "")
    return cleaned

[431] Fix | Delete

[432] Fix | Delete

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    The components are not broken up into smaller bits (e.g. netloc stays
    a single string) and % escapes are not expanded.

    Raises ValueError for a netloc with an unmatched '[' or ']', and for a
    netloc rejected by _checknetloc().
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    url = _remove_unsafe_bytes_from_url(url)
    scheme = _remove_unsafe_bytes_from_url(scheme)
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
    scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
    allow_fragments = bool(allow_fragments)

    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Common case: accepted as the scheme without the character
            # scan or the port-number test applied to other prefixes.
            scheme, url = 'http', url[colon + 1:]
        elif all(ch in scheme_chars for ch in prefix):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[colon + 1:]
            if not rest or any(ch not in '0123456789' for ch in rest):
                # not a port number
                scheme, url = prefix.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of '[' / ']' present means an unbalanced IPv6 literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)

[494] Fix | Delete

[495] Fix | Delete

def urlunparse(components):

[496] Fix | Delete

"""Put a parsed URL back together again. This may result in a

[497] Fix | Delete

slightly different, but equivalent URL, if the URL that was parsed

[498] Fix | Delete

originally had redundant delimiters, e.g. a ? with an empty query

[499] Fix | Delete

12 3