"""An extensible library for opening URLs using a variety of protocols
The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as file-like
object; the returned object has some extra methods described below.
The OpenerDirector manages a collection of Handler objects that do
all the actual work. Each Handler implements a particular protocol or
option. The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL. For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns. The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.
urlopen(url, data=None) -- Basic usage is the same as original
urllib. pass the url and optionally data to post to an HTTP URL, and
get a file-like object back. One difference is that you can also pass
a Request instance instead of URL. Raises a URLError (subclass of
OSError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers. Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate. If one of the argument is a subclass of the default
handler, the argument will be installed instead of the default.
install_opener -- Installs a new opener as the default opener.
OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.
Request -- An object that encapsulates the state of a request. The
state can be as simple as the URL. It can also include extra HTTP
headers, e.g. a User-Agent.
# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
uri='https://mahler:8092/site-updates.py',
proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
urllib.request.CacheFTPHandler)
urllib.request.install_opener(opener)
f = urllib.request.urlopen('http://www.python.org/')
# If an authentication error handler that tries to perform
# authentication for some reason but fails, how should the error be
# signalled? The client needs to know the HTTP error code. But if
# the handler knows that the problem was, e.g., that it didn't know
# that hash algo that requested in the challenge, it would be good to
# pass that information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation
# complex proxies XXX not sure what exactly was meant by this
# abstract factory for opener
from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
urlparse, urlsplit, urljoin, unwrap, quote, unquote,
splittype, splithost, splitport, splituser, splitpasswd,
splitattr, splitquery, splitvalue, splittag, to_bytes,
unquote_to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook
'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
'UnknownHandler', 'HTTPErrorProcessor',
'urlopen', 'install_opener', 'build_opener',
'pathname2url', 'url2pathname', 'getproxies',
'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
*, cafile=None, capath=None, cadefault=False, context=None):
'''Open the URL url, which can be either a string or a Request object.
*data* must be an object specifying additional data to be sent to
the server, or None if no such data is needed. See Request for
urllib.request module uses HTTP/1.1 and includes a "Connection:close"
header in its HTTP requests.
The optional *timeout* parameter specifies a timeout in seconds for
blocking operations like the connection attempt (if not specified, the
global default timeout setting will be used). This only works for HTTP,
HTTPS and FTP connections.
If *context* is specified, it must be a ssl.SSLContext instance describing
the various SSL options. See HTTPSConnection for more details.
The optional *cafile* and *capath* parameters specify a set of trusted CA
certificates for HTTPS requests. cafile should point to a single file
containing a bundle of CA certificates, whereas capath should point to a
directory of hashed certificate files. More information can be found in
ssl.SSLContext.load_verify_locations().
The *cadefault* parameter is ignored.
This function always returns an object which can work as a context
manager and has methods such as
* geturl() - return the URL of the resource retrieved, commonly used to
determine if a redirect was followed
* info() - return the meta-information of the page, such as headers, in the
form of an email.message_from_string() instance (see Quick Reference to
* getcode() - return the HTTP status code of the response. Raises URLError
For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
object slightly modified. In addition to the three new methods above, the
msg attribute contains the same information as the reason attribute ---
the reason phrase returned by the server --- instead of the response
headers as it is specified in the documentation for HTTPResponse.
For FTP, file, and data URLs and requests explicitly handled by legacy
URLopener and FancyURLopener classes, this function returns a
urllib.response.addinfourl object.
Note that None may be returned if no handler handles the request (though
the default installed global OpenerDirector uses UnknownHandler to ensure
In addition, if proxy settings are detected (for example, when a *_proxy
environment variable like http_proxy is set), ProxyHandler is default
installed and makes sure the requests are handled through the proxy.
if cafile or capath or cadefault:
warnings.warn("cafile, capath and cadefault are deprecated, use a "
"custom context instead.", DeprecationWarning, 2)
"You can't pass both context and any of cafile, capath, and "
raise ValueError('SSL support not available')
context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
https_handler = HTTPSHandler(context=context)
opener = build_opener(https_handler)
https_handler = HTTPSHandler(context=context)
opener = build_opener(https_handler)
_opener = opener = build_opener()
return opener.open(url, data, timeout)
def install_opener(opener):
def urlretrieve(url, filename=None, reporthook=None, data=None):
Retrieve a URL into a temporary location on disk.
Requires a URL argument. If a filename is passed, it is used as
the temporary file location. The reporthook argument should be
a callable that accepts a block number, a read size, and the
total file size of the URL target. The data argument should be
If a filename is passed and the URL points to a local resource,
the result is a copy from local file to new file.
Returns a tuple containing the path to the newly created
data file as well as the resulting HTTPMessage object.
url_type, path = splittype(url)
with contextlib.closing(urlopen(url, data)) as fp:
# Just return the local path and the "headers" for file://
# URLs. No sense in performing a copy unless requested.
if url_type == "file" and not filename:
return os.path.normpath(path), headers
# Handle temporary file setup.
tfp = open(filename, 'wb')
tfp = tempfile.NamedTemporaryFile(delete=False)
_url_tempfiles.append(filename)
result = filename, headers
if "content-length" in headers:
size = int(headers["Content-Length"])
reporthook(blocknum, bs, size)
reporthook(blocknum, bs, size)
if size >= 0 and read < size:
raise ContentTooShortError(
"retrieval incomplete: got only %i out of %i bytes"
"""Clean up temporary files from urlretrieve calls."""
for temp_file in _url_tempfiles:
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
"""Return request-host, as defined by RFC 2965.
Variation from RFC: returned value is lowercased, for convenient
host = request.get_header("Host", "")
# remove port, if present
host = _cut_port_re.sub("", host, 1)
def __init__(self, url, data=None, headers={},
origin_req_host=None, unverifiable=False,
self.unredirected_hdrs = {}
for key, value in headers.items():
self.add_header(key, value)
if origin_req_host is None:
origin_req_host = request_host(self)
self.origin_req_host = origin_req_host
self.unverifiable = unverifiable
return '{}#{}'.format(self._full_url, self.fragment)
# unwrap('<URL:type://host/path>') --> 'type://host/path'
self._full_url = unwrap(url)
self._full_url, self.fragment = splittag(self._full_url)
# if we change data we need to remove content-length header
# (cause it's most probably calculated for previous value)
if self.has_header("Content-length"):
self.remove_header("Content-length")
self.type, rest = splittype(self._full_url)
raise ValueError("unknown url type: %r" % self.full_url)
self.host, self.selector = splithost(rest)
self.host = unquote(self.host)
"""Return a string indicating the HTTP request method."""
default_method = "POST" if self.data is not None else "GET"
return getattr(self, 'method', default_method)
def set_proxy(self, host, type):
if self.type == 'https' and not self._tunnel_host:
self._tunnel_host = self.host
self.selector = self.full_url
return self.selector == self.full_url
def add_header(self, key, val):
# useful for something like authentication
self.headers[key.capitalize()] = val
def add_unredirected_header(self, key, val):
# will not be added to a redirected request
self.unredirected_hdrs[key.capitalize()] = val
def has_header(self, header_name):
return (header_name in self.headers or
header_name in self.unredirected_hdrs)
def get_header(self, header_name, default=None):
self.unredirected_hdrs.get(header_name, default))
def remove_header(self, header_name):
self.headers.pop(header_name, None)
self.unredirected_hdrs.pop(header_name, None)
hdrs = self.unredirected_hdrs.copy()
hdrs.update(self.headers)
return list(hdrs.items())
client_version = "Python-urllib/%s" % __version__
self.addheaders = [('User-agent', client_version)]
# self.handlers is retained only for backward compatibility
# manage the individual handlers
self.process_response = {}
self.process_request = {}
def add_handler(self, handler):
if not hasattr(handler, "add_parent"):
raise TypeError("expected BaseHandler instance, got %r" %
for meth in dir(handler):
if meth in ["redirect_request", "do_open", "proxy_open"]:
# oops, coincidental match
if condition.startswith("error"):
j = condition.find("_") + i + 1
lookup = self.handle_error.get(protocol, {})
self.handle_error[protocol] = lookup
elif condition == "open":
lookup = self.handle_open
elif condition == "response":
lookup = self.process_response
elif condition == "request":
lookup = self.process_request
handlers = lookup.setdefault(kind, [])
bisect.insort(handlers, handler)
bisect.insort(self.handlers, handler)
# Only exists for backwards compatibility.
def _call_chain(self, chain, kind, meth_name, *args):
# Handlers raise an exception if no one else should try to handle
# the request, or return None if they can't but another handler
# could. Otherwise, they return the response.