"""An extensible library for opening URLs using a variety of protocols
The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
The OpenerDirector manages a collection of Handler objects that do
all the actual work. Each Handler implements a particular protocol or
option. The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL. For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns. The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.
urlopen(url, data=None) -- Basic usage is the same as original
urllib. Pass the URL and, optionally, data to post to an HTTP URL, and
get a file-like object back. One difference is that you can also pass
a Request instance instead of URL. Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers. Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.
install_opener -- Installs a new opener as the default opener.
OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.
Request -- An object that encapsulates the state of a request. The
state can be as simple as the URL. It can also include extra HTTP
headers, e.g. a User-Agent.
URLError -- A subclass of IOError; individual protocols have their own
specific subclass.
HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
as an exceptional event or valid response.
# set up authentication info
authinfo = urllib2.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
uri='https://mahler:8092/site-updates.py',
proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
# build a new opener that adds authentication and caching FTP handlers
opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
urllib2.install_opener(opener)
f = urllib2.urlopen('http://www.python.org/')
# If an authentication error handler tries to perform authentication
# for some reason but fails, how should the error be signalled?
# The client needs to know the HTTP error code. But if the handler
# knows that the problem was, e.g., that it didn't know the hash
# algorithm that was requested in the challenge, it would be good to
# pass that information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation
# complex proxies XXX not sure what exactly was meant by this
# abstract factory for opener
from cStringIO import StringIO
from StringIO import StringIO
from urllib import (unwrap, unquote, splittype, splithost, quote,
addinfourl, splitport, splittag, toBytes,
splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
# support for FileHandler, proxies via environment variables
from urllib import localhost, url2pathname, getproxies, proxy_bypass
# used in User-Agent header sent
# Built from sys.version_info rather than by slicing sys.version: a
# three-character slice truncates two-digit minor versions
# (e.g. "3.10.4" would yield "3.1").
__version__ = "%d.%d" % sys.version_info[:2]
# Module-level convenience entry point: selects an opener and returns
# opener.open(url, data, timeout).
# NOTE(review): this definition has lost its indentation and several lines.
# The bare string after the first `if` has no enclosing statement, the
# ssl.create_default_context(...) call is never closed, and the conditions
# that choose between the three opener assignments are not visible.
# Restore it from a pristine copy before relying on it.
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
cafile=None, capath=None, cadefault=False, context=None):
# Any of the legacy CA arguments triggers construction of an SSL context.
if cafile or capath or cadefault:
"You can't pass both context and any of cafile, capath, and "
raise ValueError('SSL support not available')
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH,
# An opener built around an HTTPSHandler carrying the SSL context.
https_handler = HTTPSHandler(context=context)
opener = build_opener(https_handler)
https_handler = HTTPSHandler(context=context)
opener = build_opener(https_handler)
# Builds a default opener and also stores it in the module-level _opener.
_opener = opener = build_opener()
return opener.open(url, data, timeout)
def install_opener(opener):
    """Install *opener* as the module-wide default opener.

    Rebinds the module-level ``_opener`` name (the same global that
    urlopen() assigns when it builds a default opener).

    Returns None.
    """
    global _opener
    _opener = opener
# do these error classes make sense?
# make sure all of the IOError stuff is overridden. we just want to be
# subtypes.
# URLError is a sub-type of IOError, but it doesn't share any of
# the implementation. need to override __init__ and __str__.
# It sets self.args for compatibility with other EnvironmentError
# subclasses, but args doesn't have the typical format with errno in
# slot 0 and strerror in slot 1. This may be better than nothing.
# NOTE(review): the class header, the body of __init__ and the header of
# the method that owns the `return` below are not present in this copy.
def __init__(self, reason):
# Formats the stored reason as '<urlopen error REASON>'.
return '<urlopen error %s>' % self.reason
# NOTE(review): indentation and several lines are missing from this class
# (the attribute assignments in __init__, the guard around the base-class
# initialisation, and the header of the method that owns the `return`).
class HTTPError(URLError, addinfourl):
"""Raised when HTTP error occurs, but also acts like non-error return"""
# Private alias for the addinfourl initialiser, called from __init__.
__super_init = addinfourl.__init__
def __init__(self, url, code, msg, hdrs, fp):
# The addinfourl classes depend on fp being a valid file
# object. In some cases, the HTTPError may not have a valid
# file object. If this happens, the simplest workaround is to
# not initialize the base classes.
self.__super_init(fp, hdrs, url, code)
# Human-readable form: 'HTTP Error CODE: MSG'.
return 'HTTP Error %s: %s' % (self.code, self.msg)
# since URLError specifies a .reason attribute, HTTPError should also
# provide this attribute. See issue13211 for discussion.
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    *request* must provide ``get_full_url()`` and
    ``get_header(name, default)``.  The host is taken from the network
    location of the full URL; only when that is empty is the "Host"
    header consulted.  A trailing ``:port`` is stripped.
    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]
    if host == "":
        # No network location in the URL: fall back to the Host header
        # instead of unconditionally discarding the parsed value.
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()
# Request initialiser: records the unwrapped URL and its fragment, copies the
# supplied headers via add_header(), and stores the origin host and the
# unverifiable flag.
# NOTE(review): indentation is missing and some attribute initialisations
# (e.g. the dict that add_header() writes into) are not visible in this copy.
# The `headers={}` default is only iterated below, never mutated.
def __init__(self, url, data=None, headers={},
origin_req_host=None, unverifiable=False):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
self.__original = unwrap(url)
# Split off the '#fragment' part and keep both pieces.
self.__original, self.__fragment = splittag(self.__original)
# self.__r_type is what's left after doing the splittype
for key, value in headers.items():
self.add_header(key, value)
self.unredirected_hdrs = {}
# Derive the origin host from this request when the caller gave none.
if origin_req_host is None:
origin_req_host = request_host(self)
self.origin_req_host = origin_req_host
self.unverifiable = unverifiable
def __getattr__(self, attr):
    """Lazily compute the name-mangled ``__r_type`` / ``__r_host`` attributes.

    Invoked only when normal attribute lookup fails.  For the two mangled
    names handled here, the matching ``get_type()`` / ``get_host()``
    accessor is called for its side effect of populating the instance
    dict, and the freshly stored value is returned.

    Raises AttributeError for every other name.
    """
    # XXX this is a fallback mechanism to guard against these
    # methods getting called in a non-standard order.  this may be
    # too complicated and/or unnecessary.
    # XXX should the __r_XXX attributes be public?
    if attr in ('_Request__r_type', '_Request__r_host'):
        # attr[12:] drops the '_Request__r_' prefix, leaving 'type' or 'host'.
        getattr(self, 'get_' + attr[12:])()
        return self.__dict__[attr]
    # Call form of raise: equivalent to the old `raise E, v` statement but
    # valid on both Python 2 and Python 3.
    raise AttributeError(attr)
# XXX these helper methods are lame
# NOTE(review): several small accessors are fused together here; only the
# `add_data` header survives, and the lines below are bodies of other
# methods whose `def` lines (and surrounding conditionals) are missing.
def add_data(self, data):
# True when a data payload has been attached.
return self.data is not None
# Full URL with the fragment re-attached.
return '%s#%s' % (self.__original, self.__fragment)
# Split the scheme from the rest of the URL.
self.type, self.__r_type = splittype(self.__original)
raise ValueError, "unknown url type: %s" % self.__original
# Split the host from the remainder, then undo percent-quoting on the host.
self.host, self.__r_host = splithost(self.__r_type)
self.host = unquote(self.host)
# Route this request through a proxy.
# NOTE(review): indentation is missing and the alternative branch plus the
# final host assignment are not visible; the last line below belongs to a
# different method whose header is missing.
def set_proxy(self, host, type):
# For https without an existing tunnel host, remember the real host.
if self.type == 'https' and not self._tunnel_host:
self._tunnel_host = self.host
self.__r_host = self.__original
# True when the selector equals the original URL.
return self.__r_host == self.__original
def get_origin_req_host(self):
    """Return the ``origin_req_host`` attribute stored on this request."""
    return self.origin_req_host
def is_unverifiable(self):
    """Return the ``unverifiable`` flag stored on this request."""
    return self.unverifiable
def add_header(self, key, val):
    """Set header *key* to *val* in ``self.headers``.

    The key is normalised with ``str.capitalize()`` (first character
    upper-cased, the rest lower-cased), so a later call with a different
    spelling of the same name overwrites the earlier value.
    """
    # useful for something like authentication
    self.headers[key.capitalize()] = val
def add_unredirected_header(self, key, val):
    """Set header *key* to *val* in ``self.unredirected_hdrs``.

    The key is normalised with ``str.capitalize()`` before being stored.
    """
    # will not be added to a redirected request
    self.unredirected_hdrs[key.capitalize()] = val
def has_header(self, header_name):
    """Return True if *header_name* is a key of either header dict.

    The lookup is an exact, case-sensitive key match against
    ``self.headers`` and ``self.unredirected_hdrs``; no normalisation is
    applied to *header_name* here.
    """
    return (header_name in self.headers or
            header_name in self.unredirected_hdrs)
# Look up a header value, with the unredirected headers as a fallback source.
# NOTE(review): the opening of the return expression is missing (the line
# below ends with an unmatched ')'), and the last two lines belong to a
# different method whose header and return are not visible.
def get_header(self, header_name, default=None):
self.unredirected_hdrs.get(header_name, default))
# Merge both header dicts; entries in self.headers win on key clashes.
hdrs = self.unredirected_hdrs.copy()
hdrs.update(self.headers)
# NOTE(review): these are body lines of an initialiser whose class and `def`
# headers (and some of its attribute assignments) are missing from this copy.
# Default User-agent header value, built from the module's __version__.
client_version = "Python-urllib/%s" % __version__
self.addheaders = [('User-agent', client_version)]
# self.handlers is retained only for backward compatibility
# manage the individual handlers
# Per-protocol lists of response / request processors, keyed by protocol name.
self.process_response = {}
self.process_request = {}
# Register a handler: inspect its method names and file it into the matching
# lookup table(s), keeping each list ordered.
# NOTE(review): indentation is missing and so are the lines that derive
# `protocol`, `condition`, `i` and `kind` from each method name, the `continue`
# statements, and the tail of the TypeError call. Do not run as-is.
def add_handler(self, handler):
# Anything lacking add_parent is rejected as not being a handler.
if not hasattr(handler, "add_parent"):
raise TypeError("expected BaseHandler instance, got %r" %
for meth in dir(handler):
# These names look like protocol methods but are not.
if meth in ["redirect_request", "do_open", "proxy_open"]:
# oops, coincidental match
if condition.startswith("error"):
j = condition.find("_") + i + 1
# Error handlers live in a nested table: handle_error[protocol][kind].
lookup = self.handle_error.get(protocol, {})
self.handle_error[protocol] = lookup
elif condition == "open":
lookup = self.handle_open
elif condition == "response":
lookup = self.process_response
elif condition == "request":
lookup = self.process_request
# Sorted insertion keeps each handler list in handler order.
handlers = lookup.setdefault(kind, [])
bisect.insort(handlers, handler)
bisect.insort(self.handlers, handler)
# Only exists for backwards compatibility.
# Look up the handlers registered under `kind` in `chain` and fetch the method
# named `meth_name` from a handler.
# NOTE(review): the loop header, the call itself and the result handling are
# missing from this copy, as is the indentation.
def _call_chain(self, chain, kind, meth_name, *args):
# Handlers raise an exception if no one else should try to handle
# the request, or return None if they can't but another handler
# could.  Otherwise, they return the response.
# An unknown kind yields an empty tuple, i.e. no handlers.
handlers = chain.get(kind, ())
func = getattr(handler, meth_name)
# Open a URL: build a Request if needed, run the request processors for the
# protocol, perform the open, then run the response processors.
# NOTE(review): indentation is missing, together with the non-string branch,
# the timeout assignment, the request-processor call and the final return.
def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
# accept a URL or a Request object
if isinstance(fullurl, basestring):
req = Request(fullurl, data)
protocol = req.get_type()
# pre-process request: methods are named '<protocol>_request'
meth_name = protocol+"_request"
for processor in self.process_request.get(protocol, []):
meth = getattr(processor, meth_name)
response = self._open(req, data)
# post-process response: methods are named '<protocol>_response'
meth_name = protocol+"_response"
for processor in self.process_response.get(protocol, []):
meth = getattr(processor, meth_name)
# Each processor receives the response and returns its replacement.
response = meth(req, response)
# Try the 'default' open handlers, then the protocol-specific ones, then the
# 'unknown' handlers, in that order.
# NOTE(review): each _call_chain(...) call below is cut off before its closing
# arguments, and the checks between the calls are missing.
def _open(self, req, data=None):
result = self._call_chain(self.handle_open, 'default',
protocol = req.get_type()
result = self._call_chain(self.handle_open, protocol, protocol +
return self._call_chain(self.handle_open, 'unknown',
# Dispatch an error to the registered error handlers, trying the
# 'http_error_default' handlers as a second pass.
# NOTE(review): indentation is missing, along with the lines that rebind
# `proto`, define `orig_args`, and separate the http and non-http branches.
# The local name `dict` shadows the builtin.
def error(self, proto, *args):
if proto in ('http', 'https'):
# XXX http[s] protocols are special-cased
dict = self.handle_error['http'] # https is not different than http
meth_name = 'http_error_%s' % proto
meth_name = proto + '_error'
# First pass: handlers registered for this specific protocol/kind.
args = (dict, proto, meth_name) + args
result = self._call_chain(*args)
# Second pass: the catch-all 'default' http error handlers.
args = (dict, 'default', 'http_error_default') + orig_args
return self._call_chain(*args)
# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both
def build_opener(*handlers):
"""Create an opener object from a list of handlers.
The opener will use several default handlers, including support
for HTTP, FTP and when applicable, HTTPS.
If any of the handlers passed as arguments are subclasses of the
default handlers, the default handlers will not be used.
return isinstance(obj, (types.ClassType, type))
opener = OpenerDirector()
default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
HTTPDefaultErrorHandler, HTTPRedirectHandler,
FTPHandler, FileHandler, HTTPErrorProcessor]
if hasattr(httplib, 'HTTPS'):
default_classes.append(HTTPSHandler)
for klass in default_classes: