"""Open an arbitrary URL.
See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html
See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/
Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)
The object returned by URLopener().open(file) will differ per
protocol. All you know is that is has methods read(), readline(),
readlines(), fileno(), close() and info(). The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
from urlparse import urljoin as basejoin
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
"urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
"urlencode", "url2pathname", "pathname2url", "splittag",
"localhost", "thishost", "ftperrors", "basejoin", "unwrap",
"splittype", "splithost", "splituser", "splitpasswd", "splitport",
"splitnport", "splitquery", "splitattr", "splitvalue",
__version__ = '1.17' # XXX This version is not always updated :-(
MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
# Helper for non-unix systems
from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
from rourl2path import url2pathname, pathname2url
def url2pathname(pathname):
"""OS-specific conversion from a relative URL of the 'file' scheme
to a file system path; not recommended for general use."""
def pathname2url(pathname):
"""OS-specific conversion from a file system path to a relative URL
of the 'file' scheme; not recommended for general use."""
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
# (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?
# Shortcut for basic usage
def urlopen(url, data=None, proxies=None, context=None):
"""Create a file-like object for the specified URL to read from."""
from warnings import warnpy3k
warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
"favor of urllib2.urlopen()", stacklevel=2)
if proxies is not None or context is not None:
opener = FancyURLopener(proxies=proxies, context=context)
opener = FancyURLopener()
return opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None, context=None):
opener = FancyURLopener(context=context)
_urlopener = opener = FancyURLopener()
return opener.retrieve(url, filename, reporthook, data)
# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
def __init__(self, message, content):
IOError.__init__(self, message)
This is a class rather than just a subroutine because we may need
more than one set of global protocol-specific options.
Note -- this is a base class for those who don't want the
automatic handling of errors type 302 (relocated) and 401
(authorization needed)."""
version = "Python-urllib/%s" % __version__
def __init__(self, proxies=None, context=None, **x509):
assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
self.key_file = x509.get('key_file')
self.cert_file = x509.get('cert_file')
self.addheaders = [('User-Agent', self.version)]
self.__unlink = os.unlink # See cleanup()
# Undocumented feature: if you assign {} to tempcache,
# it is used to cache files retrieved with
# self.retrieve(). This is not enabled by default
# since it does not work for changing documents (and I
# haven't got the logic to check expiration headers
# Undocumented feature: you can use a different
# ftp cache by assigning to the .ftpcache member;
# in case you want logically independent URL openers
# XXX This is not threadsafe. Bah.
# This code sometimes runs when the rest of this module
# has already been deleted, so it can't use any globals
for file in self.__tempfiles:
def addheader(self, *args):
"""Add a header to be used by the HTTP interface only
e.g. u.addheader('Accept', 'sound/basic')"""
self.addheaders.append(args)
def open(self, fullurl, data=None):
"""Use URLopener().open(file) instead of open(file, 'r')."""
fullurl = unwrap(toBytes(fullurl))
# percent encode url, fixing lame server errors for e.g, like space
fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
if self.tempcache and fullurl in self.tempcache:
filename, headers = self.tempcache[fullurl]
fp = open(filename, 'rb')
return addinfourl(fp, headers, fullurl)
urltype, url = splittype(fullurl)
if urltype in self.proxies:
proxy = self.proxies[urltype]
urltype, proxyhost = splittype(proxy)
host, selector = splithost(proxyhost)
url = (host, fullurl) # Signal special case to open_*()
name = name.replace('-', '_')
if not hasattr(self, name):
return self.open_unknown_proxy(proxy, fullurl, data)
return self.open_unknown(fullurl, data)
return getattr(self, name)(url)
return getattr(self, name)(url, data)
except socket.error, msg:
raise IOError, ('socket error', msg), sys.exc_info()[2]
def open_unknown(self, fullurl, data=None):
"""Overridable interface to open unknown URL type."""
type, url = splittype(fullurl)
raise IOError, ('url error', 'unknown url type', type)
def open_unknown_proxy(self, proxy, fullurl, data=None):
"""Overridable interface to open unknown URL type."""
type, url = splittype(fullurl)
raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
def retrieve(self, url, filename=None, reporthook=None, data=None):
"""retrieve(url) returns (filename, headers) for a local object
or (tempfilename, headers) for a remote object."""
url = unwrap(toBytes(url))
if self.tempcache and url in self.tempcache:
return self.tempcache[url]
type, url1 = splittype(url)
if filename is None and (not type or type == 'file'):
fp = self.open_local_file(url1)
return url2pathname(splithost(url1)[1]), hdrs
fp = self.open(url, data)
tfp = open(filename, 'wb')
garbage, path = splittype(url)
garbage, path = splithost(path or "")
path, garbage = splitquery(path or "")
path, garbage = splitattr(path or "")
suffix = os.path.splitext(path)[1]
(fd, filename) = tempfile.mkstemp(suffix)
self.__tempfiles.append(filename)
tfp = os.fdopen(fd, 'wb')
result = filename, headers
if self.tempcache is not None:
self.tempcache[url] = result
if "content-length" in headers:
size = int(headers["Content-Length"])
reporthook(blocknum, bs, size)
reporthook(blocknum, bs, size)
# raise exception if actual size does not match content-length header
if size >= 0 and read < size:
raise ContentTooShortError("retrieval incomplete: got only %i out "
"of %i bytes" % (read, size), result)
# Each method named open_<type> knows how to open that type of URL
def open_http(self, url, data=None):
host, selector = splithost(url)
user_passwd, host = splituser(host)
# check whether the proxy contains authorization information
proxy_passwd, host = splituser(host)
# now we proceed with the url we want to obtain
urltype, rest = splittype(selector)
if urltype.lower() != 'http':
realhost, rest = splithost(rest)
user_passwd, realhost = splituser(realhost)
selector = "%s://%s%s" % (urltype, realhost, rest)
if proxy_bypass(realhost):
#print "proxy via http:", host, selector
if not host: raise IOError, ('http error', 'no host given')
proxy_passwd = unquote(proxy_passwd)
proxy_auth = base64.b64encode(proxy_passwd).strip()
user_passwd = unquote(user_passwd)
auth = base64.b64encode(user_passwd).strip()
h.putrequest('POST', selector)
h.putheader('Content-Type', 'application/x-www-form-urlencoded')
h.putheader('Content-Length', '%d' % len(data))
h.putrequest('GET', selector)
if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
if auth: h.putheader('Authorization', 'Basic %s' % auth)
if realhost: h.putheader('Host', realhost)
for args in self.addheaders: h.putheader(*args)
errcode, errmsg, headers = h.getreply()
# something went wrong with the HTTP status line
raise IOError, ('http protocol error', 0,
'got a bad status line', None)
# According to RFC 2616, "2xx" code indicates that the client's
# request was successfully received, understood, and accepted.
if (200 <= errcode < 300):
return addinfourl(fp, headers, "http:" + url, errcode)
return self.http_error(url, fp, errcode, errmsg, headers)
return self.http_error(url, fp, errcode, errmsg, headers, data)
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
Derived class can override this, or provide specific handlers
named http_error_DDD where DDD is the 3-digit error code."""
# First check if there's a specific handler for this error
name = 'http_error_%d' % errcode
method = getattr(self, name)
result = method(url, fp, errcode, errmsg, headers)
result = method(url, fp, errcode, errmsg, headers, data)
return self.http_error_default(url, fp, errcode, errmsg, headers)
def http_error_default(self, url, fp, errcode, errmsg, headers):
"""Default error handler: close the connection and raise IOError."""
raise IOError, ('http error', errcode, errmsg, headers)
def open_https(self, url, data=None):
"""Use HTTPS protocol."""
host, selector = splithost(url)
user_passwd, host = splituser(host)
# here, we determine, whether the proxy contains authorization information
proxy_passwd, host = splituser(host)
urltype, rest = splittype(selector)
if urltype.lower() != 'https':
realhost, rest = splithost(rest)
user_passwd, realhost = splituser(realhost)
selector = "%s://%s%s" % (urltype, realhost, rest)
#print "proxy via https:", host, selector
if not host: raise IOError, ('https error', 'no host given')
proxy_passwd = unquote(proxy_passwd)
proxy_auth = base64.b64encode(proxy_passwd).strip()
user_passwd = unquote(user_passwd)
auth = base64.b64encode(user_passwd).strip()
h = httplib.HTTPS(host, 0,
cert_file=self.cert_file,
h.putrequest('POST', selector)
h.putheader('Content-Type',
'application/x-www-form-urlencoded')
h.putheader('Content-Length', '%d' % len(data))
h.putrequest('GET', selector)
if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
if auth: h.putheader('Authorization', 'Basic %s' % auth)
if realhost: h.putheader('Host', realhost)
for args in self.addheaders: h.putheader(*args)
errcode, errmsg, headers = h.getreply()
# something went wrong with the HTTP status line
raise IOError, ('http protocol error', 0,
'got a bad status line', None)
# According to RFC 2616, "2xx" code indicates that the client's
# request was successfully received, understood, and accepted.
if (200 <= errcode < 300):
return addinfourl(fp, headers, "https:" + url, errcode)
return self.http_error(url, fp, errcode, errmsg, headers)
return self.http_error(url, fp, errcode, errmsg, headers,
def open_file(self, url):
"""Use local file or FTP depending on form of URL."""
if not isinstance(url, str):
raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
return self.open_ftp(url)
return self.open_local_file(url)
def open_local_file(self, url):
import mimetypes, mimetools, email.utils
from cStringIO import StringIO
from StringIO import StringIO
host, file = splithost(url)
localname = url2pathname(file)
stats = os.stat(localname)
raise IOError(e.errno, e.strerror, e.filename)
modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
mtype = mimetypes.guess_type(url)[0]
headers = mimetools.Message(StringIO(
'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
(mtype or 'text/plain', size, modified)))
urlfile = 'file://' + file
raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
return addinfourl(open(localname, 'rb'),
host, port = splitport(host)
and socket.gethostbyname(host) in (localhost(), thishost()):