IGN:Use patched mechanize implementation that correctly closes connections

Kovid Goyal 2008-09-11 17:07:21 -07:00
parent e7c7cc64eb
commit 6fee09b9d2
24 changed files with 8779 additions and 2 deletions


@@ -2,7 +2,7 @@
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-import sys, os, re, logging, time, subprocess, mechanize, atexit
+import sys, os, re, logging, time, subprocess, atexit
 from htmlentitydefs import name2codepoint
 from math import floor
 from logging import Formatter
@@ -14,7 +14,7 @@ from calibre.constants import iswindows, isosx, islinux, isfrozen, \
                               terminal_controller, preferred_encoding, \
                               __appname__, __version__, __author__, \
                               win32event, win32api, winerror, fcntl
-
+from calibre.utils import mechanize
 
 def unicode_path(path, abs=False):
     if not isinstance(path, unicode):
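
For context, a hedged usage sketch of the newly bundled package as imported above. The URL is a placeholder, and the explicit close() reflects the stated point of the patch, namely that responses release their connections when closed.

from calibre.utils import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)          # typical for feed/recipe fetching
response = br.open('http://example.com/feed')
try:
    data = response.read()
finally:
    response.close()                 # the patched implementation is meant to free the socket here
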


@@ -0,0 +1,125 @@
__all__ = [
'AbstractBasicAuthHandler',
'AbstractDigestAuthHandler',
'BaseHandler',
'Browser',
'BrowserStateError',
'CacheFTPHandler',
'ContentTooShortError',
'Cookie',
'CookieJar',
'CookiePolicy',
'DefaultCookiePolicy',
'DefaultFactory',
'FTPHandler',
'Factory',
'FileCookieJar',
'FileHandler',
'FormNotFoundError',
'FormsFactory',
'GopherError',
'GopherHandler',
'HTTPBasicAuthHandler',
'HTTPCookieProcessor',
'HTTPDefaultErrorHandler',
'HTTPDigestAuthHandler',
'HTTPEquivProcessor',
'HTTPError',
'HTTPErrorProcessor',
'HTTPHandler',
'HTTPPasswordMgr',
'HTTPPasswordMgrWithDefaultRealm',
'HTTPProxyPasswordMgr',
'HTTPRedirectDebugProcessor',
'HTTPRedirectHandler',
'HTTPRefererProcessor',
'HTTPRefreshProcessor',
'HTTPRequestUpgradeProcessor',
'HTTPResponseDebugProcessor',
'HTTPRobotRulesProcessor',
'HTTPSClientCertMgr',
'HTTPSHandler',
'HeadParser',
'History',
'LWPCookieJar',
'Link',
'LinkNotFoundError',
'LinksFactory',
'LoadError',
'MSIECookieJar',
'MozillaCookieJar',
'OpenerDirector',
'OpenerFactory',
'ParseError',
'ProxyBasicAuthHandler',
'ProxyDigestAuthHandler',
'ProxyHandler',
'Request',
'ResponseUpgradeProcessor',
'RobotExclusionError',
'RobustFactory',
'RobustFormsFactory',
'RobustLinksFactory',
'RobustTitleFactory',
'SeekableProcessor',
'SeekableResponseOpener',
'TitleFactory',
'URLError',
'USE_BARE_EXCEPT',
'UnknownHandler',
'UserAgent',
'UserAgentBase',
'XHTMLCompatibleHeadParser',
'__version__',
'build_opener',
'install_opener',
'lwp_cookie_str',
'make_response',
'request_host',
'response_seek_wrapper', # XXX deprecate in public interface?
'seek_wrapped_response', # XXX should probably use this internally in place of response_seek_wrapper()
'str2time',
'urlopen',
'urlretrieve']
from _mechanize import __version__
# high-level stateful browser-style interface
from _mechanize import \
Browser, History, \
BrowserStateError, LinkNotFoundError, FormNotFoundError
# configurable URL-opener interface
from _useragent import UserAgentBase, UserAgent
from _html import \
ParseError, \
Link, \
Factory, DefaultFactory, RobustFactory, \
FormsFactory, LinksFactory, TitleFactory, \
RobustFormsFactory, RobustLinksFactory, RobustTitleFactory
# urllib2 work-alike interface (part from mechanize, part from urllib2)
# This is a superset of the urllib2 interface.
from _urllib2 import *
# misc
from _opener import ContentTooShortError, OpenerFactory, urlretrieve
from _util import http2time as str2time
from _response import \
response_seek_wrapper, seek_wrapped_response, make_response
from _http import HeadParser
try:
from _http import XHTMLCompatibleHeadParser
except ImportError:
pass
# cookies
from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \
CookieJar, FileCookieJar, LoadError, request_host
from _lwpcookiejar import LWPCookieJar, lwp_cookie_str
from _mozillacookiejar import MozillaCookieJar
from _msiecookiejar import MSIECookieJar
# If you hate the idea of turning bugs into warnings, do:
# import mechanize; mechanize.USE_BARE_EXCEPT = False
USE_BARE_EXCEPT = True


@@ -0,0 +1,500 @@
"""HTTP Authentication and Proxy support.
All but HTTPProxyPasswordMgr come from Python 2.5.
Copyright 2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import re, base64, urlparse, posixpath, md5, sha, sys, copy, os, random, time
from urllib2 import BaseHandler, HTTPError, parse_keqv_list, parse_http_list
from urllib import getproxies, unquote, splittype, splituser, splitpasswd, \
    splitport
def _parse_proxy(proxy):
"""Return (scheme, user, password, host/port) given a URL or an authority.
If a URL is supplied, it must have an authority (host:port) component.
According to RFC 3986, having an authority component means the URL must
have two slashes after the scheme:
>>> _parse_proxy('file:/ftp.example.com/')
Traceback (most recent call last):
ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
The first three items of the returned tuple may be None.
Examples of authority parsing:
>>> _parse_proxy('proxy.example.com')
(None, None, None, 'proxy.example.com')
>>> _parse_proxy('proxy.example.com:3128')
(None, None, None, 'proxy.example.com:3128')
The authority component may optionally include userinfo (assumed to be
username:password):
>>> _parse_proxy('joe:password@proxy.example.com')
(None, 'joe', 'password', 'proxy.example.com')
>>> _parse_proxy('joe:password@proxy.example.com:3128')
(None, 'joe', 'password', 'proxy.example.com:3128')
Same examples, but with URLs instead:
>>> _parse_proxy('http://proxy.example.com/')
('http', None, None, 'proxy.example.com')
>>> _parse_proxy('http://proxy.example.com:3128/')
('http', None, None, 'proxy.example.com:3128')
>>> _parse_proxy('http://joe:password@proxy.example.com/')
('http', 'joe', 'password', 'proxy.example.com')
>>> _parse_proxy('http://joe:password@proxy.example.com:3128')
('http', 'joe', 'password', 'proxy.example.com:3128')
Everything after the authority is ignored:
>>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
('ftp', 'joe', 'password', 'proxy.example.com')
Test for no trailing '/' case:
>>> _parse_proxy('http://joe:password@proxy.example.com')
('http', 'joe', 'password', 'proxy.example.com')
"""
scheme, r_scheme = splittype(proxy)
if not r_scheme.startswith("/"):
# authority
scheme = None
authority = proxy
else:
# URL
if not r_scheme.startswith("//"):
raise ValueError("proxy URL with no authority: %r" % proxy)
# We have an authority, so for RFC 3986-compliant URLs (by ss 3.2.2
# and 3.3.), path is empty or starts with '/'
end = r_scheme.find("/", 2)
if end == -1:
end = None
authority = r_scheme[2:end]
userinfo, hostport = splituser(authority)
if userinfo is not None:
user, password = splitpasswd(userinfo)
else:
user = password = None
return scheme, user, password, hostport
class ProxyHandler(BaseHandler):
# Proxies must be in front
handler_order = 100
def __init__(self, proxies=None):
if proxies is None:
proxies = getproxies()
assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
self.proxies = proxies
for type, url in proxies.items():
setattr(self, '%s_open' % type,
lambda r, proxy=url, type=type, meth=self.proxy_open: \
meth(r, proxy, type))
def proxy_open(self, req, proxy, type):
orig_type = req.get_type()
proxy_type, user, password, hostport = _parse_proxy(proxy)
if proxy_type is None:
proxy_type = orig_type
if user and password:
user_pass = '%s:%s' % (unquote(user), unquote(password))
creds = base64.encodestring(user_pass).strip()
req.add_header('Proxy-authorization', 'Basic ' + creds)
hostport = unquote(hostport)
req.set_proxy(hostport, proxy_type)
if orig_type == proxy_type:
# let other handlers take care of it
return None
else:
# need to start over, because the other handlers don't
# grok the proxy's URL type
# e.g. if we have a constructor arg proxies like so:
# {'http': 'ftp://proxy.example.com'}, we may end up turning
# a request for http://acme.example.com/a into one for
# ftp://proxy.example.com/a
return self.parent.open(req)
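
A hedged usage sketch, not part of this file: passing an explicit proxies mapping (with userinfo, as in the doctests above) through the package-level build_opener. Host names are placeholders.

import mechanize

proxy_handler = mechanize.ProxyHandler(
    {'http': 'http://joe:password@proxy.example.com:3128'})
opener = mechanize.build_opener(proxy_handler)
# The Proxy-authorization header is added in proxy_open() above.
response = opener.open('http://acme.example.com/a')
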
class HTTPPasswordMgr:
def __init__(self):
self.passwd = {}
def add_password(self, realm, uri, user, passwd):
# uri could be a single URI or a sequence
if isinstance(uri, basestring):
uri = [uri]
if not realm in self.passwd:
self.passwd[realm] = {}
for default_port in True, False:
reduced_uri = tuple(
[self.reduce_uri(u, default_port) for u in uri])
self.passwd[realm][reduced_uri] = (user, passwd)
def find_user_password(self, realm, authuri):
domains = self.passwd.get(realm, {})
for default_port in True, False:
reduced_authuri = self.reduce_uri(authuri, default_port)
for uris, authinfo in domains.iteritems():
for uri in uris:
if self.is_suburi(uri, reduced_authuri):
return authinfo
return None, None
def reduce_uri(self, uri, default_port=True):
"""Accept authority or URI and extract only the authority and path."""
# note HTTP URLs do not have a userinfo component
parts = urlparse.urlsplit(uri)
if parts[1]:
# URI
scheme = parts[0]
authority = parts[1]
path = parts[2] or '/'
else:
# host or host:port
scheme = None
authority = uri
path = '/'
host, port = splitport(authority)
if default_port and port is None and scheme is not None:
dport = {"http": 80,
"https": 443,
}.get(scheme)
if dport is not None:
authority = "%s:%d" % (host, dport)
return authority, path
def is_suburi(self, base, test):
"""Check if test is below base in a URI tree
Both args must be URIs in reduced form.
"""
if base == test:
return True
if base[0] != test[0]:
return False
common = posixpath.commonprefix((base[1], test[1]))
if len(common) == len(base[1]):
return True
return False
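
An illustrative sketch (realm and URLs are assumptions) of how the reduced (authority, path) matching above behaves:

from mechanize import HTTPPasswordMgr

mgr = HTTPPasswordMgr()
mgr.add_password("My Realm", "http://example.com/private/", "joe", "secret")
# '/private/index.html' lies below '/private/', so is_suburi() matches and the
# stored credentials are returned.
print mgr.find_user_password("My Realm", "http://example.com/private/index.html")
# -> ('joe', 'secret')
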
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
def find_user_password(self, realm, authuri):
user, password = HTTPPasswordMgr.find_user_password(self, realm,
authuri)
if user is not None:
return user, password
return HTTPPasswordMgr.find_user_password(self, None, authuri)
class AbstractBasicAuthHandler:
rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
# XXX there can actually be multiple auth-schemes in a
# www-authenticate header. should probably be a lot more careful
# in parsing them to extract multiple alternatives
def __init__(self, password_mgr=None):
if password_mgr is None:
password_mgr = HTTPPasswordMgr()
self.passwd = password_mgr
self.add_password = self.passwd.add_password
def http_error_auth_reqed(self, authreq, host, req, headers):
# host may be an authority (without userinfo) or a URL with an
# authority
# XXX could be multiple headers
authreq = headers.get(authreq, None)
if authreq:
mo = AbstractBasicAuthHandler.rx.search(authreq)
if mo:
scheme, realm = mo.groups()
if scheme.lower() == 'basic':
return self.retry_http_basic_auth(host, req, realm)
def retry_http_basic_auth(self, host, req, realm):
user, pw = self.passwd.find_user_password(realm, host)
if pw is not None:
raw = "%s:%s" % (user, pw)
auth = 'Basic %s' % base64.encodestring(raw).strip()
if req.headers.get(self.auth_header, None) == auth:
return None
newreq = copy.copy(req)
newreq.add_header(self.auth_header, auth)
newreq.visit = False
return self.parent.open(newreq)
else:
return None
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
auth_header = 'Authorization'
def http_error_401(self, req, fp, code, msg, headers):
url = req.get_full_url()
return self.http_error_auth_reqed('www-authenticate',
url, req, headers)
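
A hedged sketch of wiring the handler into an opener; the realm, URL and credentials are assumptions.

import mechanize

auth = mechanize.HTTPBasicAuthHandler()
auth.add_password(realm='Protected Area', uri='http://example.com/',
                  user='joe', passwd='secret')
opener = mechanize.build_opener(auth)
# A 401 response triggers retry_http_basic_auth() above with the stored credentials.
response = opener.open('http://example.com/private/')
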
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
auth_header = 'Proxy-authorization'
def http_error_407(self, req, fp, code, msg, headers):
# http_error_auth_reqed requires that there is no userinfo component in
# authority. Assume there isn't one, since urllib2 does not (and
# should not, RFC 3986 s. 3.2.1) support requests for URLs containing
# userinfo.
authority = req.get_host()
return self.http_error_auth_reqed('proxy-authenticate',
authority, req, headers)
def randombytes(n):
"""Return n random bytes."""
# Use /dev/urandom if it is available. Fall back to random module
# if not. It might be worthwhile to extend this function to use
# other platform-specific mechanisms for getting random bytes.
if os.path.exists("/dev/urandom"):
f = open("/dev/urandom")
s = f.read(n)
f.close()
return s
else:
L = [chr(random.randrange(0, 256)) for i in range(n)]
return "".join(L)
class AbstractDigestAuthHandler:
# Digest authentication is specified in RFC 2617.
# XXX The client does not inspect the Authentication-Info header
# in a successful response.
# XXX It should be possible to test this implementation against
# a mock server that just generates a static set of challenges.
# XXX qop="auth-int" support is shaky
def __init__(self, passwd=None):
if passwd is None:
passwd = HTTPPasswordMgr()
self.passwd = passwd
self.add_password = self.passwd.add_password
self.retried = 0
self.nonce_count = 0
def reset_retry_count(self):
self.retried = 0
def http_error_auth_reqed(self, auth_header, host, req, headers):
authreq = headers.get(auth_header, None)
if self.retried > 5:
# Don't fail endlessly - if we failed once, we'll probably
# fail a second time. Hm. Unless the Password Manager is
# prompting for the information. Crap. This isn't great
# but it's better than the current 'repeat until recursion
# depth exceeded' approach <wink>
raise HTTPError(req.get_full_url(), 401, "digest auth failed",
headers, None)
else:
self.retried += 1
if authreq:
scheme = authreq.split()[0]
if scheme.lower() == 'digest':
return self.retry_http_digest_auth(req, authreq)
def retry_http_digest_auth(self, req, auth):
token, challenge = auth.split(' ', 1)
chal = parse_keqv_list(parse_http_list(challenge))
auth = self.get_authorization(req, chal)
if auth:
auth_val = 'Digest %s' % auth
if req.headers.get(self.auth_header, None) == auth_val:
return None
newreq = copy.copy(req)
newreq.add_unredirected_header(self.auth_header, auth_val)
newreq.visit = False
return self.parent.open(newreq)
def get_cnonce(self, nonce):
# The cnonce-value is an opaque
# quoted string value provided by the client and used by both client
# and server to avoid chosen plaintext attacks, to provide mutual
# authentication, and to provide some message integrity protection.
# This isn't a fabulous effort, but it's probably Good Enough.
dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
randombytes(8))).hexdigest()
return dig[:16]
def get_authorization(self, req, chal):
try:
realm = chal['realm']
nonce = chal['nonce']
qop = chal.get('qop')
algorithm = chal.get('algorithm', 'MD5')
# mod_digest doesn't send an opaque, even though it isn't
# supposed to be optional
opaque = chal.get('opaque', None)
except KeyError:
return None
H, KD = self.get_algorithm_impls(algorithm)
if H is None:
return None
user, pw = self.passwd.find_user_password(realm, req.get_full_url())
if user is None:
return None
# XXX not implemented yet
if req.has_data():
entdig = self.get_entity_digest(req.get_data(), chal)
else:
entdig = None
A1 = "%s:%s:%s" % (user, realm, pw)
A2 = "%s:%s" % (req.get_method(),
# XXX selector: what about proxies and full urls
req.get_selector())
if qop == 'auth':
self.nonce_count += 1
ncvalue = '%08x' % self.nonce_count
cnonce = self.get_cnonce(nonce)
noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
respdig = KD(H(A1), noncebit)
elif qop is None:
respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
else:
# XXX handle auth-int.
pass
# XXX should the partial digests be encoded too?
base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
'response="%s"' % (user, realm, nonce, req.get_selector(),
respdig)
if opaque:
base += ', opaque="%s"' % opaque
if entdig:
base += ', digest="%s"' % entdig
base += ', algorithm="%s"' % algorithm
if qop:
base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
return base
def get_algorithm_impls(self, algorithm):
# lambdas assume digest modules are imported at the top level
if algorithm == 'MD5':
H = lambda x: md5.new(x).hexdigest()
elif algorithm == 'SHA':
H = lambda x: sha.new(x).hexdigest()
# XXX MD5-sess
KD = lambda s, d: H("%s:%s" % (s, d))
return H, KD
def get_entity_digest(self, data, chal):
# XXX not implemented yet
return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
"""An authentication protocol defined by RFC 2069
Digest authentication improves on basic authentication because it
does not transmit passwords in the clear.
"""
auth_header = 'Authorization'
handler_order = 490
def http_error_401(self, req, fp, code, msg, headers):
host = urlparse.urlparse(req.get_full_url())[1]
retry = self.http_error_auth_reqed('www-authenticate',
host, req, headers)
self.reset_retry_count()
return retry
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
auth_header = 'Proxy-Authorization'
handler_order = 490
def http_error_407(self, req, fp, code, msg, headers):
host = req.get_host()
retry = self.http_error_auth_reqed('proxy-authenticate',
host, req, headers)
self.reset_retry_count()
return retry
# XXX ugly implementation, should probably not bother deriving
class HTTPProxyPasswordMgr(HTTPPasswordMgr):
# has default realm and host/port
def add_password(self, realm, uri, user, passwd):
# uri could be a single URI or a sequence
if uri is None or isinstance(uri, basestring):
uris = [uri]
else:
uris = uri
passwd_by_domain = self.passwd.setdefault(realm, {})
for uri in uris:
for default_port in True, False:
reduced_uri = self.reduce_uri(uri, default_port)
passwd_by_domain[reduced_uri] = (user, passwd)
def find_user_password(self, realm, authuri):
attempts = [(realm, authuri), (None, authuri)]
# bleh, want default realm to take precedence over default
# URI/authority, hence this outer loop
for default_uri in False, True:
for realm, authuri in attempts:
authinfo_by_domain = self.passwd.get(realm, {})
for default_port in True, False:
reduced_authuri = self.reduce_uri(authuri, default_port)
for uri, authinfo in authinfo_by_domain.iteritems():
if uri is None and not default_uri:
continue
if self.is_suburi(uri, reduced_authuri):
return authinfo
user, password = None, None
if user is not None:
break
return user, password
def reduce_uri(self, uri, default_port=True):
if uri is None:
return None
return HTTPPasswordMgr.reduce_uri(self, uri, default_port)
def is_suburi(self, base, test):
if base is None:
# default to the proxy's host/port
hostport, path = test
base = (hostport, "/")
return HTTPPasswordMgr.is_suburi(self, base, test)
class HTTPSClientCertMgr(HTTPPasswordMgr):
# implementation inheritance: this is not a proper subclass
def add_key_cert(self, uri, key_file, cert_file):
self.add_password(None, uri, key_file, cert_file)
def find_key_cert(self, authuri):
return HTTPPasswordMgr.find_user_password(self, None, authuri)

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,28 @@
import logging
from urllib2 import BaseHandler
from _response import response_seek_wrapper
class HTTPResponseDebugProcessor(BaseHandler):
handler_order = 900 # before redirections, after everything else
def http_response(self, request, response):
if not hasattr(response, "seek"):
response = response_seek_wrapper(response)
info = logging.getLogger("mechanize.http_responses").info
try:
info(response.read())
finally:
response.seek(0)
info("*****************************************************")
return response
https_response = http_response
class HTTPRedirectDebugProcessor(BaseHandler):
def http_request(self, request):
if hasattr(request, "redirect_dict"):
info = logging.getLogger("mechanize.http_redirects").info
info("redirecting to %s", request.get_full_url())
return request
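
A hedged sketch of enabling these processors; the logger names come from the module above, the URL is a placeholder.

import logging
import mechanize

logging.basicConfig(level=logging.INFO)   # "mechanize.http_responses" and
                                          # "mechanize.http_redirects" log at INFO
opener = mechanize.build_opener(
    mechanize.HTTPResponseDebugProcessor(),
    mechanize.HTTPRedirectDebugProcessor())
opener.open('http://example.com/')
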


@@ -0,0 +1,103 @@
import urllib2
from cStringIO import StringIO
import _response
# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library
class GzipConsumer:
def __init__(self, consumer):
self.__consumer = consumer
self.__decoder = None
self.__data = ""
def __getattr__(self, key):
return getattr(self.__consumer, key)
def feed(self, data):
if self.__decoder is None:
# check if we have a full gzip header
data = self.__data + data
try:
i = 10
flag = ord(data[3])
if flag & 4: # extra
x = ord(data[i]) + 256*ord(data[i+1])
i = i + 2 + x
if flag & 8: # filename
while ord(data[i]):
i = i + 1
i = i + 1
if flag & 16: # comment
while ord(data[i]):
i = i + 1
i = i + 1
if flag & 2: # crc
i = i + 2
if len(data) < i:
raise IndexError("not enough data")
if data[:3] != "\x1f\x8b\x08":
raise IOError("invalid gzip data")
data = data[i:]
except IndexError:
self.__data = data
return # need more data
import zlib
self.__data = ""
self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS)
data = self.__decoder.decompress(data)
if data:
self.__consumer.feed(data)
def close(self):
if self.__decoder:
data = self.__decoder.flush()
if data:
self.__consumer.feed(data)
self.__consumer.close()
# --------------------------------------------------------------------
# the rest of this module is John Lee's stupid code, not
# Fredrik's nice code :-)
class stupid_gzip_consumer:
def __init__(self): self.data = []
def feed(self, data): self.data.append(data)
class stupid_gzip_wrapper(_response.closeable_response):
def __init__(self, response):
self._response = response
c = stupid_gzip_consumer()
gzc = GzipConsumer(c)
gzc.feed(response.read())
self.__data = StringIO("".join(c.data))
def read(self, size=-1):
return self.__data.read(size)
def readline(self, size=-1):
return self.__data.readline(size)
def readlines(self, sizehint=-1):
return self.__data.readlines(sizehint)
def __getattr__(self, name):
# delegate unknown methods/attributes
return getattr(self._response, name)
class HTTPGzipProcessor(urllib2.BaseHandler):
handler_order = 200 # response processing before HTTPEquivProcessor
def http_request(self, request):
request.add_header("Accept-Encoding", "gzip")
return request
def http_response(self, request, response):
# post-process response
enc_hdrs = response.info().getheaders("Content-encoding")
for enc_hdr in enc_hdrs:
if ("gzip" in enc_hdr) or ("compress" in enc_hdr):
return stupid_gzip_wrapper(response)
return response
https_response = http_response
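
A hedged sketch, assuming the Browser convenience toggle set_handle_gzip() is available in this bundled copy to install HTTPGzipProcessor; the URL is a placeholder.

import mechanize

br = mechanize.Browser()
br.set_handle_gzip(True)   # assumed to install HTTPGzipProcessor: adds
                           # Accept-Encoding: gzip and unwraps gzipped bodies
response = br.open('http://example.com/')
print response.read()      # already decompressed by stupid_gzip_wrapper
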


@@ -0,0 +1,226 @@
"""Utility functions for HTTP header value parsing and construction.
Copyright 1997-1998, Gisle Aas
Copyright 2002-2006, John J. Lee
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import os, re
from types import StringType
from types import UnicodeType
STRING_TYPES = StringType, UnicodeType
from _util import http2time
import _rfc3986
def is_html(ct_headers, url, allow_xhtml=False):
"""
ct_headers: Sequence of Content-Type headers
url: Response URL
"""
if not ct_headers:
# guess
ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
html_exts = [".htm", ".html"]
if allow_xhtml:
html_exts += [".xhtml"]
return ext in html_exts
# use first header
ct = split_header_words(ct_headers)[0][0][0]
html_types = ["text/html"]
if allow_xhtml:
html_types += [
"text/xhtml", "text/xml",
"application/xml", "application/xhtml+xml",
]
return ct in html_types
def unmatched(match):
"""Return unmatched part of re.Match object."""
start, end = match.span(0)
return match.string[:start]+match.string[end:]
token_re = re.compile(r"^\s*([^=\s;,]+)")
quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
escape_re = re.compile(r"\\(.)")
def split_header_words(header_values):
r"""Parse header values into a list of lists containing key,value pairs.
The function knows how to deal with ",", ";" and "=" as well as quoted
values after "=". A list of space separated tokens are parsed as if they
were separated by ";".
If the header_values passed as argument contains multiple values, then they
are treated as if they were a single value separated by comma ",".
This means that this function is useful for parsing header fields that
follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
the requirement for tokens).
headers = #header
header = (token | parameter) *( [";"] (token | parameter))
token = 1*<any CHAR except CTLs or separators>
separators = "(" | ")" | "<" | ">" | "@"
| "," | ";" | ":" | "\" | <">
| "/" | "[" | "]" | "?" | "="
| "{" | "}" | SP | HT
quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
qdtext = <any TEXT except <">>
quoted-pair = "\" CHAR
parameter = attribute "=" value
attribute = token
value = token | quoted-string
Each header is represented by a list of key/value pairs. The value for a
simple token (not part of a parameter) is None. Syntactically incorrect
headers will not necessarily be parsed as you would want.
This is easier to describe with some examples:
>>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
[[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
>>> split_header_words(['text/html; charset="iso-8859-1"'])
[[('text/html', None), ('charset', 'iso-8859-1')]]
>>> split_header_words([r'Basic realm="\"foo\bar\""'])
[[('Basic', None), ('realm', '"foobar"')]]
"""
assert type(header_values) not in STRING_TYPES
result = []
for text in header_values:
orig_text = text
pairs = []
while text:
m = token_re.search(text)
if m:
text = unmatched(m)
name = m.group(1)
m = quoted_value_re.search(text)
if m: # quoted value
text = unmatched(m)
value = m.group(1)
value = escape_re.sub(r"\1", value)
else:
m = value_re.search(text)
if m: # unquoted value
text = unmatched(m)
value = m.group(1)
value = value.rstrip()
else:
# no value, a lone token
value = None
pairs.append((name, value))
elif text.lstrip().startswith(","):
# concatenated headers, as per RFC 2616 section 4.2
text = text.lstrip()[1:]
if pairs: result.append(pairs)
pairs = []
else:
# skip junk
non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
assert nr_junk_chars > 0, (
"split_header_words bug: '%s', '%s', %s" %
(orig_text, text, pairs))
text = non_junk
if pairs: result.append(pairs)
return result
join_escape_re = re.compile(r"([\"\\])")
def join_header_words(lists):
"""Do the inverse of the conversion done by split_header_words.
Takes a list of lists of (key, value) pairs and produces a single header
value. Attribute values are quoted if needed.
>>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
'text/plain; charset="iso-8859/1"'
>>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
'text/plain, charset="iso-8859/1"'
"""
headers = []
for pairs in lists:
attr = []
for k, v in pairs:
if v is not None:
if not re.search(r"^\w+$", v):
v = join_escape_re.sub(r"\\\1", v) # escape " and \
v = '"%s"' % v
if k is None: # Netscape cookies may have no name
k = v
else:
k = "%s=%s" % (k, v)
attr.append(k)
if attr: headers.append("; ".join(attr))
return ", ".join(headers)
def parse_ns_headers(ns_headers):
"""Ad-hoc parser for Netscape protocol cookie-attributes.
The old Netscape cookie format for Set-Cookie can for instance contain
an unquoted "," in the expires field, so we have to use this ad-hoc
parser instead of split_header_words.
XXX This may not make the best possible effort to parse all the crap
that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
parser is probably better, so could do worse than following that if
this ever gives any trouble.
Currently, this is also used for parsing RFC 2109 cookies.
"""
known_attrs = ("expires", "domain", "path", "secure",
# RFC 2109 attrs (may turn up in Netscape cookies, too)
"port", "max-age")
result = []
for ns_header in ns_headers:
pairs = []
version_set = False
params = re.split(r";\s*", ns_header)
for ii in range(len(params)):
param = params[ii]
param = param.rstrip()
if param == "": continue
if "=" not in param:
k, v = param, None
else:
k, v = re.split(r"\s*=\s*", param, 1)
k = k.lstrip()
if ii != 0:
lc = k.lower()
if lc in known_attrs:
k = lc
if k == "version":
# This is an RFC 2109 cookie.
version_set = True
if k == "expires":
# convert expires date to seconds since epoch
if v.startswith('"'): v = v[1:]
if v.endswith('"'): v = v[:-1]
v = http2time(v) # None if invalid
pairs.append((k, v))
if pairs:
if not version_set:
pairs.append(("version", "0"))
result.append(pairs)
return result
def _test():
import doctest, _headersutil
return doctest.testmod(_headersutil)
if __name__ == "__main__":
_test()
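
A small illustrative sketch; the cookie values are made up and the private-module import path assumes the package lives at calibre.utils.mechanize, as the first diff above suggests.

from calibre.utils.mechanize._headersutil import split_header_words, parse_ns_headers

print split_header_words(['text/html; charset="iso-8859-1"'])
# [[('text/html', None), ('charset', 'iso-8859-1')]]
print parse_ns_headers(['foo=bar; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT'])
# expires is converted to seconds since the epoch and a version attribute is appended:
# [[('foo', 'bar'), ('path', '/'), ('expires', 942189160), ('version', '0')]]
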


@@ -0,0 +1,607 @@
"""HTML handling.
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import re, copy, htmlentitydefs
import sgmllib, HTMLParser, ClientForm
import _request
from _headersutil import split_header_words, is_html as _is_html
import _rfc3986
DEFAULT_ENCODING = "latin-1"
# the base class is purely for backwards compatibility
class ParseError(ClientForm.ParseError): pass
class CachingGeneratorFunction(object):
"""Caching wrapper around a no-arguments iterable."""
def __init__(self, iterable):
self._cache = []
# wrap iterable to make it non-restartable (otherwise, repeated
# __call__ would give incorrect results)
self._iterator = iter(iterable)
def __call__(self):
cache = self._cache
for item in cache:
yield item
for item in self._iterator:
cache.append(item)
yield item
class EncodingFinder:
def __init__(self, default_encoding):
self._default_encoding = default_encoding
def encoding(self, response):
# HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
# headers may be in the response. HTTP-EQUIV headers come last,
# so try in order from first to last.
for ct in response.info().getheaders("content-type"):
for k, v in split_header_words([ct])[0]:
if k == "charset":
return v
return self._default_encoding
class ResponseTypeFinder:
def __init__(self, allow_xhtml):
self._allow_xhtml = allow_xhtml
def is_html(self, response, encoding):
ct_hdrs = response.info().getheaders("content-type")
url = response.geturl()
# XXX encoding
return _is_html(ct_hdrs, url, self._allow_xhtml)
# idea for this argument-processing trick is from Peter Otten
class Args:
def __init__(self, args_map):
self.dictionary = dict(args_map)
def __getattr__(self, key):
try:
return self.dictionary[key]
except KeyError:
return getattr(self.__class__, key)
def form_parser_args(
select_default=False,
form_parser_class=None,
request_class=None,
backwards_compat=False,
):
return Args(locals())
class Link:
def __init__(self, base_url, url, text, tag, attrs):
assert None not in [url, tag, attrs]
self.base_url = base_url
self.absolute_url = _rfc3986.urljoin(base_url, url)
self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
def __cmp__(self, other):
try:
for name in "url", "text", "tag", "attrs":
if getattr(self, name) != getattr(other, name):
return -1
except AttributeError:
return -1
return 0
def __repr__(self):
return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
self.base_url, self.url, self.text, self.tag, self.attrs)
class LinksFactory:
def __init__(self,
link_parser_class=None,
link_class=Link,
urltags=None,
):
import _pullparser
if link_parser_class is None:
link_parser_class = _pullparser.TolerantPullParser
self.link_parser_class = link_parser_class
self.link_class = link_class
if urltags is None:
urltags = {
"a": "href",
"area": "href",
"frame": "src",
"iframe": "src",
}
self.urltags = urltags
self._response = None
self._encoding = None
def set_response(self, response, base_url, encoding):
self._response = response
self._encoding = encoding
self._base_url = base_url
def links(self):
"""Return an iterator that provides links of the document."""
response = self._response
encoding = self._encoding
base_url = self._base_url
p = self.link_parser_class(response, encoding=encoding)
try:
for token in p.tags(*(self.urltags.keys()+["base"])):
if token.type == "endtag":
continue
if token.data == "base":
base_href = dict(token.attrs).get("href")
if base_href is not None:
base_url = base_href
continue
attrs = dict(token.attrs)
tag = token.data
name = attrs.get("name")
text = None
# XXX use attr_encoding for ref'd doc if that doc does not
# provide one by other means
#attr_encoding = attrs.get("charset")
url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
if not url:
# Probably an <A NAME="blah"> link or <AREA NOHREF...>.
# For our purposes a link is something with a URL, so
# ignore this.
continue
url = _rfc3986.clean_url(url, encoding)
if tag == "a":
if token.type != "startendtag":
# hmm, this'd break if end tag is missing
text = p.get_compressed_text(("endtag", tag))
# but this doesn't work for eg.
# <a href="blah"><b>Andy</b></a>
#text = p.get_compressed_text()
yield Link(base_url, url, text, tag, token.attrs)
except sgmllib.SGMLParseError, exc:
raise ParseError(exc)
class FormsFactory:
"""Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
After calling .forms(), the .global_form attribute is a form object
containing all controls not a descendant of any FORM element.
For constructor argument docs, see ClientForm.ParseResponse
argument docs.
"""
def __init__(self,
select_default=False,
form_parser_class=None,
request_class=None,
backwards_compat=False,
):
import ClientForm
self.select_default = select_default
if form_parser_class is None:
form_parser_class = ClientForm.FormParser
self.form_parser_class = form_parser_class
if request_class is None:
request_class = _request.Request
self.request_class = request_class
self.backwards_compat = backwards_compat
self._response = None
self.encoding = None
self.global_form = None
def set_response(self, response, encoding):
self._response = response
self.encoding = encoding
self.global_form = None
def forms(self):
import ClientForm
encoding = self.encoding
try:
forms = ClientForm.ParseResponseEx(
self._response,
select_default=self.select_default,
form_parser_class=self.form_parser_class,
request_class=self.request_class,
encoding=encoding,
_urljoin=_rfc3986.urljoin,
_urlparse=_rfc3986.urlsplit,
_urlunparse=_rfc3986.urlunsplit,
)
except ClientForm.ParseError, exc:
raise ParseError(exc)
self.global_form = forms[0]
return forms[1:]
class TitleFactory:
def __init__(self):
self._response = self._encoding = None
def set_response(self, response, encoding):
self._response = response
self._encoding = encoding
def title(self):
import _pullparser
p = _pullparser.TolerantPullParser(
self._response, encoding=self._encoding)
try:
try:
p.get_tag("title")
except _pullparser.NoMoreTokensError:
return None
else:
return p.get_text()
except sgmllib.SGMLParseError, exc:
raise ParseError(exc)
def unescape(data, entities, encoding):
if data is None or "&" not in data:
return data
def replace_entities(match):
ent = match.group()
if ent[1] == "#":
return unescape_charref(ent[2:-1], encoding)
repl = entities.get(ent[1:-1])
if repl is not None:
repl = unichr(repl)
if type(repl) != type(""):
try:
repl = repl.encode(encoding)
except UnicodeError:
repl = ent
else:
repl = ent
return repl
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
def unescape_charref(data, encoding):
name, base = data, 10
if name.startswith("x"):
name, base= name[1:], 16
uc = unichr(int(name, base))
if encoding is None:
return uc
else:
try:
repl = uc.encode(encoding)
except UnicodeError:
repl = "&#%s;" % data
return repl
# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
_beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
)
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
import sgmllib
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
class MechanizeBs(_beautifulsoup.BeautifulSoup):
_entitydefs = htmlentitydefs.name2codepoint
# don't want the magic Microsoft-char workaround
PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
lambda(x):x.group(1) + ' />'),
(re.compile('<!\s+([^<>]*)>'),
lambda(x):'<!' + x.group(1) + '>')
]
def __init__(self, encoding, text=None, avoidParserProblems=True,
initialTextIsEverything=True):
self._encoding = encoding
_beautifulsoup.BeautifulSoup.__init__(
self, text, avoidParserProblems, initialTextIsEverything)
def handle_charref(self, ref):
t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
self.handle_data(t)
def handle_entityref(self, ref):
t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
self.handle_data(t)
def unescape_attrs(self, attrs):
escaped_attrs = []
for key, val in attrs:
val = unescape(val, self._entitydefs, self._encoding)
escaped_attrs.append((key, val))
return escaped_attrs
class RobustLinksFactory:
compress_re = re.compile(r"\s+")
def __init__(self,
link_parser_class=None,
link_class=Link,
urltags=None,
):
import _beautifulsoup
if link_parser_class is None:
link_parser_class = MechanizeBs
self.link_parser_class = link_parser_class
self.link_class = link_class
if urltags is None:
urltags = {
"a": "href",
"area": "href",
"frame": "src",
"iframe": "src",
}
self.urltags = urltags
self._bs = None
self._encoding = None
self._base_url = None
def set_soup(self, soup, base_url, encoding):
self._bs = soup
self._base_url = base_url
self._encoding = encoding
def links(self):
import _beautifulsoup
bs = self._bs
base_url = self._base_url
encoding = self._encoding
for ch in bs.recursiveChildGenerator():
if (isinstance(ch, _beautifulsoup.Tag) and
ch.name in self.urltags.keys()+["base"]):
link = ch
attrs = bs.unescape_attrs(link.attrs)
attrs_dict = dict(attrs)
if link.name == "base":
base_href = attrs_dict.get("href")
if base_href is not None:
base_url = base_href
continue
url_attr = self.urltags[link.name]
url = attrs_dict.get(url_attr)
if not url:
continue
url = _rfc3986.clean_url(url, encoding)
text = link.firstText(lambda t: True)
if text is _beautifulsoup.Null:
# follow _pullparser's weird behaviour rigidly
if link.name == "a":
text = ""
else:
text = None
else:
text = self.compress_re.sub(" ", text.strip())
yield Link(base_url, url, text, link.name, attrs)
class RobustFormsFactory(FormsFactory):
def __init__(self, *args, **kwds):
import ClientForm
args = form_parser_args(*args, **kwds)
if args.form_parser_class is None:
args.form_parser_class = RobustFormParser
FormsFactory.__init__(self, **args.dictionary)
def set_response(self, response, encoding):
self._response = response
self.encoding = encoding
class RobustTitleFactory:
def __init__(self):
self._bs = self._encoding = None
def set_soup(self, soup, encoding):
self._bs = soup
self._encoding = encoding
def title(self):
import _beautifulsoup
title = self._bs.first("title")
if title == _beautifulsoup.Null:
return None
else:
return title.firstText(lambda t: True)
class Factory:
"""Factory for forms, links, etc.
This interface may expand in future.
Public methods:
set_request_class(request_class)
set_response(response)
forms()
links()
Public attributes:
Note that accessing these attributes may raise ParseError.
encoding: string specifying the encoding of response if it contains a text
document (this value is left unspecified for documents that do not have
an encoding, e.g. an image file)
is_html: true if response contains an HTML document (XHTML may be
regarded as HTML too)
title: page title, or None if no title or not HTML
global_form: form object containing all controls that are not descendants
of any FORM element, or None if the forms_factory does not support
supplying a global form
"""
LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]
def __init__(self, forms_factory, links_factory, title_factory,
encoding_finder=EncodingFinder(DEFAULT_ENCODING),
response_type_finder=ResponseTypeFinder(allow_xhtml=False),
):
"""
Pass keyword arguments only.
default_encoding: character encoding to use if encoding cannot be
determined (or guessed) from the response. You should turn on
HTTP-EQUIV handling if you want the best chance of getting this right
without resorting to this default. The default value of this
parameter (currently latin-1) may change in future.
"""
self._forms_factory = forms_factory
self._links_factory = links_factory
self._title_factory = title_factory
self._encoding_finder = encoding_finder
self._response_type_finder = response_type_finder
self.set_response(None)
def set_request_class(self, request_class):
"""Set urllib2.Request class.
ClientForm.HTMLForm instances returned by .forms() will return
instances of this class when .click()ed.
"""
self._forms_factory.request_class = request_class
def set_response(self, response):
"""Set response.
The response must either be None or implement the same interface as
objects returned by urllib2.urlopen().
"""
self._response = response
self._forms_genf = self._links_genf = None
self._get_title = None
for name in self.LAZY_ATTRS:
try:
delattr(self, name)
except AttributeError:
pass
def __getattr__(self, name):
if name not in self.LAZY_ATTRS:
return getattr(self.__class__, name)
if name == "encoding":
self.encoding = self._encoding_finder.encoding(
copy.copy(self._response))
return self.encoding
elif name == "is_html":
self.is_html = self._response_type_finder.is_html(
copy.copy(self._response), self.encoding)
return self.is_html
elif name == "title":
if self.is_html:
self.title = self._title_factory.title()
else:
self.title = None
return self.title
elif name == "global_form":
self.forms()
return self.global_form
def forms(self):
"""Return iterable over ClientForm.HTMLForm-like objects.
Raises mechanize.ParseError on failure.
"""
# this implementation sets .global_form as a side-effect, for benefit
# of __getattr__ impl
if self._forms_genf is None:
try:
self._forms_genf = CachingGeneratorFunction(
self._forms_factory.forms())
except: # XXXX define exception!
self.set_response(self._response)
raise
self.global_form = getattr(
self._forms_factory, "global_form", None)
return self._forms_genf()
def links(self):
"""Return iterable over mechanize.Link-like objects.
Raises mechanize.ParseError on failure.
"""
if self._links_genf is None:
try:
self._links_genf = CachingGeneratorFunction(
self._links_factory.links())
except: # XXXX define exception!
self.set_response(self._response)
raise
return self._links_genf()
class DefaultFactory(Factory):
"""Based on sgmllib."""
def __init__(self, i_want_broken_xhtml_support=False):
Factory.__init__(
self,
forms_factory=FormsFactory(),
links_factory=LinksFactory(),
title_factory=TitleFactory(),
response_type_finder=ResponseTypeFinder(
allow_xhtml=i_want_broken_xhtml_support),
)
def set_response(self, response):
Factory.set_response(self, response)
if response is not None:
self._forms_factory.set_response(
copy.copy(response), self.encoding)
self._links_factory.set_response(
copy.copy(response), response.geturl(), self.encoding)
self._title_factory.set_response(
copy.copy(response), self.encoding)
class RobustFactory(Factory):
"""Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
DefaultFactory.
"""
def __init__(self, i_want_broken_xhtml_support=False,
soup_class=None):
Factory.__init__(
self,
forms_factory=RobustFormsFactory(),
links_factory=RobustLinksFactory(),
title_factory=RobustTitleFactory(),
response_type_finder=ResponseTypeFinder(
allow_xhtml=i_want_broken_xhtml_support),
)
if soup_class is None:
soup_class = MechanizeBs
self._soup_class = soup_class
def set_response(self, response):
import _beautifulsoup
Factory.set_response(self, response)
if response is not None:
data = response.read()
soup = self._soup_class(self.encoding, data)
self._forms_factory.set_response(
copy.copy(response), self.encoding)
self._links_factory.set_soup(
soup, response.geturl(), self.encoding)
self._title_factory.set_soup(soup, self.encoding)
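
A hedged end-to-end sketch (the URL is a placeholder): choosing the BeautifulSoup-backed RobustFactory when the HTML is too broken for the sgmllib-based DefaultFactory.

import mechanize

br = mechanize.Browser(factory=mechanize.RobustFactory())
br.set_handle_robots(False)
response = br.open('http://example.com/')
print br.title()                 # via RobustTitleFactory
for link in br.links():          # via RobustLinksFactory
    print link.absolute_url, link.text
for form in br.forms():          # via RobustFormsFactory
    print form
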


@@ -0,0 +1,729 @@
"""HTTP related handlers.
Note that some other HTTP handlers live in more specific modules: _auth.py,
_gzip.py, etc.
Copyright 2002-2006 John J Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import copy, time, tempfile, htmlentitydefs, re, logging, socket, \
urllib2, urllib, httplib, sgmllib
from urllib2 import URLError, HTTPError, BaseHandler
from cStringIO import StringIO
from _request import Request
from _util import isstringlike
from _response import closeable_response, response_seek_wrapper
from _html import unescape, unescape_charref
from _headersutil import is_html
from _clientcookie import CookieJar, request_host
import _rfc3986
debug = logging.getLogger("mechanize").debug
# monkeypatch urllib2.HTTPError to show URL
## def urllib2_str(self):
## return 'HTTP Error %s: %s (%s)' % (
## self.code, self.msg, self.geturl())
## urllib2.HTTPError.__str__ = urllib2_str
CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
DEFAULT_ENCODING = 'latin-1'
# This adds "refresh" to the list of redirectables and provides a redirection
# algorithm that doesn't go into a loop in the presence of cookies
# (Python 2.4 has this new algorithm, 2.3 doesn't).
class HTTPRedirectHandler(BaseHandler):
# maximum number of redirections to any single URL
# this is needed because of the state that cookies introduce
max_repeats = 4
# maximum total number of redirections (regardless of URL) before
# assuming we're in a loop
max_redirections = 10
# Implementation notes:
# To avoid the server sending us into an infinite loop, the request
# object needs to track what URLs we have already seen. Do this by
# adding a handler-specific attribute to the Request object. The value
# of the dict is used to count the number of times the same URL has
# been visited. This is needed because visiting the same URL twice
# does not necessarily imply a loop, thanks to state introduced by
# cookies.
# Always unhandled redirection codes:
# 300 Multiple Choices: should not handle this here.
# 304 Not Modified: no need to handle here: only of interest to caches
# that do conditional GETs
# 305 Use Proxy: probably not worth dealing with here
# 306 Unused: what was this for in the previous versions of protocol??
def redirect_request(self, newurl, req, fp, code, msg, headers):
"""Return a Request or None in response to a redirect.
This is called by the http_error_30x methods when a redirection
response is received. If a redirection should take place, return a
new Request to allow http_error_30x to perform the redirect;
otherwise, return None to indicate that an HTTPError should be
raised.
"""
if code in (301, 302, 303, "refresh") or \
(code == 307 and not req.has_data()):
# Strictly (according to RFC 2616), 301 or 302 in response to
# a POST MUST NOT cause a redirection without confirmation
# from the user (of urllib2, in this case). In practice,
# essentially all clients do redirect in this case, so we do
# the same.
# XXX really refresh redirections should be visiting; tricky to
# fix, so this will wait until post-stable release
new = Request(newurl,
headers=req.headers,
origin_req_host=req.get_origin_req_host(),
unverifiable=True,
visit=False,
)
new._origin_req = getattr(req, "_origin_req", req)
return new
else:
raise HTTPError(req.get_full_url(), code, msg, headers, fp)
def http_error_302(self, req, fp, code, msg, headers):
# Some servers (incorrectly) return multiple Location headers
# (so probably same goes for URI). Use first header.
if headers.has_key('location'):
newurl = headers.getheaders('location')[0]
elif headers.has_key('uri'):
newurl = headers.getheaders('uri')[0]
else:
return
newurl = _rfc3986.clean_url(newurl, "latin-1")
newurl = _rfc3986.urljoin(req.get_full_url(), newurl)
# XXX Probably want to forget about the state of the current
# request, although that might interact poorly with other
# handlers that also use handler-specific request attributes
new = self.redirect_request(newurl, req, fp, code, msg, headers)
if new is None:
return
# loop detection
# .redirect_dict has a key url if url was previously visited.
if hasattr(req, 'redirect_dict'):
visited = new.redirect_dict = req.redirect_dict
if (visited.get(newurl, 0) >= self.max_repeats or
len(visited) >= self.max_redirections):
raise HTTPError(req.get_full_url(), code,
self.inf_msg + msg, headers, fp)
else:
visited = new.redirect_dict = req.redirect_dict = {}
visited[newurl] = visited.get(newurl, 0) + 1
# Don't close the fp until we are sure that we won't use it
# with HTTPError.
fp.read()
fp.close()
return self.parent.open(new)
http_error_301 = http_error_303 = http_error_307 = http_error_302
http_error_refresh = http_error_302
inf_msg = "The HTTP server returned a redirect error that would " \
"lead to an infinite loop.\n" \
"The last 30x error message was:\n"
# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception): pass
class AbstractHeadParser:
# only these elements are allowed in or before HEAD of document
head_elems = ("html", "head",
"title", "base",
"script", "style", "meta", "link", "object")
_entitydefs = htmlentitydefs.name2codepoint
_encoding = DEFAULT_ENCODING
def __init__(self):
self.http_equiv = []
def start_meta(self, attrs):
http_equiv = content = None
for key, value in attrs:
if key == "http-equiv":
http_equiv = self.unescape_attr_if_required(value)
elif key == "content":
content = self.unescape_attr_if_required(value)
if http_equiv is not None and content is not None:
self.http_equiv.append((http_equiv, content))
def end_head(self):
raise EndOfHeadError()
def handle_entityref(self, name):
#debug("%s", name)
self.handle_data(unescape(
'&%s;' % name, self._entitydefs, self._encoding))
def handle_charref(self, name):
#debug("%s", name)
self.handle_data(unescape_charref(name, self._encoding))
def unescape_attr(self, name):
#debug("%s", name)
return unescape(name, self._entitydefs, self._encoding)
def unescape_attrs(self, attrs):
#debug("%s", attrs)
escaped_attrs = {}
for key, val in attrs.items():
escaped_attrs[key] = self.unescape_attr(val)
return escaped_attrs
def unknown_entityref(self, ref):
self.handle_data("&%s;" % ref)
def unknown_charref(self, ref):
self.handle_data("&#%s;" % ref)
try:
import HTMLParser
except ImportError:
pass
else:
class XHTMLCompatibleHeadParser(AbstractHeadParser,
HTMLParser.HTMLParser):
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
AbstractHeadParser.__init__(self)
def handle_starttag(self, tag, attrs):
if tag not in self.head_elems:
raise EndOfHeadError()
try:
method = getattr(self, 'start_' + tag)
except AttributeError:
try:
method = getattr(self, 'do_' + tag)
except AttributeError:
pass # unknown tag
else:
method(attrs)
else:
method(attrs)
def handle_endtag(self, tag):
if tag not in self.head_elems:
raise EndOfHeadError()
try:
method = getattr(self, 'end_' + tag)
except AttributeError:
pass # unknown tag
else:
method()
def unescape(self, name):
# Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs.
return self.unescape_attr(name)
def unescape_attr_if_required(self, name):
return name # HTMLParser.HTMLParser already did it
class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
def _not_called(self):
assert False
def __init__(self):
sgmllib.SGMLParser.__init__(self)
AbstractHeadParser.__init__(self)
def handle_starttag(self, tag, method, attrs):
if tag not in self.head_elems:
raise EndOfHeadError()
if tag == "meta":
method(attrs)
def unknown_starttag(self, tag, attrs):
self.handle_starttag(tag, self._not_called, attrs)
def handle_endtag(self, tag, method):
if tag in self.head_elems:
method()
else:
raise EndOfHeadError()
def unescape_attr_if_required(self, name):
return self.unescape_attr(name)
def parse_head(fileobj, parser):
"""Return a list of key, value pairs."""
while 1:
data = fileobj.read(CHUNK)
try:
parser.feed(data)
except EndOfHeadError:
break
if len(data) != CHUNK:
# this should only happen if there is no HTML body, or if
# CHUNK is big
break
return parser.http_equiv
class HTTPEquivProcessor(BaseHandler):
"""Append META HTTP-EQUIV headers to regular HTTP headers."""
handler_order = 300 # before handlers that look at HTTP headers
def __init__(self, head_parser_class=HeadParser,
i_want_broken_xhtml_support=False,
):
self.head_parser_class = head_parser_class
self._allow_xhtml = i_want_broken_xhtml_support
def http_response(self, request, response):
if not hasattr(response, "seek"):
response = response_seek_wrapper(response)
http_message = response.info()
url = response.geturl()
ct_hdrs = http_message.getheaders("content-type")
if is_html(ct_hdrs, url, self._allow_xhtml):
try:
try:
html_headers = parse_head(response, self.head_parser_class())
finally:
response.seek(0)
except (HTMLParser.HTMLParseError,
sgmllib.SGMLParseError):
pass
else:
for hdr, val in html_headers:
# add a header
http_message.dict[hdr.lower()] = val
text = hdr + ": " + val
for line in text.split("\n"):
http_message.headers.append(line + "\n")
return response
https_response = http_response
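
A hedged sketch, assuming the Browser toggle set_handle_equiv() installs this processor; the URL is a placeholder.

import mechanize

br = mechanize.Browser()
br.set_handle_equiv(True)   # META HTTP-EQUIV values get folded into response.info()
response = br.open('http://example.com/')
print response.info().getheaders('refresh')
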
class HTTPCookieProcessor(BaseHandler):
"""Handle HTTP cookies.
Public attributes:
cookiejar: CookieJar instance
"""
def __init__(self, cookiejar=None):
if cookiejar is None:
cookiejar = CookieJar()
self.cookiejar = cookiejar
def http_request(self, request):
self.cookiejar.add_cookie_header(request)
return request
def http_response(self, request, response):
self.cookiejar.extract_cookies(response, request)
return response
https_request = http_request
https_response = http_response
try:
import robotparser
except ImportError:
pass
else:
class MechanizeRobotFileParser(robotparser.RobotFileParser):
def __init__(self, url='', opener=None):
import _opener
robotparser.RobotFileParser.__init__(self, url)
self._opener = opener
def set_opener(self, opener=None):
if opener is None:
opener = _opener.OpenerDirector()
self._opener = opener
def read(self):
"""Reads the robots.txt URL and feeds it to the parser."""
if self._opener is None:
self.set_opener()
req = Request(self.url, unverifiable=True, visit=False)
try:
f = self._opener.open(req)
except HTTPError, f:
pass
except (IOError, socket.error, OSError), exc:
robotparser._debug("ignoring error opening %r: %s" %
(self.url, exc))
return
lines = []
line = f.readline()
while line:
lines.append(line.strip())
line = f.readline()
status = f.code
if status == 401 or status == 403:
self.disallow_all = True
robotparser._debug("disallow all")
elif status >= 400:
self.allow_all = True
robotparser._debug("allow all")
elif status == 200 and lines:
robotparser._debug("parse lines")
self.parse(lines)
class RobotExclusionError(urllib2.HTTPError):
def __init__(self, request, *args):
apply(urllib2.HTTPError.__init__, (self,)+args)
self.request = request
class HTTPRobotRulesProcessor(BaseHandler):
# before redirections, after everything else
handler_order = 800
try:
from httplib import HTTPMessage
except:
from mimetools import Message
http_response_class = Message
else:
http_response_class = HTTPMessage
def __init__(self, rfp_class=MechanizeRobotFileParser):
self.rfp_class = rfp_class
self.rfp = None
self._host = None
def http_request(self, request):
scheme = request.get_type()
if scheme not in ["http", "https"]:
# robots exclusion only applies to HTTP
return request
if request.get_selector() == "/robots.txt":
# /robots.txt is always OK to fetch
return request
host = request.get_host()
# robots.txt requests don't need to be allowed by robots.txt :-)
origin_req = getattr(request, "_origin_req", None)
if (origin_req is not None and
origin_req.get_selector() == "/robots.txt" and
origin_req.get_host() == host
):
return request
if host != self._host:
self.rfp = self.rfp_class()
try:
self.rfp.set_opener(self.parent)
except AttributeError:
debug("%r instance does not support set_opener" %
self.rfp.__class__)
self.rfp.set_url(scheme+"://"+host+"/robots.txt")
self.rfp.read()
self._host = host
ua = request.get_header("User-agent", "")
if self.rfp.can_fetch(ua, request.get_full_url()):
return request
else:
# XXX This should really have raised URLError. Too late now...
msg = "request disallowed by robots.txt"
raise RobotExclusionError(
request,
request.get_full_url(),
403, msg,
self.http_response_class(StringIO()), StringIO(msg))
https_request = http_request
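
A hedged sketch of what this processor means for callers; the URL is a placeholder.

import mechanize

br = mechanize.Browser()
try:
    br.open('http://example.com/private/')
except mechanize.RobotExclusionError:
    # robots.txt disallowed the fetch; for a user-driven download it can be
    # acceptable to retry with robots handling switched off.
    br.set_handle_robots(False)
    br.open('http://example.com/private/')
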
class HTTPRefererProcessor(BaseHandler):
"""Add Referer header to requests.
This only makes sense if you use each RefererProcessor for a single
chain of requests only (so, for example, if you use a single
HTTPRefererProcessor to fetch a series of URLs extracted from a single
page, this will break).
There's a proper implementation of this in mechanize.Browser.
"""
def __init__(self):
self.referer = None
def http_request(self, request):
if ((self.referer is not None) and
not request.has_header("Referer")):
request.add_unredirected_header("Referer", self.referer)
return request
def http_response(self, request, response):
self.referer = response.geturl()
return response
https_request = http_request
https_response = http_response
def clean_refresh_url(url):
# e.g. Firefox 1.5 does (something like) this
if ((url.startswith('"') and url.endswith('"')) or
(url.startswith("'") and url.endswith("'"))):
url = url[1:-1]
return _rfc3986.clean_url(url, "latin-1") # XXX encoding
def parse_refresh_header(refresh):
"""
>>> parse_refresh_header("1; url=http://example.com/")
(1.0, 'http://example.com/')
>>> parse_refresh_header("1; url='http://example.com/'")
(1.0, 'http://example.com/')
>>> parse_refresh_header("1")
(1.0, None)
>>> parse_refresh_header("blah")
Traceback (most recent call last):
ValueError: invalid literal for float(): blah
"""
ii = refresh.find(";")
if ii != -1:
pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
jj = newurl_spec.find("=")
key = None
if jj != -1:
key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
newurl = clean_refresh_url(newurl)
if key is None or key.strip().lower() != "url":
raise ValueError()
else:
pause, newurl = float(refresh), None
return pause, newurl
class HTTPRefreshProcessor(BaseHandler):
"""Perform HTTP Refresh redirections.
Note that if a non-200 HTTP code has occurred (for example, a 30x
redirect), this processor will do nothing.
By default, only zero-time Refresh headers are redirected. Use the
max_time attribute / constructor argument to allow Refresh with longer
pauses. Use the honor_time attribute / constructor argument to control
whether the requested pause is honoured (with a time.sleep()) or
skipped in favour of immediate redirection.
Public attributes:
max_time: see above
honor_time: see above
"""
handler_order = 1000
def __init__(self, max_time=0, honor_time=True):
self.max_time = max_time
self.honor_time = honor_time
def http_response(self, request, response):
code, msg, hdrs = response.code, response.msg, response.info()
if code == 200 and hdrs.has_key("refresh"):
refresh = hdrs.getheaders("refresh")[0]
try:
pause, newurl = parse_refresh_header(refresh)
except ValueError:
debug("bad Refresh header: %r" % refresh)
return response
if newurl is None:
newurl = response.geturl()
if (self.max_time is None) or (pause <= self.max_time):
if pause > 1E-3 and self.honor_time:
time.sleep(pause)
hdrs["location"] = newurl
# hardcoded http is NOT a bug
response = self.parent.error(
"http", request, response,
"refresh", msg, hdrs)
return response
https_response = http_response
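# Illustrative sketch (assumed usage; the URL is a placeholder).  With the
# defaults only zero-time refreshes are followed; passing max_time=None and
# honor_time=False follows any Refresh header immediately without sleeping.
def _example_follow_refresh():
    import mechanize
    opener = mechanize.build_opener(
        mechanize.HTTPRefreshProcessor(max_time=None, honor_time=False))
    return opener.open("http://example.com/refreshing-page")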
class HTTPErrorProcessor(BaseHandler):
"""Process HTTP error responses.
The purpose of this handler is to allow other response processors a
look-in by removing the call to parent.error() from
AbstractHTTPHandler.
For non-200 error codes, this just passes the job on to the
Handler.<proto>_error_<code> methods, via the OpenerDirector.error
method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an
HTTPError if no other handler handles the error.
"""
handler_order = 1000 # after all other processors
def http_response(self, request, response):
code, msg, hdrs = response.code, response.msg, response.info()
if code != 200:
# hardcoded http is NOT a bug
response = self.parent.error(
"http", request, response, code, msg, hdrs)
return response
https_response = http_response
class HTTPDefaultErrorHandler(BaseHandler):
def http_error_default(self, req, fp, code, msg, hdrs):
# why these error methods took the code, msg, headers args in the first
# place rather than a response object, I don't know, but to avoid
# multiple wrapping, we're discarding them
if isinstance(fp, urllib2.HTTPError):
response = fp
else:
response = urllib2.HTTPError(
req.get_full_url(), code, msg, hdrs, fp)
assert code == response.code
assert msg == response.msg
assert hdrs == response.hdrs
raise response
class AbstractHTTPHandler(BaseHandler):
def __init__(self, debuglevel=0):
self._debuglevel = debuglevel
def set_http_debuglevel(self, level):
self._debuglevel = level
def do_request_(self, request):
host = request.get_host()
if not host:
raise URLError('no host given')
if request.has_data(): # POST
data = request.get_data()
if not request.has_header('Content-type'):
request.add_unredirected_header(
'Content-type',
'application/x-www-form-urlencoded')
scheme, sel = urllib.splittype(request.get_selector())
sel_host, sel_path = urllib.splithost(sel)
if not request.has_header('Host'):
request.add_unredirected_header('Host', sel_host or host)
for name, value in self.parent.addheaders:
name = name.capitalize()
if not request.has_header(name):
request.add_unredirected_header(name, value)
return request
def do_open(self, http_class, req):
"""Return an addinfourl object for the request, using http_class.
http_class must implement the HTTPConnection API from httplib.
The addinfourl return value is a file-like object. It also
has methods and attributes including:
- info(): return a mimetools.Message object for the headers
- geturl(): return the original request URL
- code: HTTP status code
"""
host = req.get_host()
if not host:
raise URLError('no host given')
h = http_class(host) # will parse host:port
h.set_debuglevel(self._debuglevel)
headers = dict(req.headers)
headers.update(req.unredirected_hdrs)
# We want to make an HTTP/1.1 request, but the addinfourl
# class isn't prepared to deal with a persistent connection.
# It will try to read all remaining data from the socket,
# which will block while the server waits for the next request.
# So make sure the connection gets closed after the (only)
# request.
headers["Connection"] = "close"
headers = dict(
[(name.title(), val) for name, val in headers.items()])
try:
h.request(req.get_method(), req.get_selector(), req.data, headers)
r = h.getresponse()
except socket.error, err: # XXX what error?
raise URLError(err)
# Pick apart the HTTPResponse object to get the addinfourl
# object initialized properly.
# Wrap the HTTPResponse object in socket's file object adapter
# for Windows. That adapter calls recv(), so delegate recv()
# to read(). This weird wrapping allows the returned object to
# have readline() and readlines() methods.
# XXX It might be better to extract the read buffering code
# out of socket._fileobject() and into a base class.
r.recv = r.read
fp = socket._fileobject(r)
resp = closeable_response(fp, r.msg, req.get_full_url(),
r.status, r.reason)
return resp
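# Illustrative sketch (assumed usage; the URL is a placeholder).  The value
# returned by do_open() behaves like the file-like object described in its
# docstring: read(), info(), geturl() and .code are available, and .close()
# really releases the connection (the "Connection: close" header above keeps
# the server from holding it open).
def _example_response_object():
    import mechanize
    response = mechanize.urlopen("http://example.com/")
    try:
        body = response.read()
        print response.code, response.geturl()
        print response.info().getheader("Content-Type")
    finally:
        response.close()
    return body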
class HTTPHandler(AbstractHTTPHandler):
def http_open(self, req):
return self.do_open(httplib.HTTPConnection, req)
http_request = AbstractHTTPHandler.do_request_
if hasattr(httplib, 'HTTPS'):
class HTTPSConnectionFactory:
def __init__(self, key_file, cert_file):
self._key_file = key_file
self._cert_file = cert_file
def __call__(self, hostport):
return httplib.HTTPSConnection(
hostport,
key_file=self._key_file, cert_file=self._cert_file)
class HTTPSHandler(AbstractHTTPHandler):
def __init__(self, client_cert_manager=None):
AbstractHTTPHandler.__init__(self)
self.client_cert_manager = client_cert_manager
def https_open(self, req):
if self.client_cert_manager is not None:
key_file, cert_file = self.client_cert_manager.find_key_cert(
req.get_full_url())
conn_factory = HTTPSConnectionFactory(key_file, cert_file)
else:
conn_factory = httplib.HTTPSConnection
return self.do_open(conn_factory, req)
https_request = AbstractHTTPHandler.do_request_

View File

@@ -0,0 +1,185 @@
"""Load / save to libwww-perl (LWP) format files.
Actually, the format is slightly extended from that used by LWP's
(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information
not recorded by LWP.
It uses the version string "2.0", though really there isn't an LWP Cookies
2.0 format. This indicates that there is extra information in here
(domain_dot and port_spec) while still being compatible with libwww-perl,
I hope.
Copyright 2002-2006 John J Lee <jjl@pobox.com>
Copyright 1997-1999 Gisle Aas (original libwww-perl code)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import time, re, logging
from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
MISSING_FILENAME_TEXT, LoadError
from _headersutil import join_header_words, split_header_words
from _util import iso2time, time2isoz
debug = logging.getLogger("mechanize").debug
def lwp_cookie_str(cookie):
"""Return string representation of Cookie in an the LWP cookie file format.
Actually, the format is extended a bit -- see module docstring.
"""
h = [(cookie.name, cookie.value),
("path", cookie.path),
("domain", cookie.domain)]
if cookie.port is not None: h.append(("port", cookie.port))
if cookie.path_specified: h.append(("path_spec", None))
if cookie.port_specified: h.append(("port_spec", None))
if cookie.domain_initial_dot: h.append(("domain_dot", None))
if cookie.secure: h.append(("secure", None))
if cookie.expires: h.append(("expires",
time2isoz(float(cookie.expires))))
if cookie.discard: h.append(("discard", None))
if cookie.comment: h.append(("comment", cookie.comment))
if cookie.comment_url: h.append(("commenturl", cookie.comment_url))
if cookie.rfc2109: h.append(("rfc2109", None))
keys = cookie.nonstandard_attr_keys()
keys.sort()
for k in keys:
h.append((k, str(cookie.get_nonstandard_attr(k))))
h.append(("version", str(cookie.version)))
return join_header_words([h])
class LWPCookieJar(FileCookieJar):
"""
The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
"Set-Cookie3" is the format used by the libwww-perl library, not known
to be compatible with any browser, but which is easy to read and
doesn't lose information about RFC 2965 cookies.
Additional methods
as_lwp_str(ignore_discard=True, ignore_expired=True)
"""
magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
"""Return cookies as a string of "\n"-separated "Set-Cookie3" headers.
ignore_discard and ignore_expires: see docstring for FileCookieJar.save
"""
now = time.time()
r = []
for cookie in self:
if not ignore_discard and cookie.discard:
debug(" Not saving %s: marked for discard", cookie.name)
continue
if not ignore_expires and cookie.is_expired(now):
debug(" Not saving %s: expired", cookie.name)
continue
r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
return "\n".join(r+[""])
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
if filename is None:
if self.filename is not None: filename = self.filename
else: raise ValueError(MISSING_FILENAME_TEXT)
f = open(filename, "w")
try:
debug("Saving LWP cookies file")
# There really isn't an LWP Cookies 2.0 format, but this indicates
# that there is extra information in here (domain_dot and
# port_spec) while still being compatible with libwww-perl, I hope.
f.write("#LWP-Cookies-2.0\n")
f.write(self.as_lwp_str(ignore_discard, ignore_expires))
finally:
f.close()
def _really_load(self, f, filename, ignore_discard, ignore_expires):
magic = f.readline()
if not re.search(self.magic_re, magic):
msg = "%s does not seem to contain cookies" % filename
raise LoadError(msg)
now = time.time()
header = "Set-Cookie3:"
boolean_attrs = ("port_spec", "path_spec", "domain_dot",
"secure", "discard", "rfc2109")
value_attrs = ("version",
"port", "path", "domain",
"expires",
"comment", "commenturl")
try:
while 1:
line = f.readline()
if line == "": break
if not line.startswith(header):
continue
line = line[len(header):].strip()
for data in split_header_words([line]):
name, value = data[0]
standard = {}
rest = {}
for k in boolean_attrs:
standard[k] = False
for k, v in data[1:]:
if k is not None:
lc = k.lower()
else:
lc = None
# don't lose case distinction for unknown fields
if (lc in value_attrs) or (lc in boolean_attrs):
k = lc
if k in boolean_attrs:
if v is None: v = True
standard[k] = v
elif k in value_attrs:
standard[k] = v
else:
rest[k] = v
h = standard.get
expires = h("expires")
discard = h("discard")
if expires is not None:
expires = iso2time(expires)
if expires is None:
discard = True
domain = h("domain")
domain_specified = domain.startswith(".")
c = Cookie(h("version"), name, value,
h("port"), h("port_spec"),
domain, domain_specified, h("domain_dot"),
h("path"), h("path_spec"),
h("secure"),
expires,
discard,
h("comment"),
h("commenturl"),
rest,
h("rfc2109"),
)
if not ignore_discard and c.discard:
continue
if not ignore_expires and c.is_expired(now):
continue
self.set_cookie(c)
except:
reraise_unmasked_exceptions((IOError,))
raise LoadError("invalid Set-Cookie3 format file %s" % filename)
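# Illustrative sketch (assumed usage; file name and URL are placeholders):
# attach an LWPCookieJar to a Browser, reload any previously saved cookies,
# and write them back out in Set-Cookie3 format afterwards.
def _example_lwp_cookie_round_trip():
    import os
    import mechanize
    cj = mechanize.LWPCookieJar("cookies.lwp")
    if os.path.exists("cookies.lwp"):
        cj.load(ignore_discard=True, ignore_expires=True)
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.open("http://example.com/")
    cj.save(ignore_discard=True, ignore_expires=True)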

View File

@@ -0,0 +1,656 @@
"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize.
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
Copyright 2003 Andy Lester (original Perl code)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import urllib2, sys, copy, re
from _useragent import UserAgentBase
from _html import DefaultFactory
import _response
import _request
import _rfc3986
__version__ = (0, 1, 7, "b", None) # 0.1.7b
class BrowserStateError(Exception): pass
class LinkNotFoundError(Exception): pass
class FormNotFoundError(Exception): pass
class History:
"""
Though this will become public, the implied interface is not yet stable.
"""
def __init__(self):
self._history = [] # LIFO
def add(self, request, response):
self._history.append((request, response))
def back(self, n, _response):
response = _response # XXX move Browser._response into this class?
while n > 0 or response is None:
try:
request, response = self._history.pop()
except IndexError:
raise BrowserStateError("already at start of history")
n -= 1
return request, response
def clear(self):
del self._history[:]
def close(self):
for request, response in self._history:
if response is not None:
response.close()
del self._history[:]
class HTTPRefererProcessor(urllib2.BaseHandler):
def http_request(self, request):
# See RFC 2616 14.36. The only times we know the source of the
# request URI has a URI associated with it are redirect, and
# Browser.click() / Browser.submit() / Browser.follow_link().
# Otherwise, it's the user's job to add any Referer header before
# .open()ing.
if hasattr(request, "redirect_dict"):
request = self.parent._add_referer_header(
request, origin_request=False)
return request
https_request = http_request
class Browser(UserAgentBase):
"""Browser-like class with support for history, forms and links.
BrowserStateError is raised whenever the browser is in the wrong state to
complete the requested operation - eg., when .back() is called when the
browser history is empty, or when .follow_link() is called when the current
response does not contain HTML data.
Public attributes:
request: current request (mechanize.Request or urllib2.Request)
form: currently selected form (see .select_form())
"""
handler_classes = copy.copy(UserAgentBase.handler_classes)
handler_classes["_referer"] = HTTPRefererProcessor
default_features = copy.copy(UserAgentBase.default_features)
default_features.append("_referer")
def __init__(self,
factory=None,
history=None,
request_class=None,
):
"""
Only named arguments should be passed to this constructor.
factory: object implementing the mechanize.Factory interface.
history: object implementing the mechanize.History interface. Note
this interface is still experimental and may change in future.
request_class: Request class to use. Defaults to mechanize.Request
for Pythons older than 2.4, urllib2.Request otherwise.
The Factory and History objects passed in are 'owned' by the Browser,
so they should not be shared across Browsers. In particular,
factory.set_response() should not be called except by the owning
Browser itself.
Note that the supplied factory's request_class is overridden by this
constructor, to ensure only one Request class is used.
"""
self._handle_referer = True
if history is None:
history = History()
self._history = history
if request_class is None:
if not hasattr(urllib2.Request, "add_unredirected_header"):
request_class = _request.Request
else:
request_class = urllib2.Request # Python >= 2.4
if factory is None:
factory = DefaultFactory()
factory.set_request_class(request_class)
self._factory = factory
self.request_class = request_class
self.request = None
self._set_response(None, False)
# do this last to avoid __getattr__ problems
UserAgentBase.__init__(self)
def close(self):
UserAgentBase.close(self)
if self._response is not None:
self._response.close()
if self._history is not None:
self._history.close()
self._history = None
# make use after .close easy to spot
self.form = None
self.request = self._response = None
self.request = self.response = self.set_response = None
self.geturl = self.reload = self.back = None
self.clear_history = self.set_cookie = self.links = self.forms = None
self.viewing_html = self.encoding = self.title = None
self.select_form = self.click = self.submit = self.click_link = None
self.follow_link = self.find_link = None
def set_handle_referer(self, handle):
"""Set whether to add Referer header to each request.
This base class does not implement this feature (so don't turn this on
if you're using this base class directly), but the subclass
mechanize.Browser does.
"""
self._set_handler("_referer", handle)
self._handle_referer = bool(handle)
def _add_referer_header(self, request, origin_request=True):
if self.request is None:
return request
scheme = request.get_type()
original_scheme = self.request.get_type()
if scheme not in ["http", "https"]:
return request
if not origin_request and not self.request.has_header("Referer"):
return request
if (self._handle_referer and
original_scheme in ["http", "https"] and
not (original_scheme == "https" and scheme != "https")):
# strip URL fragment (RFC 2616 14.36)
parts = _rfc3986.urlsplit(self.request.get_full_url())
parts = parts[:-1]+(None,)
referer = _rfc3986.urlunsplit(parts)
request.add_unredirected_header("Referer", referer)
return request
def open_novisit(self, url, data=None):
"""Open a URL without visiting it.
The browser state (including .request, .response(), history, forms and
links) is left unchanged by calling this function.
The interface is the same as for .open().
This is useful for things like fetching images.
See also .retrieve().
"""
return self._mech_open(url, data, visit=False)
def open(self, url, data=None):
return self._mech_open(url, data)
def _mech_open(self, url, data=None, update_history=True, visit=None):
try:
url.get_full_url
except AttributeError:
# string URL -- convert to absolute URL if required
scheme, authority = _rfc3986.urlsplit(url)[:2]
if scheme is None:
# relative URL
if self._response is None:
raise BrowserStateError(
"can't fetch relative reference: "
"not viewing any document")
url = _rfc3986.urljoin(self._response.geturl(), url)
request = self._request(url, data, visit)
visit = request.visit
if visit is None:
visit = True
if visit:
self._visit_request(request, update_history)
success = True
try:
response = UserAgentBase.open(self, request, data)
except urllib2.HTTPError, error:
success = False
if error.fp is None: # not a response
raise
response = error
## except (IOError, socket.error, OSError), error:
## # Yes, urllib2 really does raise all these :-((
## # See test_urllib2.py for examples of socket.gaierror and OSError,
## # plus note that FTPHandler raises IOError.
## # XXX I don't seem to have an example of exactly socket.error being
## # raised, only socket.gaierror...
## # I don't want to start fixing these here, though, since this is a
## # subclass of OpenerDirector, and it would break old code. Even in
## # Python core, a fix would need some backwards-compat. hack to be
## # acceptable.
## raise
if visit:
self._set_response(response, False)
response = copy.copy(self._response)
elif response is not None:
response = _response.upgrade_response(response)
if not success:
raise response
return response
def __str__(self):
text = []
text.append("<%s " % self.__class__.__name__)
if self._response:
text.append("visiting %s" % self._response.geturl())
else:
text.append("(not visiting a URL)")
if self.form:
text.append("\n selected form:\n %s\n" % str(self.form))
text.append(">")
return "".join(text)
def response(self):
"""Return a copy of the current response.
The returned object has the same interface as the object returned by
.open() (or urllib2.urlopen()).
"""
return copy.copy(self._response)
def set_response(self, response):
"""Replace current response with (a copy of) response.
response may be None.
This is intended mostly for HTML-preprocessing.
"""
self._set_response(response, True)
def _set_response(self, response, close_current):
# sanity check, necessary but far from sufficient
if not (response is None or
(hasattr(response, "info") and hasattr(response, "geturl") and
hasattr(response, "read")
)
):
raise ValueError("not a response object")
self.form = None
if response is not None:
response = _response.upgrade_response(response)
if close_current and self._response is not None:
self._response.close()
self._response = response
self._factory.set_response(response)
def visit_response(self, response, request=None):
"""Visit the response, as if it had been .open()ed.
Unlike .set_response(), this updates history rather than replacing the
current response.
"""
if request is None:
request = _request.Request(response.geturl())
self._visit_request(request, True)
self._set_response(response, False)
def _visit_request(self, request, update_history):
if self._response is not None:
self._response.close()
if self.request is not None and update_history:
self._history.add(self.request, self._response)
self._response = None
# we want self.request to be assigned even if UserAgentBase.open
# fails
self.request = request
def geturl(self):
"""Get URL of current document."""
if self._response is None:
raise BrowserStateError("not viewing any document")
return self._response.geturl()
def reload(self):
"""Reload current document, and return response object."""
if self.request is None:
raise BrowserStateError("no URL has yet been .open()ed")
if self._response is not None:
self._response.close()
return self._mech_open(self.request, update_history=False)
def back(self, n=1):
"""Go back n steps in history, and return response object.
n: go back this number of steps (default 1 step)
"""
if self._response is not None:
self._response.close()
self.request, response = self._history.back(n, self._response)
self.set_response(response)
if not response.read_complete:
return self.reload()
return copy.copy(response)
def clear_history(self):
self._history.clear()
def set_cookie(self, cookie_string):
"""Request to set a cookie.
Note that it is NOT necessary to call this method under ordinary
circumstances: cookie handling is normally entirely automatic. The
intended use case is rather to simulate the setting of a cookie by
client script in a web page (e.g. JavaScript). In that case, use of
this method is necessary because mechanize currently does not support
JavaScript, VBScript, etc.
The cookie is added in the same way as if it had arrived with the
current response, as a result of the current request. This means that,
for example, if it is not appropriate to set the cookie based on the
current request, no cookie will be set.
The cookie will be returned automatically with subsequent responses
made by the Browser instance whenever that's appropriate.
cookie_string should be a valid value of the Set-Cookie header.
For example:
browser.set_cookie(
"sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")
Currently, this method does not allow for adding RFC 2965 cookies.
This limitation will be lifted if anybody requests it.
"""
if self._response is None:
raise BrowserStateError("not viewing any document")
if self.request.get_type() not in ["http", "https"]:
raise BrowserStateError("can't set cookie for non-HTTP/HTTPS "
"transactions")
cookiejar = self._ua_handlers["_cookies"].cookiejar
response = self.response() # copy
headers = response.info()
headers["Set-cookie"] = cookie_string
cookiejar.extract_cookies(response, self.request)
def links(self, **kwds):
"""Return iterable over links (mechanize.Link objects)."""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
links = self._factory.links()
if kwds:
return self._filter_links(links, **kwds)
else:
return links
def forms(self):
"""Return iterable over forms.
The returned form objects implement the ClientForm.HTMLForm interface.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
return self._factory.forms()
def global_form(self):
"""Return the global form object, or None if the factory implementation
did not supply one.
The "global" form object contains all controls that are not descendants of
any FORM element.
The returned form object implements the ClientForm.HTMLForm interface.
This is a separate method since the global form is not regarded as part
of the sequence of forms in the document -- mostly for
backwards-compatibility.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
return self._factory.global_form
def viewing_html(self):
"""Return whether the current response contains HTML data."""
if self._response is None:
raise BrowserStateError("not viewing any document")
return self._factory.is_html
def encoding(self):
""""""
if self._response is None:
raise BrowserStateError("not viewing any document")
return self._factory.encoding
def title(self):
"""Return title, or None if there is no title element in the document.
Tags are stripped or textified as described in docs for
PullParser.get_text() method of pullparser module.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
return self._factory.title
def select_form(self, name=None, predicate=None, nr=None):
"""Select an HTML form for input.
This is a bit like giving a form the "input focus" in a browser.
If a form is selected, the Browser object supports the HTMLForm
interface, so you can call methods like .set_value(), .set(), and
.click().
Another way to select a form is to assign to the .form attribute. The
form assigned should be one of the objects returned by the .forms()
method.
At least one of the name, predicate and nr arguments must be supplied.
If no matching form is found, mechanize.FormNotFoundError is raised.
If name is specified, then the form must have the indicated name.
If predicate is specified, then the form must match that function. The
predicate function is passed the HTMLForm as its single argument, and
should return a boolean value indicating whether the form matched.
nr, if supplied, is the sequence number of the form (where 0 is the
first). Note that form 0 is the first form matching all the other
arguments (if supplied); it is not necessarily the first form in the
document.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
if (name is None) and (predicate is None) and (nr is None):
raise ValueError(
"at least one argument must be supplied to specify form")
orig_nr = nr
for form in self.forms():
if name is not None and name != form.name:
continue
if predicate is not None and not predicate(form):
continue
if nr:
nr -= 1
continue
self.form = form
break # success
else:
# failure
description = []
if name is not None: description.append("name '%s'" % name)
if predicate is not None:
description.append("predicate %s" % predicate)
if orig_nr is not None: description.append("nr %d" % orig_nr)
description = ", ".join(description)
raise FormNotFoundError("no form matching "+description)
def click(self, *args, **kwds):
"""See ClientForm.HTMLForm.click for documentation."""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
request = self.form.click(*args, **kwds)
return self._add_referer_header(request)
def submit(self, *args, **kwds):
"""Submit current form.
Arguments are as for ClientForm.HTMLForm.click().
Return value is same as for Browser.open().
"""
return self.open(self.click(*args, **kwds))
def click_link(self, link=None, **kwds):
"""Find a link and return a Request object for it.
Arguments are as for .find_link(), except that a link may be supplied
as the first argument.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
if not link:
link = self.find_link(**kwds)
else:
if kwds:
raise ValueError(
"either pass a Link, or keyword arguments, not both")
request = self.request_class(link.absolute_url)
return self._add_referer_header(request)
def follow_link(self, link=None, **kwds):
"""Find a link and .open() it.
Arguments are as for .click_link().
Return value is same as for Browser.open().
"""
return self.open(self.click_link(link, **kwds))
def find_link(self, **kwds):
"""Find a link in current page.
Links are returned as mechanize.Link objects.
# Return third link that .search()-matches the regexp "python"
# (by ".search()-matches", I mean that the regular expression method
# .search() is used, rather than .match()).
find_link(text_regex=re.compile("python"), nr=2)
# Return first http link in the current page that points to somewhere
# on python.org whose link text (after tags have been removed) is
# exactly "monty python".
find_link(text="monty python",
url_regex=re.compile("http.*python.org"))
# Return first link with exactly three HTML attributes.
find_link(predicate=lambda link: len(link.attrs) == 3)
Links include anchors (<a>), image maps (<area>), and frames (<frame>,
<iframe>).
All arguments must be passed by keyword, not position. Zero or more
arguments may be supplied. In order to find a link, all arguments
supplied must match.
If a matching link is not found, mechanize.LinkNotFoundError is raised.
text: link text between link tags: eg. <a href="blah">this bit</a> (as
returned by pullparser.get_compressed_text(), ie. without tags but
with opening tags "textified" as per the pullparser docs) must compare
equal to this argument, if supplied
text_regex: link text between tag (as defined above) must match the
regular expression object or regular expression string passed as this
argument, if supplied
name, name_regex: as for text and text_regex, but matched against the
name HTML attribute of the link tag
url, url_regex: as for text and text_regex, but matched against the
URL of the link tag (note this matches against Link.url, which is a
relative or absolute URL according to how it was written in the HTML)
tag: element name of opening tag, eg. "a"
predicate: a function taking a Link object as its single argument,
returning a boolean result, indicating whether the link matches
nr: matches the nth link that matches all other criteria (default 0)
"""
try:
return self._filter_links(self._factory.links(), **kwds).next()
except StopIteration:
raise LinkNotFoundError()
def __getattr__(self, name):
# pass through ClientForm / DOMForm methods and attributes
form = self.__dict__.get("form")
if form is None:
raise AttributeError(
"%s instance has no attribute %s (perhaps you forgot to "
".select_form()?)" % (self.__class__, name))
return getattr(form, name)
def _filter_links(self, links,
text=None, text_regex=None,
name=None, name_regex=None,
url=None, url_regex=None,
tag=None,
predicate=None,
nr=0
):
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
found_links = []
orig_nr = nr
for link in links:
if url is not None and url != link.url:
continue
if url_regex is not None and not re.search(url_regex, link.url):
continue
if (text is not None and
(link.text is None or text != link.text)):
continue
if (text_regex is not None and
(link.text is None or not re.search(text_regex, link.text))):
continue
if name is not None and name != dict(link.attrs).get("name"):
continue
if name_regex is not None:
link_name = dict(link.attrs).get("name")
if link_name is None or not re.search(name_regex, link_name):
continue
if tag is not None and tag != link.tag:
continue
if predicate is not None and not predicate(link):
continue
if nr:
nr -= 1
continue
yield link
nr = orig_nr
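# Illustrative sketch (assumed usage; URL, control name and link text are
# placeholders): open a page, fill in and submit the first form, then follow
# a link on the resulting page.
def _example_browser_session():
    import re
    import mechanize
    br = mechanize.Browser()
    br.open("http://example.com/login")
    br.select_form(nr=0)             # first form in the page
    br.form["user"] = "someone"      # control name is an assumption
    response = br.submit()
    print response.geturl(), br.title()
    return br.follow_link(text_regex=re.compile("next", re.I))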

View File

@@ -0,0 +1,159 @@
"""Mozilla / Netscape cookie loading / saving.
Copyright 2002-2006 John J Lee <jjl@pobox.com>
Copyright 1997-1999 Gisle Aas (original libwww-perl code)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import re, time, logging
from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
MISSING_FILENAME_TEXT, LoadError
debug = logging.getLogger("ClientCookie").debug
class MozillaCookieJar(FileCookieJar):
"""
WARNING: you may want to backup your browser's cookies file if you use
this class to save cookies. I *think* it works, but there have been
bugs in the past!
This class differs from CookieJar only in the format it uses to save and
load cookies to and from a file. This class uses the Mozilla/Netscape
`cookies.txt' format. lynx uses this file format, too.
Don't expect cookies saved while the browser is running to be noticed by
the browser (in fact, Mozilla on unix will overwrite your saved cookies if
you change them on disk while it's running; on Windows, you probably can't
save at all while the browser is running).
Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
Netscape cookies on saving.
In particular, the cookie version and port number information is lost,
together with information about whether or not Path, Port and Discard were
specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
domain as set in the HTTP header started with a dot (yes, I'm aware some
domains in Netscape files start with a dot and some don't -- trust me, you
really don't want to know any more about this).
Note that though Mozilla and Netscape use the same format, they use
slightly different headers. The class saves cookies using the Netscape
header by default (Mozilla can cope with that).
"""
magic_re = "#( Netscape)? HTTP Cookie File"
header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.
"""
def _really_load(self, f, filename, ignore_discard, ignore_expires):
now = time.time()
magic = f.readline()
if not re.search(self.magic_re, magic):
f.close()
raise LoadError(
"%s does not look like a Netscape format cookies file" %
filename)
try:
while 1:
line = f.readline()
if line == "": break
# last field may be absent, so keep any trailing tab
if line.endswith("\n"): line = line[:-1]
# skip comments and blank lines XXX what is $ for?
if (line.strip().startswith("#") or
line.strip().startswith("$") or
line.strip() == ""):
continue
domain, domain_specified, path, secure, expires, name, value = \
line.split("\t")
secure = (secure == "TRUE")
domain_specified = (domain_specified == "TRUE")
if name == "":
name = value
value = None
initial_dot = domain.startswith(".")
assert domain_specified == initial_dot
discard = False
if expires == "":
expires = None
discard = True
# assume path_specified is false
c = Cookie(0, name, value,
None, False,
domain, domain_specified, initial_dot,
path, False,
secure,
expires,
discard,
None,
None,
{})
if not ignore_discard and c.discard:
continue
if not ignore_expires and c.is_expired(now):
continue
self.set_cookie(c)
except:
reraise_unmasked_exceptions((IOError,))
raise LoadError("invalid Netscape format file %s: %s" %
(filename, line))
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
if filename is None:
if self.filename is not None: filename = self.filename
else: raise ValueError(MISSING_FILENAME_TEXT)
f = open(filename, "w")
try:
debug("Saving Netscape cookies.txt file")
f.write(self.header)
now = time.time()
for cookie in self:
if not ignore_discard and cookie.discard:
debug(" Not saving %s: marked for discard", cookie.name)
continue
if not ignore_expires and cookie.is_expired(now):
debug(" Not saving %s: expired", cookie.name)
continue
if cookie.secure: secure = "TRUE"
else: secure = "FALSE"
if cookie.domain.startswith("."): initial_dot = "TRUE"
else: initial_dot = "FALSE"
if cookie.expires is not None:
expires = str(cookie.expires)
else:
expires = ""
if cookie.value is None:
# cookies.txt regards 'Set-Cookie: foo' as a cookie
# with no name, whereas cookielib regards it as a
# cookie with no value.
name = ""
value = cookie.name
else:
name = cookie.name
value = cookie.value
f.write(
"\t".join([cookie.domain, initial_dot, cookie.path,
secure, expires, name, value])+
"\n")
finally:
f.close()
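# Illustrative sketch (assumed usage; path and URL are placeholders): reuse an
# existing Netscape/Mozilla cookies.txt in a Browser session.  As the class
# docstring warns, avoid saving back over the browser's own file.
def _example_load_cookies_txt():
    import mechanize
    cj = mechanize.MozillaCookieJar()
    cj.load("cookies.txt", ignore_discard=True, ignore_expires=True)
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    return br.open("http://example.com/")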

View File

@@ -0,0 +1,387 @@
"""Microsoft Internet Explorer cookie loading on Windows.
Copyright 2002-2003 Johnny Lee <typo_pl@hotmail.com> (MSIE Perl code)
Copyright 2002-2006 John J Lee <jjl@pobox.com> (The Python port)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
# XXX names and comments are not great here
import os, re, time, struct, logging
if os.name == "nt":
import _winreg
from _clientcookie import FileCookieJar, CookieJar, Cookie, \
MISSING_FILENAME_TEXT, LoadError
debug = logging.getLogger("mechanize").debug
def regload(path, leaf):
key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0,
_winreg.KEY_ALL_ACCESS)
try:
value = _winreg.QueryValueEx(key, leaf)[0]
except WindowsError:
value = None
return value
WIN32_EPOCH = 0x019db1ded53e8000L # 1970 Jan 01 00:00:00 in Win32 FILETIME
def epoch_time_offset_from_win32_filetime(filetime):
"""Convert from win32 filetime to seconds-since-epoch value.
MSIE stores create and expire times as Win32 FILETIME, which is 64
bits of 100 nanosecond intervals since Jan 01 1601.
mechanize expects time in 32-bit value expressed in seconds since the
epoch (Jan 01 1970).
"""
if filetime < WIN32_EPOCH:
raise ValueError("filetime (%d) is before epoch (%d)" %
(filetime, WIN32_EPOCH))
return divmod((filetime - WIN32_EPOCH), 10000000L)[0]
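# Worked example: FILETIME counts 100-nanosecond intervals, so one second is
# 10,000,000 units and a value one second past WIN32_EPOCH maps to one second
# past the Unix epoch.
def _example_filetime_conversion():
    assert epoch_time_offset_from_win32_filetime(WIN32_EPOCH) == 0
    assert epoch_time_offset_from_win32_filetime(WIN32_EPOCH + 10000000L) == 1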
def binary_to_char(c): return "%02X" % ord(c)
def binary_to_str(d): return "".join(map(binary_to_char, list(d)))
class MSIEBase:
magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*")
padding = "\x0d\xf0\xad\x0b"
msie_domain_re = re.compile(r"^([^/]+)(/.*)$")
cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?"
"(.+\@[\x21-\xFF]+\.txt)")
# path under HKEY_CURRENT_USER from which to get location of index.dat
reg_path = r"software\microsoft\windows" \
r"\currentversion\explorer\shell folders"
reg_key = "Cookies"
def __init__(self):
self._delayload_domains = {}
def _delayload_domain(self, domain):
# if necessary, lazily load cookies for this domain
delayload_info = self._delayload_domains.get(domain)
if delayload_info is not None:
cookie_file, ignore_discard, ignore_expires = delayload_info
try:
self.load_cookie_data(cookie_file,
ignore_discard, ignore_expires)
except (LoadError, IOError):
debug("error reading cookie file, skipping: %s", cookie_file)
else:
del self._delayload_domains[domain]
def _load_cookies_from_file(self, filename):
debug("Loading MSIE cookies file: %s", filename)
cookies = []
cookies_fh = open(filename)
try:
while 1:
key = cookies_fh.readline()
if key == "": break
rl = cookies_fh.readline
def getlong(rl=rl): return long(rl().rstrip())
def getstr(rl=rl): return rl().rstrip()
key = key.rstrip()
value = getstr()
domain_path = getstr()
flags = getlong() # 0x2000 bit is for secure I think
lo_expire = getlong()
hi_expire = getlong()
lo_create = getlong()
hi_create = getlong()
sep = getstr()
if "" in (key, value, domain_path, flags, hi_expire, lo_expire,
hi_create, lo_create, sep) or (sep != "*"):
break
m = self.msie_domain_re.search(domain_path)
if m:
domain = m.group(1)
path = m.group(2)
cookies.append({"KEY": key, "VALUE": value, "DOMAIN": domain,
"PATH": path, "FLAGS": flags, "HIXP": hi_expire,
"LOXP": lo_expire, "HICREATE": hi_create,
"LOCREATE": lo_create})
finally:
cookies_fh.close()
return cookies
def load_cookie_data(self, filename,
ignore_discard=False, ignore_expires=False):
"""Load cookies from file containing actual cookie data.
Old cookies are kept unless overwritten by newly loaded ones.
You should not call this method if the delayload attribute is set.
I think each of these files contains all cookies for one user, domain,
and path.
filename: file containing cookies -- usually found in a file like
C:\WINNT\Profiles\joe\Cookies\joe@blah[1].txt
"""
now = int(time.time())
cookie_data = self._load_cookies_from_file(filename)
for cookie in cookie_data:
flags = cookie["FLAGS"]
secure = ((flags & 0x2000) != 0)
filetime = (cookie["HIXP"] << 32) + cookie["LOXP"]
expires = epoch_time_offset_from_win32_filetime(filetime)
if expires < now:
discard = True
else:
discard = False
domain = cookie["DOMAIN"]
initial_dot = domain.startswith(".")
if initial_dot:
domain_specified = True
else:
# MSIE 5 does not record whether the domain cookie-attribute
# was specified.
# Assuming it wasn't is conservative, because with strict
# domain matching this will match less frequently; with regular
# Netscape tail-matching, this will match at exactly the same
# times that domain_specified = True would. It also means we
# don't have to prepend a dot to achieve consistency with our
# own & Mozilla's domain-munging scheme.
domain_specified = False
# assume path_specified is false
# XXX is there other stuff in here? -- eg. comment, commentURL?
c = Cookie(0,
cookie["KEY"], cookie["VALUE"],
None, False,
domain, domain_specified, initial_dot,
cookie["PATH"], False,
secure,
expires,
discard,
None,
None,
{"flags": flags})
if not ignore_discard and c.discard:
continue
if not ignore_expires and c.is_expired(now):
continue
CookieJar.set_cookie(self, c)
def load_from_registry(self, ignore_discard=False, ignore_expires=False,
username=None):
"""
username: only required on win9x
"""
cookies_dir = regload(self.reg_path, self.reg_key)
filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT"))
self.load(filename, ignore_discard, ignore_expires, username)
def _really_load(self, index, filename, ignore_discard, ignore_expires,
username):
now = int(time.time())
if username is None:
username = os.environ['USERNAME'].lower()
cookie_dir = os.path.dirname(filename)
data = index.read(256)
if len(data) != 256:
raise LoadError("%s file is too short" % filename)
# Cookies' index.dat file starts with 32 bytes of signature
# followed by an offset to the first record, stored as a little-
# endian DWORD.
sig, size, data = data[:32], data[32:36], data[36:]
size = struct.unpack("<L", size)[0]
# check that sig is valid
if not self.magic_re.match(sig) or size != 0x4000:
raise LoadError("%s ['%s' %s] does not seem to contain cookies" %
(str(filename), sig, size))
# skip to start of first record
index.seek(size, 0)
sector = 128 # size of sector in bytes
while 1:
data = ""
# Cookies are usually in two contiguous sectors, so read in two
# sectors and adjust if not a Cookie.
to_read = 2 * sector
d = index.read(to_read)
if len(d) != to_read:
break
data = data + d
# Each record starts with a 4-byte signature and a count
# (little-endian DWORD) of sectors for the record.
sig, size, data = data[:4], data[4:8], data[8:]
size = struct.unpack("<L", size)[0]
to_read = (size - 2) * sector
## from urllib import quote
## print "data", quote(data)
## print "sig", quote(sig)
## print "size in sectors", size
## print "size in bytes", size*sector
## print "size in units of 16 bytes", (size*sector) / 16
## print "size to read in bytes", to_read
## print
if sig != "URL ":
assert sig in ("HASH", "LEAK",
        self.padding, "\x00\x00\x00\x00"), \
    "unrecognized MSIE index.dat record: %s" % binary_to_str(sig)
if sig == "\x00\x00\x00\x00":
# assume we've got all the cookies, and stop
break
if sig == self.padding:
continue
# skip the rest of this record
assert to_read >= 0
if size != 2:
assert to_read != 0
index.seek(to_read, 1)
continue
# read in rest of record if necessary
if size > 2:
more_data = index.read(to_read)
if len(more_data) != to_read: break
data = data + more_data
cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username +
"(%s\@[\x21-\xFF]+\.txt)" % username)
m = re.search(cookie_re, data, re.I)
if m:
cookie_file = os.path.join(cookie_dir, m.group(2))
if not self.delayload:
try:
self.load_cookie_data(cookie_file,
ignore_discard, ignore_expires)
except (LoadError, IOError):
debug("error reading cookie file, skipping: %s",
cookie_file)
else:
domain = m.group(1)
i = domain.find("/")
if i != -1:
domain = domain[:i]
self._delayload_domains[domain] = (
cookie_file, ignore_discard, ignore_expires)
class MSIECookieJar(MSIEBase, FileCookieJar):
"""FileCookieJar that reads from the Windows MSIE cookies database.
MSIECookieJar can read the cookie files of Microsoft Internet Explorer
(MSIE) for Windows version 5 on Windows NT and version 6 on Windows XP and
Windows 98. Other configurations may also work, but are untested. Saving
cookies in MSIE format is NOT supported. If you save cookies, they'll be
in the usual Set-Cookie3 format, which you can read back in using an
instance of the plain old CookieJar class. Don't save using the same
filename that you loaded cookies from, because you may succeed in
clobbering your MSIE cookies index file!
You should be able to have LWP share Internet Explorer's cookies like
this (note you need to supply a username to load_from_registry if you're on
Windows 9x or Windows ME):
cj = MSIECookieJar(delayload=1)
# find cookies index file in registry and load cookies from it
cj.load_from_registry()
opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
response = opener.open("http://example.com/")
Iterating over a delayloaded MSIECookieJar instance will not cause any
cookies to be read from disk. To force reading of all cookies from disk,
call read_all_cookies. Note that the following methods iterate over self:
clear_temporary_cookies, clear_expired_cookies, __len__, __repr__, __str__
and as_string.
Additional methods:
load_from_registry(ignore_discard=False, ignore_expires=False,
username=None)
load_cookie_data(filename, ignore_discard=False, ignore_expires=False)
read_all_cookies()
"""
def __init__(self, filename=None, delayload=False, policy=None):
MSIEBase.__init__(self)
FileCookieJar.__init__(self, filename, delayload, policy)
def set_cookie(self, cookie):
if self.delayload:
self._delayload_domain(cookie.domain)
CookieJar.set_cookie(self, cookie)
def _cookies_for_request(self, request):
"""Return a list of cookies to be returned to server."""
domains = self._cookies.copy()
domains.update(self._delayload_domains)
domains = domains.keys()
cookies = []
for domain in domains:
cookies.extend(self._cookies_for_domain(domain, request))
return cookies
def _cookies_for_domain(self, domain, request):
if not self._policy.domain_return_ok(domain, request):
return []
debug("Checking %s for cookies to return", domain)
if self.delayload:
self._delayload_domain(domain)
return CookieJar._cookies_for_domain(self, domain, request)
def read_all_cookies(self):
"""Eagerly read in all cookies."""
if self.delayload:
for domain in self._delayload_domains.keys():
self._delayload_domain(domain)
def load(self, filename, ignore_discard=False, ignore_expires=False,
username=None):
"""Load cookies from an MSIE 'index.dat' cookies index file.
filename: full path to cookie index file
username: only required on win9x
"""
if filename is None:
if self.filename is not None: filename = self.filename
else: raise ValueError(MISSING_FILENAME_TEXT)
index = open(filename, "rb")
try:
self._really_load(index, filename, ignore_discard, ignore_expires,
username)
finally:
index.close()

View File

@@ -0,0 +1,421 @@
"""Integration with Python standard library module urllib2: OpenerDirector
class.
Copyright 2004-2006 John J Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import os, urllib2, bisect, urllib, httplib, types, tempfile
try:
import threading as _threading
except ImportError:
import dummy_threading as _threading
try:
set
except NameError:
import sets
set = sets.Set
import _http
import _upgrade
import _rfc3986
import _response
from _util import isstringlike
from _request import Request
class ContentTooShortError(urllib2.URLError):
def __init__(self, reason, result):
urllib2.URLError.__init__(self, reason)
self.result = result
class OpenerDirector(urllib2.OpenerDirector):
def __init__(self):
urllib2.OpenerDirector.__init__(self)
# really none of these are (sanely) public -- the lack of initial
# underscore on some is just due to following urllib2
self.process_response = {}
self.process_request = {}
self._any_request = {}
self._any_response = {}
self._handler_index_valid = True
self._tempfiles = []
def add_handler(self, handler):
if handler in self.handlers:
return
# XXX why does self.handlers need to be sorted?
bisect.insort(self.handlers, handler)
handler.add_parent(self)
self._handler_index_valid = False
def _maybe_reindex_handlers(self):
if self._handler_index_valid:
return
handle_error = {}
handle_open = {}
process_request = {}
process_response = {}
any_request = set()
any_response = set()
unwanted = []
for handler in self.handlers:
added = False
for meth in dir(handler):
if meth in ["redirect_request", "do_open", "proxy_open"]:
# oops, coincidental match
continue
if meth == "any_request":
any_request.add(handler)
added = True
continue
elif meth == "any_response":
any_response.add(handler)
added = True
continue
ii = meth.find("_")
scheme = meth[:ii]
condition = meth[ii+1:]
if condition.startswith("error"):
jj = meth[ii+1:].find("_") + ii + 1
kind = meth[jj+1:]
try:
kind = int(kind)
except ValueError:
pass
lookup = handle_error.setdefault(scheme, {})
elif condition == "open":
kind = scheme
lookup = handle_open
elif condition == "request":
kind = scheme
lookup = process_request
elif condition == "response":
kind = scheme
lookup = process_response
else:
continue
lookup.setdefault(kind, set()).add(handler)
added = True
if not added:
unwanted.append(handler)
for handler in unwanted:
self.handlers.remove(handler)
# sort indexed methods
# XXX could be cleaned up
for lookup in [process_request, process_response]:
for scheme, handlers in lookup.iteritems():
lookup[scheme] = handlers
for scheme, lookup in handle_error.iteritems():
for code, handlers in lookup.iteritems():
handlers = list(handlers)
handlers.sort()
lookup[code] = handlers
for scheme, handlers in handle_open.iteritems():
handlers = list(handlers)
handlers.sort()
handle_open[scheme] = handlers
# cache the indexes
self.handle_error = handle_error
self.handle_open = handle_open
self.process_request = process_request
self.process_response = process_response
self._any_request = any_request
self._any_response = any_response
def _request(self, url_or_req, data, visit):
if isstringlike(url_or_req):
req = Request(url_or_req, data, visit=visit)
else:
# already a urllib2.Request or mechanize.Request instance
req = url_or_req
if data is not None:
req.add_data(data)
# XXX yuck, give request a .visit attribute if it doesn't have one
try:
req.visit
except AttributeError:
req.visit = None
if visit is not None:
req.visit = visit
return req
def open(self, fullurl, data=None):
req = self._request(fullurl, data, None)
req_scheme = req.get_type()
self._maybe_reindex_handlers()
# pre-process request
# XXX should we allow a Processor to change the URL scheme
# of the request?
request_processors = set(self.process_request.get(req_scheme, []))
request_processors.update(self._any_request)
request_processors = list(request_processors)
request_processors.sort()
for processor in request_processors:
for meth_name in ["any_request", req_scheme+"_request"]:
meth = getattr(processor, meth_name, None)
if meth:
req = meth(req)
# In Python >= 2.4, .open() supports processors already, so we must
# call ._open() instead.
urlopen = getattr(urllib2.OpenerDirector, "_open",
urllib2.OpenerDirector.open)
response = urlopen(self, req, data)
# post-process response
response_processors = set(self.process_response.get(req_scheme, []))
response_processors.update(self._any_response)
response_processors = list(response_processors)
response_processors.sort()
for processor in response_processors:
for meth_name in ["any_response", req_scheme+"_response"]:
meth = getattr(processor, meth_name, None)
if meth:
response = meth(req, response)
return response
def error(self, proto, *args):
if proto in ['http', 'https']:
# XXX http[s] protocols are special-cased
dict = self.handle_error['http'] # https is not different than http
proto = args[2] # YUCK!
meth_name = 'http_error_%s' % proto
http_err = 1
orig_args = args
else:
dict = self.handle_error
meth_name = proto + '_error'
http_err = 0
args = (dict, proto, meth_name) + args
result = apply(self._call_chain, args)
if result:
return result
if http_err:
args = (dict, 'default', 'http_error_default') + orig_args
return apply(self._call_chain, args)
BLOCK_SIZE = 1024*8
def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
"""Returns (filename, headers).
For remote objects, the default filename will refer to a temporary
file. Temporary files are removed when the OpenerDirector.close()
method is called.
For file: URLs, at present the returned filename is None. This may
change in future.
If the actual number of bytes read is less than indicated by the
Content-Length header, raises ContentTooShortError (a URLError
subclass). The exception's .result attribute contains the (filename,
headers) that would have been returned.
"""
req = self._request(fullurl, data, False)
scheme = req.get_type()
fp = self.open(req)
headers = fp.info()
if filename is None and scheme == 'file':
# XXX req.get_selector() seems broken here, return None,
# pending sanity :-/
return None, headers
#return urllib.url2pathname(req.get_selector()), headers
if filename:
tfp = open(filename, 'wb')
else:
path = _rfc3986.urlsplit(fullurl)[2]
suffix = os.path.splitext(path)[1]
fd, filename = tempfile.mkstemp(suffix)
self._tempfiles.append(filename)
tfp = os.fdopen(fd, 'wb')
result = filename, headers
bs = self.BLOCK_SIZE
size = -1
read = 0
blocknum = 0
if reporthook:
if "content-length" in headers:
size = int(headers["Content-Length"])
reporthook(blocknum, bs, size)
while 1:
block = fp.read(bs)
if block == "":
break
read += len(block)
tfp.write(block)
blocknum += 1
if reporthook:
reporthook(blocknum, bs, size)
fp.close()
tfp.close()
del fp
del tfp
# raise exception if actual size does not match content-length header
if size >= 0 and read < size:
raise ContentTooShortError(
"retrieval incomplete: "
"got only %i out of %i bytes" % (read, size),
result
)
return result
def close(self):
urllib2.OpenerDirector.close(self)
# make it very obvious this object is no longer supposed to be used
self.open = self.error = self.retrieve = self.add_handler = None
if self._tempfiles:
for filename in self._tempfiles:
try:
os.unlink(filename)
except OSError:
pass
del self._tempfiles[:]
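# Illustrative sketch (assumed usage; URL and file name are placeholders):
# retrieve() -- exposed as mechanize.urlretrieve() -- downloads to a file,
# reports progress through an optional callback, and raises
# ContentTooShortError on a short read.
def _example_retrieve():
    import mechanize
    def report(block_count, block_size, total_size):
        print "read %d blocks (total size %d)" % (block_count, total_size)
    try:
        filename, headers = mechanize.urlretrieve(
            "http://example.com/big-file.zip", "big-file.zip",
            reporthook=report)
    except mechanize.ContentTooShortError, exc:
        filename, headers = exc.result  # the partial (filename, headers)
    return filename, headers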
def wrapped_open(urlopen, process_response_object, fullurl, data=None):
success = True
try:
response = urlopen(fullurl, data)
except urllib2.HTTPError, error:
success = False
if error.fp is None: # not a response
raise
response = error
if response is not None:
response = process_response_object(response)
if not success:
raise response
return response
class ResponseProcessingOpener(OpenerDirector):
def open(self, fullurl, data=None):
def bound_open(fullurl, data=None):
return OpenerDirector.open(self, fullurl, data)
return wrapped_open(
bound_open, self.process_response_object, fullurl, data)
def process_response_object(self, response):
return response
class SeekableResponseOpener(ResponseProcessingOpener):
def process_response_object(self, response):
return _response.seek_wrapped_response(response)
class OpenerFactory:
"""This class's interface is quite likely to change."""
default_classes = [
# handlers
urllib2.ProxyHandler,
urllib2.UnknownHandler,
_http.HTTPHandler, # derived from new AbstractHTTPHandler
_http.HTTPDefaultErrorHandler,
_http.HTTPRedirectHandler, # bugfixed
urllib2.FTPHandler,
urllib2.FileHandler,
# processors
_upgrade.HTTPRequestUpgradeProcessor,
_http.HTTPCookieProcessor,
_http.HTTPErrorProcessor,
]
if hasattr(httplib, 'HTTPS'):
default_classes.append(_http.HTTPSHandler)
handlers = []
replacement_handlers = []
def __init__(self, klass=OpenerDirector):
self.klass = klass
def build_opener(self, *handlers):
"""Create an opener object from a list of handlers and processors.
The opener will use several default handlers and processors, including
support for HTTP and FTP.
If any of the handlers passed as arguments are subclasses of the
default handlers, the default handlers will not be used.
"""
opener = self.klass()
default_classes = list(self.default_classes)
skip = []
for klass in default_classes:
for check in handlers:
if type(check) == types.ClassType:
if issubclass(check, klass):
skip.append(klass)
elif type(check) == types.InstanceType:
if isinstance(check, klass):
skip.append(klass)
for klass in skip:
default_classes.remove(klass)
for klass in default_classes:
opener.add_handler(klass())
for h in handlers:
if type(h) == types.ClassType:
h = h()
opener.add_handler(h)
return opener
build_opener = OpenerFactory().build_opener
_opener = None
urlopen_lock = _threading.Lock()
def urlopen(url, data=None):
global _opener
if _opener is None:
urlopen_lock.acquire()
try:
if _opener is None:
_opener = build_opener()
finally:
urlopen_lock.release()
return _opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
global _opener
if _opener is None:
urlopen_lock.acquire()
try:
if _opener is None:
_opener = build_opener()
finally:
urlopen_lock.release()
return _opener.retrieve(url, filename, reporthook, data)
def install_opener(opener):
global _opener
_opener = opener
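
# A minimal usage sketch (not part of the original module) of the
# module-level convenience API defined above.  The URL and filename are
# placeholders.
##
## response = urlopen("http://example.com/")
## print response.geturl(), response.info()["Content-Type"]
## print response.read(200)
## filename, headers = urlretrieve("http://example.com/", "page.html")
##
## # install_opener() replaces the opener used by urlopen()/urlretrieve()
## opener = build_opener()
## install_opener(opener)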

View File

@ -0,0 +1,334 @@
"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
Examples
This program extracts all links from a document. It will print one
line for each link, containing the URL and the textual description
between the <A>...</A> tags:
import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
for token in p.tags("a"):
if token.type == "endtag": continue
url = dict(token.attrs).get("href", "-")
text = p.get_compressed_text(endat=("endtag", "a"))
print "%s\t%s" % (url, text)
This program extracts the <TITLE> from the document:
import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
if p.get_tag("title"):
title = p.get_compressed_text()
print "Title: %s" % title
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
Copyright 1998-2001 Gisle Aas (original libwww-perl code)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses.
"""
import re, htmlentitydefs
import sgmllib, HTMLParser
from _html import unescape, unescape_charref
class NoMoreTokensError(Exception): pass
class Token:
"""Represents an HTML tag, declaration, processing instruction etc.
    Behaves as a tuple-like object (ie. iterable) and also has attributes
.type, .data and .attrs.
>>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
>>> t == ("starttag", "a", [("href", "http://www.python.org/")])
True
>>> (t.type, t.data) == ("starttag", "a")
True
>>> t.attrs == [("href", "http://www.python.org/")]
True
Public attributes
type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
"data", "comment", "decl", "pi", after the corresponding methods of
HTMLParser.HTMLParser
data: For a tag, the tag name; otherwise, the relevant data carried by the
tag, as a string
attrs: list of (name, value) pairs representing HTML attributes
(or None if token does not represent an opening tag)
"""
def __init__(self, type, data, attrs=None):
self.type = type
self.data = data
self.attrs = attrs
def __iter__(self):
return iter((self.type, self.data, self.attrs))
def __eq__(self, other):
type, data, attrs = other
if (self.type == type and
self.data == data and
self.attrs == attrs):
return True
else:
return False
def __ne__(self, other): return not self.__eq__(other)
def __repr__(self):
args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
return self.__class__.__name__+"(%s)" % args
def iter_until_exception(fn, exception, *args, **kwds):
while 1:
try:
yield fn(*args, **kwds)
except exception:
raise StopIteration
class _AbstractParser:
chunk = 1024
compress_re = re.compile(r"\s+")
def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
encoding="ascii", entitydefs=None):
"""
fh: file-like object (only a .read() method is required) from which to
read HTML to be parsed
textify: mapping used by .get_text() and .get_compressed_text() methods
to represent opening tags as text
encoding: encoding used to encode numeric character references by
.get_text() and .get_compressed_text() ("ascii" by default)
entitydefs: mapping like {"amp": "&", ...} containing HTML entity
definitions (a sensible default is used). This is used to unescape
entities in .get_text() (and .get_compressed_text()) and attribute
values. If the encoding can not represent the character, the entity
reference is left unescaped. Note that entity references (both
numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
unescaped in attribute values and the return value of .get_text(), but
not in data outside of tags. Instead, entity references outside of
tags are represented as tokens. This is a bit odd, it's true :-/
If the element name of an opening tag matches a key in the textify
mapping then that tag is converted to text. The corresponding value is
used to specify which tag attribute to obtain the text from. textify
maps from element names to either:
- an HTML attribute name, in which case the HTML attribute value is
used as its text value along with the element name in square
        brackets (eg. "alt text goes here[IMG]", or, if the alt attribute
were missing, just "[IMG]")
- a callable object (eg. a function) which takes a Token and returns
the string to be used as its text value
If textify has no key for an element name, nothing is substituted for
the opening tag.
Public attributes:
encoding and textify: see above
"""
self._fh = fh
self._tokenstack = [] # FIFO
self.textify = textify
self.encoding = encoding
if entitydefs is None:
entitydefs = htmlentitydefs.name2codepoint
self._entitydefs = entitydefs
def __iter__(self): return self
def tags(self, *names):
return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
def tokens(self, *tokentypes):
return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)
def next(self):
try:
return self.get_token()
except NoMoreTokensError:
raise StopIteration()
def get_token(self, *tokentypes):
"""Pop the next Token object from the stack of parsed tokens.
If arguments are given, they are taken to be token types in which the
caller is interested: tokens representing other elements will be
skipped. Element names must be given in lower case.
Raises NoMoreTokensError.
"""
while 1:
while self._tokenstack:
token = self._tokenstack.pop(0)
if tokentypes:
if token.type in tokentypes:
return token
else:
return token
data = self._fh.read(self.chunk)
if not data:
raise NoMoreTokensError()
self.feed(data)
def unget_token(self, token):
"""Push a Token back onto the stack."""
self._tokenstack.insert(0, token)
def get_tag(self, *names):
"""Return the next Token that represents an opening or closing tag.
If arguments are given, they are taken to be element names in which the
caller is interested: tags representing other elements will be skipped.
Element names must be given in lower case.
Raises NoMoreTokensError.
"""
while 1:
tok = self.get_token()
if tok.type not in ["starttag", "endtag", "startendtag"]:
continue
if names:
if tok.data in names:
return tok
else:
return tok
def get_text(self, endat=None):
"""Get some text.
endat: stop reading text at this tag (the tag is included in the
returned text); endtag is a tuple (type, name) where type is
"starttag", "endtag" or "startendtag", and name is the element name of
the tag (element names must be given in lower case)
If endat is not given, .get_text() will stop at the next opening or
closing tag, or when there are no more tokens (no exception is raised).
Note that .get_text() includes the text representation (if any) of the
opening tag, but pushes the opening tag back onto the stack. As a
result, if you want to call .get_text() again, you need to call
.get_tag() first (unless you want an empty string returned when you
next call .get_text()).
Entity references are translated using the value of the entitydefs
constructor argument (a mapping from names to characters like that
provided by the standard module htmlentitydefs). Named entity
references that are not in this mapping are left unchanged.
The textify attribute is used to translate opening tags into text: see
the class docstring.
"""
text = []
tok = None
while 1:
try:
tok = self.get_token()
except NoMoreTokensError:
# unget last token (not the one we just failed to get)
if tok: self.unget_token(tok)
break
if tok.type == "data":
text.append(tok.data)
elif tok.type == "entityref":
t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
text.append(t)
elif tok.type == "charref":
t = unescape_charref(tok.data, self.encoding)
text.append(t)
elif tok.type in ["starttag", "endtag", "startendtag"]:
tag_name = tok.data
if tok.type in ["starttag", "startendtag"]:
alt = self.textify.get(tag_name)
if alt is not None:
if callable(alt):
text.append(alt(tok))
elif tok.attrs is not None:
for k, v in tok.attrs:
if k == alt:
text.append(v)
text.append("[%s]" % tag_name.upper())
if endat is None or endat == (tok.type, tag_name):
self.unget_token(tok)
break
return "".join(text)
def get_compressed_text(self, *args, **kwds):
"""
As .get_text(), but collapses each group of contiguous whitespace to a
single space character, and removes all initial and trailing
whitespace.
"""
text = self.get_text(*args, **kwds)
text = text.strip()
return self.compress_re.sub(" ", text)
def handle_startendtag(self, tag, attrs):
self._tokenstack.append(Token("startendtag", tag, attrs))
def handle_starttag(self, tag, attrs):
self._tokenstack.append(Token("starttag", tag, attrs))
def handle_endtag(self, tag):
self._tokenstack.append(Token("endtag", tag))
def handle_charref(self, name):
self._tokenstack.append(Token("charref", name))
def handle_entityref(self, name):
self._tokenstack.append(Token("entityref", name))
def handle_data(self, data):
self._tokenstack.append(Token("data", data))
def handle_comment(self, data):
self._tokenstack.append(Token("comment", data))
def handle_decl(self, decl):
self._tokenstack.append(Token("decl", decl))
def unknown_decl(self, data):
# XXX should this call self.error instead?
#self.error("unknown declaration: " + `data`)
self._tokenstack.append(Token("decl", data))
def handle_pi(self, data):
self._tokenstack.append(Token("pi", data))
def unescape_attr(self, name):
return unescape(name, self._entitydefs, self.encoding)
def unescape_attrs(self, attrs):
escaped_attrs = []
for key, val in attrs:
escaped_attrs.append((key, self.unescape_attr(val)))
return escaped_attrs
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
def __init__(self, *args, **kwds):
HTMLParser.HTMLParser.__init__(self)
_AbstractParser.__init__(self, *args, **kwds)
def unescape(self, name):
# Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs.
return self.unescape_attr(name)
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
def __init__(self, *args, **kwds):
sgmllib.SGMLParser.__init__(self)
_AbstractParser.__init__(self, *args, **kwds)
def unknown_starttag(self, tag, attrs):
attrs = self.unescape_attrs(attrs)
self._tokenstack.append(Token("starttag", tag, attrs))
def unknown_endtag(self, tag):
self._tokenstack.append(Token("endtag", tag))
def _test():
import doctest, _pullparser
return doctest.testmod(_pullparser)
if __name__ == "__main__":
_test()

View File

@ -0,0 +1,86 @@
"""Integration with Python standard library module urllib2: Request class.
Copyright 2004-2006 John J Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import urllib2, urllib, logging
from _clientcookie import request_host
import _rfc3986
warn = logging.getLogger("mechanize").warning
# don't complain about missing logging handler
logging.getLogger("mechanize").setLevel(logging.ERROR)
class Request(urllib2.Request):
def __init__(self, url, data=None, headers={},
origin_req_host=None, unverifiable=False, visit=None):
# In mechanize 0.2, the interpretation of a unicode url argument will
# change: A unicode url argument will be interpreted as an IRI, and a
# bytestring as a URI. For now, we accept unicode or bytestring. We
# don't insist that the value is always a URI (specifically, must only
# contain characters which are legal), because that might break working
# code (who knows what bytes some servers want to see, especially with
# browser plugins for internationalised URIs).
if not _rfc3986.is_clean_uri(url):
warn("url argument is not a URI "
"(contains illegal characters) %r" % url)
urllib2.Request.__init__(self, url, data, headers)
self.selector = None
self.unredirected_hdrs = {}
self.visit = visit
# All the terminology below comes from RFC 2965.
self.unverifiable = unverifiable
# Set request-host of origin transaction.
# The origin request-host is needed in order to decide whether
# unverifiable sub-requests (automatic redirects, images embedded
# in HTML, etc.) are to third-party hosts. If they are, the
# resulting transactions might need to be conducted with cookies
# turned off.
if origin_req_host is None:
origin_req_host = request_host(self)
self.origin_req_host = origin_req_host
def get_selector(self):
return urllib.splittag(self.__r_host)[0]
def get_origin_req_host(self):
return self.origin_req_host
def is_unverifiable(self):
return self.unverifiable
def add_unredirected_header(self, key, val):
"""Add a header that will not be added to a redirected request."""
self.unredirected_hdrs[key.capitalize()] = val
def has_header(self, header_name):
"""True iff request has named header (regular or unredirected)."""
return (header_name in self.headers or
header_name in self.unredirected_hdrs)
def get_header(self, header_name, default=None):
return self.headers.get(
header_name,
self.unredirected_hdrs.get(header_name, default))
def header_items(self):
hdrs = self.unredirected_hdrs.copy()
hdrs.update(self.headers)
return hdrs.items()
def __str__(self):
return "<Request for %s>" % self.get_full_url()
def get_method(self):
if self.has_data():
return "POST"
else:
return "GET"

View File

@ -0,0 +1,515 @@
"""Response classes.
The seek_wrapper code is not used if you're using UserAgent with
.set_seekable_responses(False), or if you're using the urllib2-level interface
without SeekableProcessor or HTTPEquivProcessor. Class closeable_response is
instantiated by some handlers (AbstractHTTPHandler), but the closeable_response
interface is only depended upon by Browser-level code. Function
upgrade_response is only used if you're using Browser or
ResponseUpgradeProcessor.
Copyright 2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import copy, mimetools
from cStringIO import StringIO
import urllib2
# XXX Andrew Dalke kindly sent me a similar class in response to my request on
# comp.lang.python, which I then proceeded to lose. I wrote this class
# instead, but I think he's released his code publicly since, could pinch the
# tests from it, at least...
# For testing seek_wrapper invariant (note that
# test_urllib2.HandlerTest.test_seekable is expected to fail when this
# invariant checking is turned on). The invariant checking is done by module
# ipdc, which is available here:
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
## from ipdbc import ContractBase
## class seek_wrapper(ContractBase):
class seek_wrapper:
"""Adds a seek method to a file object.
This is only designed for seeking on readonly file-like objects.
Wrapped file-like object must have a read method. The readline method is
only supported if that method is present on the wrapped object. The
readlines method is always supported. xreadlines and iteration are
supported only for Python 2.2 and above.
Public attributes:
wrapped: the wrapped file object
is_closed: true iff .close() has been called
WARNING: All other attributes of the wrapped object (ie. those that are not
one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
are passed through unaltered, which may or may not make sense for your
particular file object.
"""
# General strategy is to check that cache is full enough, then delegate to
# the cache (self.__cache, which is a cStringIO.StringIO instance). A seek
# position (self.__pos) is maintained independently of the cache, in order
# that a single cache may be shared between multiple seek_wrapper objects.
# Copying using module copy shares the cache in this way.
def __init__(self, wrapped):
self.wrapped = wrapped
self.__read_complete_state = [False]
self.__is_closed_state = [False]
self.__have_readline = hasattr(self.wrapped, "readline")
self.__cache = StringIO()
self.__pos = 0 # seek position
def invariant(self):
# The end of the cache is always at the same place as the end of the
# wrapped file.
return self.wrapped.tell() == len(self.__cache.getvalue())
def close(self):
self.wrapped.close()
self.is_closed = True
def __getattr__(self, name):
if name == "is_closed":
return self.__is_closed_state[0]
elif name == "read_complete":
return self.__read_complete_state[0]
wrapped = self.__dict__.get("wrapped")
if wrapped:
return getattr(wrapped, name)
return getattr(self.__class__, name)
def __setattr__(self, name, value):
if name == "is_closed":
self.__is_closed_state[0] = bool(value)
elif name == "read_complete":
if not self.is_closed:
self.__read_complete_state[0] = bool(value)
else:
self.__dict__[name] = value
def seek(self, offset, whence=0):
assert whence in [0,1,2]
# how much data, if any, do we need to read?
if whence == 2: # 2: relative to end of *wrapped* file
if offset < 0: raise ValueError("negative seek offset")
# since we don't know yet where the end of that file is, we must
# read everything
to_read = None
else:
if whence == 0: # 0: absolute
if offset < 0: raise ValueError("negative seek offset")
dest = offset
else: # 1: relative to current position
pos = self.__pos
                if pos + offset < 0:
                    raise ValueError("seek to before start of file")
dest = pos + offset
end = len(self.__cache.getvalue())
to_read = dest - end
if to_read < 0:
to_read = 0
if to_read != 0:
self.__cache.seek(0, 2)
if to_read is None:
assert whence == 2
self.__cache.write(self.wrapped.read())
self.read_complete = True
self.__pos = self.__cache.tell() - offset
else:
data = self.wrapped.read(to_read)
if not data:
self.read_complete = True
else:
self.__cache.write(data)
# Don't raise an exception even if we've seek()ed past the end
# of .wrapped, since fseek() doesn't complain in that case.
# Also like fseek(), pretend we have seek()ed past the end,
# i.e. not:
#self.__pos = self.__cache.tell()
# but rather:
self.__pos = dest
else:
self.__pos = dest
def tell(self):
return self.__pos
def __copy__(self):
cpy = self.__class__(self.wrapped)
cpy.__cache = self.__cache
cpy.__read_complete_state = self.__read_complete_state
cpy.__is_closed_state = self.__is_closed_state
return cpy
def get_data(self):
pos = self.__pos
try:
self.seek(0)
return self.read(-1)
finally:
self.__pos = pos
def read(self, size=-1):
pos = self.__pos
end = len(self.__cache.getvalue())
available = end - pos
# enough data already cached?
if size <= available and size != -1:
self.__cache.seek(pos)
self.__pos = pos+size
return self.__cache.read(size)
# no, so read sufficient data from wrapped file and cache it
self.__cache.seek(0, 2)
if size == -1:
self.__cache.write(self.wrapped.read())
self.read_complete = True
else:
to_read = size - available
assert to_read > 0
data = self.wrapped.read(to_read)
if not data:
self.read_complete = True
else:
self.__cache.write(data)
self.__cache.seek(pos)
data = self.__cache.read(size)
self.__pos = self.__cache.tell()
assert self.__pos == pos + len(data)
return data
def readline(self, size=-1):
if not self.__have_readline:
raise NotImplementedError("no readline method on wrapped object")
# line we're about to read might not be complete in the cache, so
# read another line first
pos = self.__pos
self.__cache.seek(0, 2)
data = self.wrapped.readline()
if not data:
self.read_complete = True
else:
self.__cache.write(data)
self.__cache.seek(pos)
data = self.__cache.readline()
if size != -1:
r = data[:size]
self.__pos = pos+size
else:
r = data
self.__pos = pos+len(data)
return r
def readlines(self, sizehint=-1):
pos = self.__pos
self.__cache.seek(0, 2)
self.__cache.write(self.wrapped.read())
self.read_complete = True
self.__cache.seek(pos)
data = self.__cache.readlines(sizehint)
self.__pos = self.__cache.tell()
return data
def __iter__(self): return self
def next(self):
line = self.readline()
if line == "": raise StopIteration
return line
xreadlines = __iter__
def __repr__(self):
return ("<%s at %s whose wrapped object = %r>" %
(self.__class__.__name__, hex(abs(id(self))), self.wrapped))
class response_seek_wrapper(seek_wrapper):
"""
Supports copying response objects and setting response body data.
"""
def __init__(self, wrapped):
seek_wrapper.__init__(self, wrapped)
self._headers = self.wrapped.info()
def __copy__(self):
cpy = seek_wrapper.__copy__(self)
# copy headers from delegate
cpy._headers = copy.copy(self.info())
return cpy
# Note that .info() and .geturl() (the only two urllib2 response methods
# that are not implemented by seek_wrapper) must be here explicitly rather
    # than by seek_wrapper's __getattr__ delegation, so that the nasty
# dynamically-created HTTPError classes in get_seek_wrapper_class() get the
# wrapped object's implementation, and not HTTPError's.
def info(self):
return self._headers
def geturl(self):
return self.wrapped.geturl()
def set_data(self, data):
self.seek(0)
self.read()
self.close()
cache = self._seek_wrapper__cache = StringIO()
cache.write(data)
self.seek(0)
class eoffile:
# file-like object that always claims to be at end-of-file...
def read(self, size=-1): return ""
def readline(self, size=-1): return ""
def __iter__(self): return self
def next(self): return ""
def close(self): pass
class eofresponse(eoffile):
def __init__(self, url, headers, code, msg):
self._url = url
self._headers = headers
self.code = code
self.msg = msg
def geturl(self): return self._url
def info(self): return self._headers
class closeable_response:
"""Avoids unnecessarily clobbering urllib.addinfourl methods on .close().
Only supports responses returned by mechanize.HTTPHandler.
After .close(), the following methods are supported:
.read()
.readline()
.info()
.geturl()
.__iter__()
.next()
.close()
and the following attributes are supported:
.code
.msg
Also supports pickling (but the stdlib currently does something to prevent
it: http://python.org/sf/1144636).
"""
    # presence of this attr indicates the response is usable after .close()
closeable_response = None
def __init__(self, fp, headers, url, code, msg):
self._set_fp(fp)
self._headers = headers
self._url = url
self.code = code
self.msg = msg
def _set_fp(self, fp):
self.fp = fp
self.read = self.fp.read
self.readline = self.fp.readline
if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
if hasattr(self.fp, "fileno"):
self.fileno = self.fp.fileno
else:
self.fileno = lambda: None
self.__iter__ = self.fp.__iter__
self.next = self.fp.next
def __repr__(self):
return '<%s at %s whose fp = %r>' % (
self.__class__.__name__, hex(abs(id(self))), self.fp)
def info(self):
return self._headers
def geturl(self):
return self._url
def close(self):
self.fp._close = True
wrapped = self.fp
wrapped.close()
new_wrapped = eofresponse(
self._url, self._headers, self.code, self.msg)
self._set_fp(new_wrapped)
def __getstate__(self):
# There are three obvious options here:
# 1. truncate
# 2. read to end
# 3. close socket, pickle state including read position, then open
# again on unpickle and use Range header
# XXXX um, 4. refuse to pickle unless .close()d. This is better,
# actually ("errors should never pass silently"). Pickling doesn't
# work anyway ATM, because of http://python.org/sf/1144636 so fix
# this later
# 2 breaks pickle protocol, because one expects the original object
# to be left unscathed by pickling. 3 is too complicated and
# surprising (and too much work ;-) to happen in a sane __getstate__.
# So we do 1.
state = self.__dict__.copy()
new_wrapped = eofresponse(
self._url, self._headers, self.code, self.msg)
state["wrapped"] = new_wrapped
return state
def test_response(data='test data', headers=[],
url="http://example.com/", code=200, msg="OK"):
return make_response(data, headers, url, code, msg)
def test_html_response(data='test data', headers=[],
url="http://example.com/", code=200, msg="OK"):
headers += [("Content-type", "text/html")]
return make_response(data, headers, url, code, msg)
def make_response(data, headers, url, code, msg):
"""Convenient factory for objects implementing response interface.
data: string containing response body data
headers: sequence of (name, value) pairs
url: URL of response
code: integer response code (e.g. 200)
msg: string response code message (e.g. "OK")
"""
mime_headers = make_headers(headers)
r = closeable_response(StringIO(data), mime_headers, url, code, msg)
return response_seek_wrapper(r)
def make_headers(headers):
"""
headers: sequence of (name, value) pairs
"""
hdr_text = []
for name_value in headers:
hdr_text.append("%s: %s" % name_value)
return mimetools.Message(StringIO("\n".join(hdr_text)))
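
# A minimal sketch (not part of the original module) of the response factory
# defined above; the body, header and URL values are placeholders.
##
## r = make_response("hello", [("Content-type", "text/plain")],
##                   "http://example.com/", 200, "OK")
## r.read(2)                    # -> "he"
## r.seek(0)                    # possible thanks to response_seek_wrapper
## r.get_data()                 # -> "hello"
## r.info()["Content-type"]     # -> "text/plain"
## r.geturl()                   # -> "http://example.com/"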
# Rest of this module is especially horrible, but needed, at least until we
# fork urllib2.  Even then, we may want to preserve urllib2 compatibility.
def get_seek_wrapper_class(response):
# in order to wrap response objects that are also exceptions, we must
# dynamically subclass the exception :-(((
if (isinstance(response, urllib2.HTTPError) and
not hasattr(response, "seek")):
if response.__class__.__module__ == "__builtin__":
exc_class_name = response.__class__.__name__
else:
exc_class_name = "%s.%s" % (
response.__class__.__module__, response.__class__.__name__)
class httperror_seek_wrapper(response_seek_wrapper, response.__class__):
# this only derives from HTTPError in order to be a subclass --
# the HTTPError behaviour comes from delegation
_exc_class_name = exc_class_name
def __init__(self, wrapped):
response_seek_wrapper.__init__(self, wrapped)
# be compatible with undocumented HTTPError attributes :-(
self.hdrs = wrapped.info()
self.filename = wrapped.geturl()
def __repr__(self):
return (
"<%s (%s instance) at %s "
"whose wrapped object = %r>" % (
self.__class__.__name__, self._exc_class_name,
hex(abs(id(self))), self.wrapped)
)
wrapper_class = httperror_seek_wrapper
else:
wrapper_class = response_seek_wrapper
return wrapper_class
def seek_wrapped_response(response):
"""Return a copy of response that supports seekable response interface.
Accepts responses from both mechanize and urllib2 handlers.
    Copes with both ordinary response instances and HTTPError instances (which
can't be simply wrapped due to the requirement of preserving the exception
base class).
"""
if not hasattr(response, "seek"):
wrapper_class = get_seek_wrapper_class(response)
response = wrapper_class(response)
assert hasattr(response, "get_data")
return response
def upgrade_response(response):
"""Return a copy of response that supports Browser response interface.
Browser response interface is that of "seekable responses"
(response_seek_wrapper), plus the requirement that responses must be
useable after .close() (closeable_response).
Accepts responses from both mechanize and urllib2 handlers.
Copes with both ordinary response instances and HTTPError instances (which
can't be simply wrapped due to the requirement of preserving the exception
base class).
"""
wrapper_class = get_seek_wrapper_class(response)
if hasattr(response, "closeable_response"):
if not hasattr(response, "seek"):
response = wrapper_class(response)
assert hasattr(response, "get_data")
return copy.copy(response)
# a urllib2 handler constructed the response, i.e. the response is an
# urllib.addinfourl or a urllib2.HTTPError, instead of a
# _Util.closeable_response as returned by e.g. mechanize.HTTPHandler
try:
code = response.code
except AttributeError:
code = None
try:
msg = response.msg
except AttributeError:
msg = None
# may have already-.read() data from .seek() cache
data = None
get_data = getattr(response, "get_data", None)
if get_data:
data = get_data()
response = closeable_response(
response.fp, response.info(), response.geturl(), code, msg)
response = wrapper_class(response)
if data:
response.set_data(data)
return response
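
# A minimal sketch (not part of the original module): giving a
# urllib2.HTTPError the seekable-response interface via
# seek_wrapped_response(), which relies on get_seek_wrapper_class() to
# subclass the exception dynamically.  The URL and body are placeholders.
##
## err = urllib2.HTTPError("http://example.com/missing", 404, "Not Found",
##                         make_headers([("Content-type", "text/html")]),
##                         StringIO("gone"))
## wrapped = seek_wrapped_response(err)
## wrapped.get_data()                      # -> "gone"
## wrapped.seek(0)
## isinstance(wrapped, urllib2.HTTPError)  # -> True, so it can still be raised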

View File

@ -0,0 +1,240 @@
"""RFC 3986 URI parsing and relative reference resolution / absolutization.
(aka splitting and joining)
Copyright 2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
import sys, re, posixpath, urllib
## def chr_range(a, b):
## return "".join(map(chr, range(ord(a), ord(b)+1)))
## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
## "abcdefghijklmnopqrstuvwxyz"
## "0123456789"
## "-_.~")
## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
# this re matches any character that's not in URI_CHARS
BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
def clean_url(url, encoding):
# percent-encode illegal URI characters
    # Trying to come up with test cases for this gave me a headache, revisit
    # when we do switch to unicode.
# Somebody else's comments (lost the attribution):
## - IE will return you the url in the encoding you send it
## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
## characters in your link. It will send you utf-8 however if there are...
if type(url) == type(""):
url = url.decode(encoding, "replace")
url = url.strip()
# for second param to urllib.quote(), we want URI_CHARS, minus the
# 'always_safe' characters that urllib.quote() never percent-encodes
return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
def is_clean_uri(uri):
"""
>>> is_clean_uri("ABC!")
True
>>> is_clean_uri(u"ABC!")
True
>>> is_clean_uri("ABC|")
False
>>> is_clean_uri(u"ABC|")
False
>>> is_clean_uri("http://example.com/0")
True
>>> is_clean_uri(u"http://example.com/0")
True
"""
    # note module re treats bytestrings as though they were decoded as latin-1
# so this function accepts both unicode and bytestrings
return not bool(BAD_URI_CHARS_RE.search(uri))
SPLIT_MATCH = re.compile(
r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match
def urlsplit(absolute_uri):
"""Return scheme, authority, path, query, fragment."""
match = SPLIT_MATCH(absolute_uri)
if match:
g = match.groups()
return g[1], g[3], g[4], g[6], g[8]
def urlunsplit(parts):
scheme, authority, path, query, fragment = parts
r = []
append = r.append
if scheme is not None:
append(scheme)
append(":")
if authority is not None:
append("//")
append(authority)
append(path)
if query is not None:
append("?")
append(query)
if fragment is not None:
append("#")
append(fragment)
return "".join(r)
def urljoin(base_uri, uri_reference):
return urlunsplit(urljoin_parts(urlsplit(base_uri),
urlsplit(uri_reference)))
# oops, this doesn't do the same thing as the literal translation
# from the RFC below
## def urljoin_parts(base_parts, reference_parts):
## scheme, authority, path, query, fragment = base_parts
## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
## # compute target URI path
## if rpath == "":
## tpath = path
## else:
## tpath = rpath
## if not tpath.startswith("/"):
## tpath = merge(authority, path, tpath)
## tpath = posixpath.normpath(tpath)
## if rscheme is not None:
## return (rscheme, rauthority, tpath, rquery, rfragment)
## elif rauthority is not None:
## return (scheme, rauthority, tpath, rquery, rfragment)
## elif rpath == "":
## if rquery is not None:
## tquery = rquery
## else:
## tquery = query
## return (scheme, authority, tpath, tquery, rfragment)
## else:
## return (scheme, authority, tpath, rquery, rfragment)
def urljoin_parts(base_parts, reference_parts):
scheme, authority, path, query, fragment = base_parts
rscheme, rauthority, rpath, rquery, rfragment = reference_parts
if rscheme == scheme:
rscheme = None
if rscheme is not None:
tscheme, tauthority, tpath, tquery = (
rscheme, rauthority, remove_dot_segments(rpath), rquery)
else:
if rauthority is not None:
tauthority, tpath, tquery = (
rauthority, remove_dot_segments(rpath), rquery)
else:
if rpath == "":
tpath = path
if rquery is not None:
tquery = rquery
else:
tquery = query
else:
if rpath.startswith("/"):
tpath = remove_dot_segments(rpath)
else:
tpath = merge(authority, path, rpath)
tpath = remove_dot_segments(tpath)
tquery = rquery
tauthority = authority
tscheme = scheme
tfragment = rfragment
return (tscheme, tauthority, tpath, tquery, tfragment)
# um, something *vaguely* like this is what I want, but I have to generate
# lots of test cases first, if only to understand what it is that
# remove_dot_segments really does...
## def remove_dot_segments(path):
## if path == '':
## return ''
## comps = path.split('/')
## new_comps = []
## for comp in comps:
## if comp in ['.', '']:
## if not new_comps or new_comps[-1]:
## new_comps.append('')
## continue
## if comp != '..':
## new_comps.append(comp)
## elif new_comps:
## new_comps.pop()
## return '/'.join(new_comps)
def remove_dot_segments(path):
r = []
while path:
# A
if path.startswith("../"):
path = path[3:]
continue
if path.startswith("./"):
path = path[2:]
continue
# B
if path.startswith("/./"):
path = path[2:]
continue
if path == "/.":
path = "/"
continue
# C
if path.startswith("/../"):
path = path[3:]
if r:
r.pop()
continue
if path == "/..":
path = "/"
if r:
r.pop()
continue
# D
if path == ".":
path = path[1:]
continue
if path == "..":
path = path[2:]
continue
# E
start = 0
if path.startswith("/"):
start = 1
ii = path.find("/", start)
if ii < 0:
ii = None
r.append(path[:ii])
if ii is None:
break
path = path[ii:]
return "".join(r)
def merge(base_authority, base_path, ref_path):
# XXXX Oddly, the sample Perl implementation of this by Roy Fielding
# doesn't even take base_authority as a parameter, despite the wording in
# the RFC suggesting otherwise. Perhaps I'm missing some obvious identity.
#if base_authority is not None and base_path == "":
if base_path == "":
return "/" + ref_path
ii = base_path.rfind("/")
if ii >= 0:
return base_path[:ii+1] + ref_path
return ref_path
if __name__ == "__main__":
import doctest
doctest.testmod()
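
# A few illustrative results (not part of the original module) for the
# functions defined above:
##
## urlsplit("http://example.com/a/b?q=1#frag")
##     # -> ('http', 'example.com', '/a/b', 'q=1', 'frag')
## urljoin("http://example.com/a/b/c", "../d")
##     # -> 'http://example.com/a/d'
## remove_dot_segments("/a/b/../c/./d")
##     # -> '/a/c/d'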

View File

@ -0,0 +1,16 @@
from urllib2 import BaseHandler
from _util import deprecation
from _response import response_seek_wrapper
class SeekableProcessor(BaseHandler):
"""Deprecated: Make responses seekable."""
def __init__(self):
deprecation(
"See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
def any_response(self, request, response):
if not hasattr(response, "seek"):
return response_seek_wrapper(response)
return response

View File

@ -0,0 +1,40 @@
from urllib2 import BaseHandler
from _request import Request
from _response import upgrade_response
from _util import deprecation
class HTTPRequestUpgradeProcessor(BaseHandler):
# upgrade urllib2.Request to this module's Request
# yuck!
handler_order = 0 # before anything else
def http_request(self, request):
if not hasattr(request, "add_unredirected_header"):
newrequest = Request(request._Request__original, request.data,
request.headers)
try: newrequest.origin_req_host = request.origin_req_host
except AttributeError: pass
try: newrequest.unverifiable = request.unverifiable
except AttributeError: pass
try: newrequest.visit = request.visit
except AttributeError: pass
request = newrequest
return request
https_request = http_request
class ResponseUpgradeProcessor(BaseHandler):
# upgrade responses to be .close()able without becoming unusable
handler_order = 0 # before anything else
def __init__(self):
deprecation(
"See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
def any_response(self, request, response):
if not hasattr(response, 'closeable_response'):
response = upgrade_response(response)
return response

View File

@ -0,0 +1,62 @@
# urllib2 work-alike interface
# ...from urllib2...
from urllib2 import \
URLError, \
HTTPError, \
GopherError
# ...and from mechanize
from _opener import OpenerDirector, \
SeekableResponseOpener, \
build_opener, install_opener, urlopen
from _auth import \
HTTPPasswordMgr, \
HTTPPasswordMgrWithDefaultRealm, \
AbstractBasicAuthHandler, \
AbstractDigestAuthHandler, \
HTTPProxyPasswordMgr, \
ProxyHandler, \
ProxyBasicAuthHandler, \
ProxyDigestAuthHandler, \
HTTPBasicAuthHandler, \
HTTPDigestAuthHandler, \
HTTPSClientCertMgr
from _request import \
Request
from _http import \
RobotExclusionError
# handlers...
# ...from urllib2...
from urllib2 import \
BaseHandler, \
UnknownHandler, \
FTPHandler, \
CacheFTPHandler, \
FileHandler, \
GopherHandler
# ...and from mechanize
from _http import \
HTTPHandler, \
HTTPDefaultErrorHandler, \
HTTPRedirectHandler, \
HTTPEquivProcessor, \
HTTPCookieProcessor, \
HTTPRefererProcessor, \
HTTPRefreshProcessor, \
HTTPErrorProcessor, \
HTTPRobotRulesProcessor
from _upgrade import \
HTTPRequestUpgradeProcessor, \
ResponseUpgradeProcessor
from _debug import \
HTTPResponseDebugProcessor, \
HTTPRedirectDebugProcessor
from _seek import \
SeekableProcessor
# crap ATM
## from _gzip import \
## HTTPGzipProcessor
import httplib
if hasattr(httplib, 'HTTPS'):
from _http import HTTPSHandler
del httplib

View File

@ -0,0 +1,348 @@
"""Convenient HTTP UserAgent class.
This is a subclass of urllib2.OpenerDirector.
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import sys, warnings, urllib2
import _opener
import _urllib2
import _auth
import _gzip
import _response
class UserAgentBase(_opener.OpenerDirector):
"""Convenient user-agent class.
Do not use .add_handler() to add a handler for something already dealt with
by this code.
The only reason at present for the distinction between UserAgent and
UserAgentBase is so that classes that depend on .seek()able responses
(e.g. mechanize.Browser) can inherit from UserAgentBase. The subclass
UserAgent exposes a .set_seekable_responses() method that allows switching
off the adding of a .seek() method to responses.
Public attributes:
addheaders: list of (name, value) pairs specifying headers to send with
every request, unless they are overridden in the Request instance.
>>> ua = UserAgentBase()
>>> ua.addheaders = [
... ("User-agent", "Mozilla/5.0 (compatible)"),
... ("From", "responsible.person@example.com")]
"""
handler_classes = {
# scheme handlers
"http": _urllib2.HTTPHandler,
# CacheFTPHandler is buggy, at least in 2.3, so we don't use it
"ftp": _urllib2.FTPHandler,
"file": _urllib2.FileHandler,
"gopher": _urllib2.GopherHandler,
# other handlers
"_unknown": _urllib2.UnknownHandler,
# HTTP{S,}Handler depend on HTTPErrorProcessor too
"_http_error": _urllib2.HTTPErrorProcessor,
"_http_request_upgrade": _urllib2.HTTPRequestUpgradeProcessor,
"_http_default_error": _urllib2.HTTPDefaultErrorHandler,
# feature handlers
"_basicauth": _urllib2.HTTPBasicAuthHandler,
"_digestauth": _urllib2.HTTPDigestAuthHandler,
"_redirect": _urllib2.HTTPRedirectHandler,
"_cookies": _urllib2.HTTPCookieProcessor,
"_refresh": _urllib2.HTTPRefreshProcessor,
"_equiv": _urllib2.HTTPEquivProcessor,
"_proxy": _urllib2.ProxyHandler,
"_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
"_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
"_robots": _urllib2.HTTPRobotRulesProcessor,
"_gzip": _gzip.HTTPGzipProcessor, # experimental!
# debug handlers
"_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
"_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
}
default_schemes = ["http", "ftp", "file", "gopher"]
default_others = ["_unknown", "_http_error", "_http_request_upgrade",
"_http_default_error",
]
default_features = ["_redirect", "_cookies",
"_refresh", "_equiv",
"_basicauth", "_digestauth",
"_proxy", "_proxy_basicauth", "_proxy_digestauth",
"_robots",
]
if hasattr(_urllib2, 'HTTPSHandler'):
handler_classes["https"] = _urllib2.HTTPSHandler
default_schemes.append("https")
def __init__(self):
_opener.OpenerDirector.__init__(self)
ua_handlers = self._ua_handlers = {}
for scheme in (self.default_schemes+
self.default_others+
self.default_features):
klass = self.handler_classes[scheme]
ua_handlers[scheme] = klass()
for handler in ua_handlers.itervalues():
self.add_handler(handler)
# Yuck.
# Ensure correct default constructor args were passed to
# HTTPRefreshProcessor and HTTPEquivProcessor.
if "_refresh" in ua_handlers:
self.set_handle_refresh(True)
if "_equiv" in ua_handlers:
self.set_handle_equiv(True)
# Ensure default password managers are installed.
pm = ppm = None
if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
if ("_proxy_basicauth" in ua_handlers or
"_proxy_digestauth" in ua_handlers):
ppm = _auth.HTTPProxyPasswordMgr()
self.set_password_manager(pm)
self.set_proxy_password_manager(ppm)
# set default certificate manager
if "https" in ua_handlers:
cm = _urllib2.HTTPSClientCertMgr()
self.set_client_cert_manager(cm)
def close(self):
_opener.OpenerDirector.close(self)
self._ua_handlers = None
# XXX
## def set_timeout(self, timeout):
## self._timeout = timeout
## def set_http_connection_cache(self, conn_cache):
## self._http_conn_cache = conn_cache
## def set_ftp_connection_cache(self, conn_cache):
## # XXX ATM, FTP has cache as part of handler; should it be separate?
## self._ftp_conn_cache = conn_cache
def set_handled_schemes(self, schemes):
"""Set sequence of URL scheme (protocol) strings.
For example: ua.set_handled_schemes(["http", "ftp"])
If this fails (with ValueError) because you've passed an unknown
scheme, the set of handled schemes will not be changed.
"""
want = {}
for scheme in schemes:
if scheme.startswith("_"):
raise ValueError("not a scheme '%s'" % scheme)
if scheme not in self.handler_classes:
raise ValueError("unknown scheme '%s'")
want[scheme] = None
# get rid of scheme handlers we don't want
for scheme, oldhandler in self._ua_handlers.items():
if scheme.startswith("_"): continue # not a scheme handler
if scheme not in want:
self._replace_handler(scheme, None)
else:
del want[scheme] # already got it
# add the scheme handlers that are missing
for scheme in want.keys():
self._set_handler(scheme, True)
def set_cookiejar(self, cookiejar):
"""Set a mechanize.CookieJar, or None."""
self._set_handler("_cookies", obj=cookiejar)
# XXX could use Greg Stein's httpx for some of this instead?
# or httplib2??
def set_proxies(self, proxies):
"""Set a dictionary mapping URL scheme to proxy specification, or None.
e.g. {"http": "joe:password@myproxy.example.com:3128",
"ftp": "proxy.example.com"}
"""
self._set_handler("_proxy", obj=proxies)
def add_password(self, url, user, password, realm=None):
self._password_manager.add_password(realm, url, user, password)
def add_proxy_password(self, user, password, hostport=None, realm=None):
self._proxy_password_manager.add_password(
realm, hostport, user, password)
def add_client_certificate(self, url, key_file, cert_file):
"""Add an SSL client certificate, for HTTPS client auth.
key_file and cert_file must be filenames of the key and certificate
files, in PEM format. You can use e.g. OpenSSL to convert a p12 (PKCS
12) file to PEM format:
openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
openssl pkcs12 -nocerts -in cert.p12 -out key.pem
Note that client certificate password input is very inflexible ATM. At
the moment this seems to be console only, which is presumably the
default behaviour of libopenssl. In future mechanize may support
third-party libraries that (I assume) allow more options here.
"""
self._client_cert_manager.add_key_cert(url, key_file, cert_file)
# the following are rarely useful -- use add_password / add_proxy_password
# instead
def set_password_manager(self, password_manager):
"""Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
self._password_manager = password_manager
self._set_handler("_basicauth", obj=password_manager)
self._set_handler("_digestauth", obj=password_manager)
def set_proxy_password_manager(self, password_manager):
"""Set a mechanize.HTTPProxyPasswordMgr, or None."""
self._proxy_password_manager = password_manager
self._set_handler("_proxy_basicauth", obj=password_manager)
self._set_handler("_proxy_digestauth", obj=password_manager)
def set_client_cert_manager(self, cert_manager):
"""Set a mechanize.HTTPClientCertMgr, or None."""
self._client_cert_manager = cert_manager
handler = self._ua_handlers["https"]
handler.client_cert_manager = cert_manager
# these methods all take a boolean parameter
def set_handle_robots(self, handle):
"""Set whether to observe rules from robots.txt."""
self._set_handler("_robots", handle)
def set_handle_redirect(self, handle):
"""Set whether to handle HTTP 30x redirections."""
self._set_handler("_redirect", handle)
def set_handle_refresh(self, handle, max_time=None, honor_time=True):
"""Set whether to handle HTTP Refresh headers."""
self._set_handler("_refresh", handle, constructor_kwds=
{"max_time": max_time, "honor_time": honor_time})
def set_handle_equiv(self, handle, head_parser_class=None):
"""Set whether to treat HTML http-equiv headers like HTTP headers.
Response objects may be .seek()able if this is set (currently returned
responses are, raised HTTPError exception responses are not).
"""
if head_parser_class is not None:
constructor_kwds = {"head_parser_class": head_parser_class}
else:
constructor_kwds={}
self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)
def set_handle_gzip(self, handle):
"""Handle gzip transfer encoding.
"""
if handle:
warnings.warn(
"gzip transfer encoding is experimental!", stacklevel=2)
self._set_handler("_gzip", handle)
def set_debug_redirects(self, handle):
"""Log information about HTTP redirects (including refreshes).
Logging is performed using module logging. The logger name is
"mechanize.http_redirects". To actually print some debug output,
eg:
import sys, logging
logger = logging.getLogger("mechanize.http_redirects")
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.INFO)
Other logger names relevant to this module:
"mechanize.http_responses"
"mechanize.cookies" (or "cookielib" if running Python 2.4)
To turn on everything:
import sys, logging
logger = logging.getLogger("mechanize")
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.INFO)
"""
self._set_handler("_debug_redirect", handle)
def set_debug_responses(self, handle):
"""Log HTTP response bodies.
See docstring for .set_debug_redirects() for details of logging.
Response objects may be .seek()able if this is set (currently returned
responses are, raised HTTPError exception responses are not).
"""
self._set_handler("_debug_response_body", handle)
def set_debug_http(self, handle):
"""Print HTTP headers to sys.stdout."""
level = int(bool(handle))
for scheme in "http", "https":
h = self._ua_handlers.get(scheme)
if h is not None:
h.set_http_debuglevel(level)
def _set_handler(self, name, handle=None, obj=None,
constructor_args=(), constructor_kwds={}):
if handle is None:
handle = obj is not None
if handle:
handler_class = self.handler_classes[name]
if obj is not None:
newhandler = handler_class(obj)
else:
newhandler = handler_class(*constructor_args, **constructor_kwds)
else:
newhandler = None
self._replace_handler(name, newhandler)
def _replace_handler(self, name, newhandler=None):
# first, if handler was previously added, remove it
if name is not None:
handler = self._ua_handlers.get(name)
if handler:
try:
self.handlers.remove(handler)
except ValueError:
pass
# then add the replacement, if any
if newhandler is not None:
self.add_handler(newhandler)
self._ua_handlers[name] = newhandler
class UserAgent(UserAgentBase):
def __init__(self):
UserAgentBase.__init__(self)
self._seekable = False
def set_seekable_responses(self, handle):
"""Make response objects .seek()able."""
self._seekable = bool(handle)
def open(self, fullurl, data=None):
if self._seekable:
def bound_open(fullurl, data=None):
return UserAgentBase.open(self, fullurl, data)
response = _opener.wrapped_open(
bound_open, _response.seek_wrapped_response, fullurl, data)
else:
response = UserAgentBase.open(self, fullurl, data)
return response
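
# A minimal configuration sketch (not part of the original module): the host
# names, credentials and proxy below are placeholders.
##
## ua = UserAgent()
## ua.set_handled_schemes(["http", "ftp"])
## ua.set_proxies({"http": "proxy.example.com:3128"})
## ua.add_password("http://example.com/protected/", "joe", "secret")
## ua.set_handle_robots(False)
## ua.set_seekable_responses(True)
## ua.set_debug_http(True)
## response = ua.open("http://example.com/")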

View File

@ -0,0 +1,279 @@
"""Utility functions and date/time routines.
Copyright 2002-2006 John J Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import re, string, time, warnings
def deprecation(message):
warnings.warn(message, DeprecationWarning, stacklevel=3)
def hide_deprecations():
warnings.filterwarnings('ignore', category=DeprecationWarning)
def reset_deprecations():
warnings.filterwarnings('default', category=DeprecationWarning)
def isstringlike(x):
try: x+""
except: return False
else: return True
## def caller():
## try:
## raise SyntaxError
## except:
## import sys
## return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name
from calendar import timegm
# Date/time conversion routines for formats used by the HTTP protocol.
EPOCH = 1970
def my_timegm(tt):
year, month, mday, hour, min, sec = tt[:6]
if ((year >= EPOCH) and (1 <= month <= 12) and (1 <= mday <= 31) and
(0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
return timegm(tt)
else:
return None
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
months_lower = []
for month in months: months_lower.append(month.lower())
def time2isoz(t=None):
"""Return a string representing time in seconds since epoch, t.
If the function is called without an argument, it will use the current
time.
The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
representing Universal Time (UTC, aka GMT). An example of this format is:
1994-11-24 08:49:37Z
"""
if t is None: t = time.time()
year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
year, mon, mday, hour, min, sec)
def time2netscape(t=None):
"""Return a string representing time in seconds since epoch, t.
If the function is called without an argument, it will use the current
time.
The format of the returned string is like this:
Wed, DD-Mon-YYYY HH:MM:SS GMT
"""
if t is None: t = time.time()
year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
days[wday], mday, months[mon-1], year, hour, min, sec)
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
offset = None
if UTC_ZONES.has_key(tz):
offset = 0
else:
m = timezone_re.search(tz)
if m:
offset = 3600 * int(m.group(2))
if m.group(3):
offset = offset + 60 * int(m.group(3))
if m.group(1) == '-':
offset = -offset
return offset
def _str2time(day, mon, yr, hr, min, sec, tz):
# translate month name to number
# month numbers start with 1 (January)
try:
mon = months_lower.index(mon.lower())+1
except ValueError:
# maybe it's already a number
try:
imon = int(mon)
except ValueError:
return None
if 1 <= imon <= 12:
mon = imon
else:
return None
# make sure clock elements are defined
if hr is None: hr = 0
if min is None: min = 0
if sec is None: sec = 0
yr = int(yr)
day = int(day)
hr = int(hr)
min = int(min)
sec = int(sec)
if yr < 1000:
# find "obvious" year
cur_yr = time.localtime(time.time())[0]
m = cur_yr % 100
tmp = yr
yr = yr + cur_yr - m
m = m - tmp
if abs(m) > 50:
if m > 0: yr = yr + 100
else: yr = yr - 100
# convert UTC time tuple to seconds since epoch (not timezone-adjusted)
t = my_timegm((yr, mon, day, hr, min, sec, tz))
if t is not None:
# adjust time using timezone string, to get absolute time since epoch
if tz is None:
tz = "UTC"
tz = tz.upper()
offset = offset_from_tz_string(tz)
if offset is None:
return None
t = t - offset
return t
strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) (\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
wkday_re = re.compile(
r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
loose_http_re = re.compile(
r"""^
(\d\d?) # day
(?:\s+|[-\/])
(\w+) # month
(?:\s+|[-\/])
(\d+) # year
(?:
(?:\s+|:) # separator before clock
(\d\d?):(\d\d) # hour:min
(?::(\d\d))? # optional seconds
)? # optional clock
\s*
([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
\s*
(?:\(\w+\))? # ASCII representation of timezone in parens.
\s*$""", re.X)
def http2time(text):
"""Returns time in seconds since epoch of time represented by a string.
Return value is an integer.
    None is returned if the format of the string is unrecognized, the time is outside
the representable range, or the timezone string is not recognized. If the
string contains no timezone, UTC is assumed.
The timezone in the string may be numerical (like "-0800" or "+0100") or a
string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the
timezone strings equivalent to UTC (zero offset) are known to the function.
The function loosely parses the following formats:
Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format
Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format
Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format
09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday)
08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday)
08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday)
The parser ignores leading and trailing whitespace. The time may be
absent.
If the year is given with only 2 digits, the function will select the
century that makes the year closest to the current date.
"""
# fast exit for strictly conforming string
m = strict_re.search(text)
if m:
g = m.groups()
mon = months_lower.index(g[1].lower()) + 1
tt = (int(g[2]), mon, int(g[0]),
int(g[3]), int(g[4]), float(g[5]))
return my_timegm(tt)
# No, we need some messy parsing...
# clean up
text = text.lstrip()
text = wkday_re.sub("", text, 1) # Useless weekday
# tz is time zone specifier string
day, mon, yr, hr, min, sec, tz = [None]*7
# loose regexp parse
m = loose_http_re.search(text)
if m is not None:
day, mon, yr, hr, min, sec, tz = m.groups()
else:
return None # bad format
return _str2time(day, mon, yr, hr, min, sec, tz)
iso_re = re.compile(
"""^
(\d{4}) # year
[-\/]?
(\d\d?) # numerical month
[-\/]?
(\d\d?) # day
(?:
(?:\s+|[-:Tt]) # separator before clock
(\d\d?):?(\d\d) # hour:min
(?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional)
)? # optional clock
\s*
([-+]?\d\d?:?(:?\d\d)?
|Z|z)? # timezone (Z is "zero meridian", i.e. GMT)
\s*$""", re.X)
def iso2time(text):
"""
As for http2time, but parses the ISO 8601 formats:
1994-02-03 14:15:29 -0100 -- ISO 8601 format
1994-02-03 14:15:29 -- zone is optional
1994-02-03 -- only date
1994-02-03T14:15:29 -- Use T as separator
19940203T141529Z -- ISO 8601 compact format
19940203 -- only date
"""
# clean up
text = text.lstrip()
# tz is time zone specifier string
day, mon, yr, hr, min, sec, tz = [None]*7
# loose regexp parse
m = iso_re.search(text)
if m is not None:
# XXX there's an extra bit of the timezone I'm ignoring here: is
# this the right thing to do?
yr, mon, day, hr, min, sec, tz, _ = m.groups()
else:
return None # bad format
return _str2time(day, mon, yr, hr, min, sec, tz)
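
# A minimal sketch (not part of the original module): round-tripping the
# date/time helpers defined above.
##
## time2isoz(http2time("Wed, 09 Feb 1994 22:23:32 GMT"))
##     # -> '1994-02-09 22:23:32Z'
## time2isoz(iso2time("1994-02-03T14:15:29Z"))
##     # -> '1994-02-03 14:15:29Z'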