From 6fee09b9d2d649083063b78f50044f7f7188b098 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 11 Sep 2008 17:07:21 -0700 Subject: [PATCH] IGN:Use patched mechanize implementation that correctly closes connections --- src/calibre/__init__.py | 4 +- src/calibre/utils/mechanize/__init__.py | 125 ++ src/calibre/utils/mechanize/_auth.py | 500 +++++ src/calibre/utils/mechanize/_beautifulsoup.py | 1080 +++++++++++ src/calibre/utils/mechanize/_clientcookie.py | 1651 +++++++++++++++++ src/calibre/utils/mechanize/_debug.py | 28 + src/calibre/utils/mechanize/_gzip.py | 103 + src/calibre/utils/mechanize/_headersutil.py | 226 +++ src/calibre/utils/mechanize/_html.py | 607 ++++++ src/calibre/utils/mechanize/_http.py | 729 ++++++++ src/calibre/utils/mechanize/_lwpcookiejar.py | 185 ++ src/calibre/utils/mechanize/_mechanize.py | 656 +++++++ .../utils/mechanize/_mozillacookiejar.py | 159 ++ src/calibre/utils/mechanize/_msiecookiejar.py | 387 ++++ src/calibre/utils/mechanize/_opener.py | 421 +++++ src/calibre/utils/mechanize/_pullparser.py | 334 ++++ src/calibre/utils/mechanize/_request.py | 86 + src/calibre/utils/mechanize/_response.py | 515 +++++ src/calibre/utils/mechanize/_rfc3986.py | 240 +++ src/calibre/utils/mechanize/_seek.py | 16 + src/calibre/utils/mechanize/_upgrade.py | 40 + src/calibre/utils/mechanize/_urllib2.py | 62 + src/calibre/utils/mechanize/_useragent.py | 348 ++++ src/calibre/utils/mechanize/_util.py | 279 +++ 24 files changed, 8779 insertions(+), 2 deletions(-) create mode 100644 src/calibre/utils/mechanize/__init__.py create mode 100644 src/calibre/utils/mechanize/_auth.py create mode 100644 src/calibre/utils/mechanize/_beautifulsoup.py create mode 100644 src/calibre/utils/mechanize/_clientcookie.py create mode 100644 src/calibre/utils/mechanize/_debug.py create mode 100644 src/calibre/utils/mechanize/_gzip.py create mode 100644 src/calibre/utils/mechanize/_headersutil.py create mode 100644 src/calibre/utils/mechanize/_html.py create mode 100644 src/calibre/utils/mechanize/_http.py create mode 100644 src/calibre/utils/mechanize/_lwpcookiejar.py create mode 100644 src/calibre/utils/mechanize/_mechanize.py create mode 100644 src/calibre/utils/mechanize/_mozillacookiejar.py create mode 100644 src/calibre/utils/mechanize/_msiecookiejar.py create mode 100644 src/calibre/utils/mechanize/_opener.py create mode 100644 src/calibre/utils/mechanize/_pullparser.py create mode 100644 src/calibre/utils/mechanize/_request.py create mode 100644 src/calibre/utils/mechanize/_response.py create mode 100644 src/calibre/utils/mechanize/_rfc3986.py create mode 100644 src/calibre/utils/mechanize/_seek.py create mode 100644 src/calibre/utils/mechanize/_upgrade.py create mode 100644 src/calibre/utils/mechanize/_urllib2.py create mode 100644 src/calibre/utils/mechanize/_useragent.py create mode 100644 src/calibre/utils/mechanize/_util.py diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 536ac5002a..c69518c0c5 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, os, re, logging, time, subprocess, mechanize, atexit +import sys, os, re, logging, time, subprocess, atexit from htmlentitydefs import name2codepoint from math import floor from logging import Formatter @@ -14,7 +14,7 @@ from calibre.constants import iswindows, isosx, islinux, isfrozen, \ terminal_controller, preferred_encoding, \ __appname__, __version__, __author__, \ win32event, 
win32api, winerror, fcntl - +from calibre.utils import mechanize def unicode_path(path, abs=False): if not isinstance(path, unicode): diff --git a/src/calibre/utils/mechanize/__init__.py b/src/calibre/utils/mechanize/__init__.py new file mode 100644 index 0000000000..8bea889f30 --- /dev/null +++ b/src/calibre/utils/mechanize/__init__.py @@ -0,0 +1,125 @@ +__all__ = [ + 'AbstractBasicAuthHandler', + 'AbstractDigestAuthHandler', + 'BaseHandler', + 'Browser', + 'BrowserStateError', + 'CacheFTPHandler', + 'ContentTooShortError', + 'Cookie', + 'CookieJar', + 'CookiePolicy', + 'DefaultCookiePolicy', + 'DefaultFactory', + 'FTPHandler', + 'Factory', + 'FileCookieJar', + 'FileHandler', + 'FormNotFoundError', + 'FormsFactory', + 'GopherError', + 'GopherHandler', + 'HTTPBasicAuthHandler', + 'HTTPCookieProcessor', + 'HTTPDefaultErrorHandler', + 'HTTPDigestAuthHandler', + 'HTTPEquivProcessor', + 'HTTPError', + 'HTTPErrorProcessor', + 'HTTPHandler', + 'HTTPPasswordMgr', + 'HTTPPasswordMgrWithDefaultRealm', + 'HTTPProxyPasswordMgr', + 'HTTPRedirectDebugProcessor', + 'HTTPRedirectHandler', + 'HTTPRefererProcessor', + 'HTTPRefreshProcessor', + 'HTTPRequestUpgradeProcessor', + 'HTTPResponseDebugProcessor', + 'HTTPRobotRulesProcessor', + 'HTTPSClientCertMgr', + 'HTTPSHandler', + 'HeadParser', + 'History', + 'LWPCookieJar', + 'Link', + 'LinkNotFoundError', + 'LinksFactory', + 'LoadError', + 'MSIECookieJar', + 'MozillaCookieJar', + 'OpenerDirector', + 'OpenerFactory', + 'ParseError', + 'ProxyBasicAuthHandler', + 'ProxyDigestAuthHandler', + 'ProxyHandler', + 'Request', + 'ResponseUpgradeProcessor', + 'RobotExclusionError', + 'RobustFactory', + 'RobustFormsFactory', + 'RobustLinksFactory', + 'RobustTitleFactory', + 'SeekableProcessor', + 'SeekableResponseOpener', + 'TitleFactory', + 'URLError', + 'USE_BARE_EXCEPT', + 'UnknownHandler', + 'UserAgent', + 'UserAgentBase', + 'XHTMLCompatibleHeadParser', + '__version__', + 'build_opener', + 'install_opener', + 'lwp_cookie_str', + 'make_response', + 'request_host', + 'response_seek_wrapper', # XXX deprecate in public interface? + 'seek_wrapped_response' # XXX should probably use this internally in place of response_seek_wrapper() + 'str2time', + 'urlopen', + 'urlretrieve'] + +from _mechanize import __version__ + +# high-level stateful browser-style interface +from _mechanize import \ + Browser, History, \ + BrowserStateError, LinkNotFoundError, FormNotFoundError + +# configurable URL-opener interface +from _useragent import UserAgentBase, UserAgent +from _html import \ + ParseError, \ + Link, \ + Factory, DefaultFactory, RobustFactory, \ + FormsFactory, LinksFactory, TitleFactory, \ + RobustFormsFactory, RobustLinksFactory, RobustTitleFactory + +# urllib2 work-alike interface (part from mechanize, part from urllib2) +# This is a superset of the urllib2 interface. 
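+
+# A quick sketch of typical use of the package (hypothetical URL; any
+# reachable server will do):
+#
+#     import mechanize
+#     br = mechanize.Browser()
+#     response = br.open("http://example.com/")
+#     print response.read()[:200]
+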
+from _urllib2 import *
+
+# misc
+from _opener import ContentTooShortError, OpenerFactory, urlretrieve
+from _util import http2time as str2time
+from _response import \
+     response_seek_wrapper, seek_wrapped_response, make_response
+from _http import HeadParser
+try:
+    from _http import XHTMLCompatibleHeadParser
+except ImportError:
+    pass
+
+# cookies
+from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \
+     CookieJar, FileCookieJar, LoadError, request_host
+from _lwpcookiejar import LWPCookieJar, lwp_cookie_str
+from _mozillacookiejar import MozillaCookieJar
+from _msiecookiejar import MSIECookieJar
+
+# If you hate the idea of turning bugs into warnings, do:
+# import mechanize; mechanize.USE_BARE_EXCEPT = False
+USE_BARE_EXCEPT = True
diff --git a/src/calibre/utils/mechanize/_auth.py b/src/calibre/utils/mechanize/_auth.py
new file mode 100644
index 0000000000..9bb5873019
--- /dev/null
+++ b/src/calibre/utils/mechanize/_auth.py
@@ -0,0 +1,500 @@
+"""HTTP Authentication and Proxy support.
+
+All but HTTPProxyPasswordMgr come from Python 2.5.
+
+
+Copyright 2006 John J. Lee <jjl@pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+import re, base64, urlparse, posixpath, md5, sha, sys, copy, os, random, time
+
+from urllib2 import BaseHandler, HTTPError, parse_keqv_list, parse_http_list
+from urllib import getproxies, unquote, splittype, splituser, splitpasswd, \
+     splitport
+
+
+def _parse_proxy(proxy):
+    """Return (scheme, user, password, host/port) given a URL or an authority.
+
+    If a URL is supplied, it must have an authority (host:port) component.
+    According to RFC 3986, having an authority component means the URL must
+    have two slashes after the scheme:
+
+    >>> _parse_proxy('file:/ftp.example.com/')
+    Traceback (most recent call last):
+    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
+
+    The first three items of the returned tuple may be None.
+ + Examples of authority parsing: + + >>> _parse_proxy('proxy.example.com') + (None, None, None, 'proxy.example.com') + >>> _parse_proxy('proxy.example.com:3128') + (None, None, None, 'proxy.example.com:3128') + + The authority component may optionally include userinfo (assumed to be + username:password): + + >>> _parse_proxy('joe:password@proxy.example.com') + (None, 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('joe:password@proxy.example.com:3128') + (None, 'joe', 'password', 'proxy.example.com:3128') + + Same examples, but with URLs instead: + + >>> _parse_proxy('http://proxy.example.com/') + ('http', None, None, 'proxy.example.com') + >>> _parse_proxy('http://proxy.example.com:3128/') + ('http', None, None, 'proxy.example.com:3128') + >>> _parse_proxy('http://joe:password@proxy.example.com/') + ('http', 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('http://joe:password@proxy.example.com:3128') + ('http', 'joe', 'password', 'proxy.example.com:3128') + + Everything after the authority is ignored: + + >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') + ('ftp', 'joe', 'password', 'proxy.example.com') + + Test for no trailing '/' case: + + >>> _parse_proxy('http://joe:password@proxy.example.com') + ('http', 'joe', 'password', 'proxy.example.com') + + """ + scheme, r_scheme = splittype(proxy) + if not r_scheme.startswith("/"): + # authority + scheme = None + authority = proxy + else: + # URL + if not r_scheme.startswith("//"): + raise ValueError("proxy URL with no authority: %r" % proxy) + # We have an authority, so for RFC 3986-compliant URLs (by ss 3. + # and 3.3.), path is empty or starts with '/' + end = r_scheme.find("/", 2) + if end == -1: + end = None + authority = r_scheme[2:end] + userinfo, hostport = splituser(authority) + if userinfo is not None: + user, password = splitpasswd(userinfo) + else: + user = password = None + return scheme, user, password, hostport + +class ProxyHandler(BaseHandler): + # Proxies must be in front + handler_order = 100 + + def __init__(self, proxies=None): + if proxies is None: + proxies = getproxies() + assert hasattr(proxies, 'has_key'), "proxies must be a mapping" + self.proxies = proxies + for type, url in proxies.items(): + setattr(self, '%s_open' % type, + lambda r, proxy=url, type=type, meth=self.proxy_open: \ + meth(r, proxy, type)) + + def proxy_open(self, req, proxy, type): + orig_type = req.get_type() + proxy_type, user, password, hostport = _parse_proxy(proxy) + if proxy_type is None: + proxy_type = orig_type + if user and password: + user_pass = '%s:%s' % (unquote(user), unquote(password)) + creds = base64.encodestring(user_pass).strip() + req.add_header('Proxy-authorization', 'Basic ' + creds) + hostport = unquote(hostport) + req.set_proxy(hostport, proxy_type) + if orig_type == proxy_type: + # let other handlers take care of it + return None + else: + # need to start over, because the other handlers don't + # grok the proxy's URL type + # e.g. 
if we have a constructor arg proxies like so: + # {'http': 'ftp://proxy.example.com'}, we may end up turning + # a request for http://acme.example.com/a into one for + # ftp://proxy.example.com/a + return self.parent.open(req) + +class HTTPPasswordMgr: + + def __init__(self): + self.passwd = {} + + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if isinstance(uri, basestring): + uri = [uri] + if not realm in self.passwd: + self.passwd[realm] = {} + for default_port in True, False: + reduced_uri = tuple( + [self.reduce_uri(u, default_port) for u in uri]) + self.passwd[realm][reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + domains = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uris, authinfo in domains.iteritems(): + for uri in uris: + if self.is_suburi(uri, reduced_authuri): + return authinfo + return None, None + + def reduce_uri(self, uri, default_port=True): + """Accept authority or URI and extract only the authority and path.""" + # note HTTP URLs do not have a userinfo component + parts = urlparse.urlsplit(uri) + if parts[1]: + # URI + scheme = parts[0] + authority = parts[1] + path = parts[2] or '/' + else: + # host or host:port + scheme = None + authority = uri + path = '/' + host, port = splitport(authority) + if default_port and port is None and scheme is not None: + dport = {"http": 80, + "https": 443, + }.get(scheme) + if dport is not None: + authority = "%s:%d" % (host, dport) + return authority, path + + def is_suburi(self, base, test): + """Check if test is below base in a URI tree + + Both args must be URIs in reduced form. + """ + if base == test: + return True + if base[0] != test[0]: + return False + common = posixpath.commonprefix((base[1], test[1])) + if len(common) == len(base[1]): + return True + return False + + +class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): + + def find_user_password(self, realm, authuri): + user, password = HTTPPasswordMgr.find_user_password(self, realm, + authuri) + if user is not None: + return user, password + return HTTPPasswordMgr.find_user_password(self, None, authuri) + + +class AbstractBasicAuthHandler: + + rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I) + + # XXX there can actually be multiple auth-schemes in a + # www-authenticate header. 
should probably be a lot more careful + # in parsing them to extract multiple alternatives + + def __init__(self, password_mgr=None): + if password_mgr is None: + password_mgr = HTTPPasswordMgr() + self.passwd = password_mgr + self.add_password = self.passwd.add_password + + def http_error_auth_reqed(self, authreq, host, req, headers): + # host may be an authority (without userinfo) or a URL with an + # authority + # XXX could be multiple headers + authreq = headers.get(authreq, None) + if authreq: + mo = AbstractBasicAuthHandler.rx.search(authreq) + if mo: + scheme, realm = mo.groups() + if scheme.lower() == 'basic': + return self.retry_http_basic_auth(host, req, realm) + + def retry_http_basic_auth(self, host, req, realm): + user, pw = self.passwd.find_user_password(realm, host) + if pw is not None: + raw = "%s:%s" % (user, pw) + auth = 'Basic %s' % base64.encodestring(raw).strip() + if req.headers.get(self.auth_header, None) == auth: + return None + newreq = copy.copy(req) + newreq.add_header(self.auth_header, auth) + newreq.visit = False + return self.parent.open(newreq) + else: + return None + + +class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Authorization' + + def http_error_401(self, req, fp, code, msg, headers): + url = req.get_full_url() + return self.http_error_auth_reqed('www-authenticate', + url, req, headers) + + +class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Proxy-authorization' + + def http_error_407(self, req, fp, code, msg, headers): + # http_error_auth_reqed requires that there is no userinfo component in + # authority. Assume there isn't one, since urllib2 does not (and + # should not, RFC 3986 s. 3.2.1) support requests for URLs containing + # userinfo. + authority = req.get_host() + return self.http_error_auth_reqed('proxy-authenticate', + authority, req, headers) + + +def randombytes(n): + """Return n random bytes.""" + # Use /dev/urandom if it is available. Fall back to random module + # if not. It might be worthwhile to extend this function to use + # other platform-specific mechanisms for getting random bytes. + if os.path.exists("/dev/urandom"): + f = open("/dev/urandom") + s = f.read(n) + f.close() + return s + else: + L = [chr(random.randrange(0, 256)) for i in range(n)] + return "".join(L) + +class AbstractDigestAuthHandler: + # Digest authentication is specified in RFC 2617. + + # XXX The client does not inspect the Authentication-Info header + # in a successful response. + + # XXX It should be possible to test this implementation against + # a mock server that just generates a static set of challenges. + + # XXX qop="auth-int" supports is shaky + + def __init__(self, passwd=None): + if passwd is None: + passwd = HTTPPasswordMgr() + self.passwd = passwd + self.add_password = self.passwd.add_password + self.retried = 0 + self.nonce_count = 0 + + def reset_retry_count(self): + self.retried = 0 + + def http_error_auth_reqed(self, auth_header, host, req, headers): + authreq = headers.get(auth_header, None) + if self.retried > 5: + # Don't fail endlessly - if we failed once, we'll probably + # fail a second time. Hm. Unless the Password Manager is + # prompting for the information. Crap. 
This isn't great + # but it's better than the current 'repeat until recursion + # depth exceeded' approach + raise HTTPError(req.get_full_url(), 401, "digest auth failed", + headers, None) + else: + self.retried += 1 + if authreq: + scheme = authreq.split()[0] + if scheme.lower() == 'digest': + return self.retry_http_digest_auth(req, authreq) + + def retry_http_digest_auth(self, req, auth): + token, challenge = auth.split(' ', 1) + chal = parse_keqv_list(parse_http_list(challenge)) + auth = self.get_authorization(req, chal) + if auth: + auth_val = 'Digest %s' % auth + if req.headers.get(self.auth_header, None) == auth_val: + return None + newreq = copy.copy(req) + newreq.add_unredirected_header(self.auth_header, auth_val) + newreq.visit = False + return self.parent.open(newreq) + + def get_cnonce(self, nonce): + # The cnonce-value is an opaque + # quoted string value provided by the client and used by both client + # and server to avoid chosen plaintext attacks, to provide mutual + # authentication, and to provide some message integrity protection. + # This isn't a fabulous effort, but it's probably Good Enough. + dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(), + randombytes(8))).hexdigest() + return dig[:16] + + def get_authorization(self, req, chal): + try: + realm = chal['realm'] + nonce = chal['nonce'] + qop = chal.get('qop') + algorithm = chal.get('algorithm', 'MD5') + # mod_digest doesn't send an opaque, even though it isn't + # supposed to be optional + opaque = chal.get('opaque', None) + except KeyError: + return None + + H, KD = self.get_algorithm_impls(algorithm) + if H is None: + return None + + user, pw = self.passwd.find_user_password(realm, req.get_full_url()) + if user is None: + return None + + # XXX not implemented yet + if req.has_data(): + entdig = self.get_entity_digest(req.get_data(), chal) + else: + entdig = None + + A1 = "%s:%s:%s" % (user, realm, pw) + A2 = "%s:%s" % (req.get_method(), + # XXX selector: what about proxies and full urls + req.get_selector()) + if qop == 'auth': + self.nonce_count += 1 + ncvalue = '%08x' % self.nonce_count + cnonce = self.get_cnonce(nonce) + noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) + respdig = KD(H(A1), noncebit) + elif qop is None: + respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) + else: + # XXX handle auth-int. + pass + + # XXX should the partial digests be encoded too? + + base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ + 'response="%s"' % (user, realm, nonce, req.get_selector(), + respdig) + if opaque: + base += ', opaque="%s"' % opaque + if entdig: + base += ', digest="%s"' % entdig + base += ', algorithm="%s"' % algorithm + if qop: + base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) + return base + + def get_algorithm_impls(self, algorithm): + # lambdas assume digest modules are imported at the top level + if algorithm == 'MD5': + H = lambda x: md5.new(x).hexdigest() + elif algorithm == 'SHA': + H = lambda x: sha.new(x).hexdigest() + # XXX MD5-sess + KD = lambda s, d: H("%s:%s" % (s, d)) + return H, KD + + def get_entity_digest(self, data, chal): + # XXX not implemented yet + return None + + +class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + """An authentication protocol defined by RFC 2069 + + Digest authentication improves on basic authentication because it + does not transmit passwords in the clear. 
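+
+    A usage sketch (hypothetical URL, realm and credentials; the handler
+    is wired into an opener exactly as with urllib2):
+
+        handler = HTTPDigestAuthHandler()
+        handler.add_password("realm", "http://example.com/", "joe", "secret")
+        opener = build_opener(handler)
+        opener.open("http://example.com/protected")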
+ """ + + auth_header = 'Authorization' + handler_order = 490 + + def http_error_401(self, req, fp, code, msg, headers): + host = urlparse.urlparse(req.get_full_url())[1] + retry = self.http_error_auth_reqed('www-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + + +class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + + auth_header = 'Proxy-Authorization' + handler_order = 490 + + def http_error_407(self, req, fp, code, msg, headers): + host = req.get_host() + retry = self.http_error_auth_reqed('proxy-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + + +# XXX ugly implementation, should probably not bother deriving +class HTTPProxyPasswordMgr(HTTPPasswordMgr): + # has default realm and host/port + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if uri is None or isinstance(uri, basestring): + uris = [uri] + else: + uris = uri + passwd_by_domain = self.passwd.setdefault(realm, {}) + for uri in uris: + for default_port in True, False: + reduced_uri = self.reduce_uri(uri, default_port) + passwd_by_domain[reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + attempts = [(realm, authuri), (None, authuri)] + # bleh, want default realm to take precedence over default + # URI/authority, hence this outer loop + for default_uri in False, True: + for realm, authuri in attempts: + authinfo_by_domain = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uri, authinfo in authinfo_by_domain.iteritems(): + if uri is None and not default_uri: + continue + if self.is_suburi(uri, reduced_authuri): + return authinfo + user, password = None, None + + if user is not None: + break + return user, password + + def reduce_uri(self, uri, default_port=True): + if uri is None: + return None + return HTTPPasswordMgr.reduce_uri(self, uri, default_port) + + def is_suburi(self, base, test): + if base is None: + # default to the proxy's host/port + hostport, path = test + base = (hostport, "/") + return HTTPPasswordMgr.is_suburi(self, base, test) + + +class HTTPSClientCertMgr(HTTPPasswordMgr): + # implementation inheritance: this is not a proper subclass + def add_key_cert(self, uri, key_file, cert_file): + self.add_password(None, uri, key_file, cert_file) + def find_key_cert(self, authuri): + return HTTPPasswordMgr.find_user_password(self, None, authuri) diff --git a/src/calibre/utils/mechanize/_beautifulsoup.py b/src/calibre/utils/mechanize/_beautifulsoup.py new file mode 100644 index 0000000000..2541dcc63a --- /dev/null +++ b/src/calibre/utils/mechanize/_beautifulsoup.py @@ -0,0 +1,1080 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +v2.1.1 +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance +into a tree representation. It provides methods and Pythonic idioms +that make it easy to search and modify the tree. + +A well-formed XML/HTML document will yield a well-formed data +structure. An ill-formed XML/HTML document will yield a +correspondingly ill-formed data structure. If your document is only +locally well-formed, you can use this library to find and process the +well-formed part of it. The BeautifulSoup class has heuristics for +obtaining a sensible parse tree in the face of common HTML errors. + +Beautiful Soup has no external dependencies. It works with Python 2.2 +and up. 
+ +Beautiful Soup defines classes for four different parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. + + * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML + that trips up BeautifulSoup. + + * BeautifulSOAP, for making it easier to parse XML documents that use + lots of subelements containing a single string, where you'd prefer + they put that string into an attribute (such as SOAP messages). + +You can subclass BeautifulStoneSoup or BeautifulSoup to create a +parsing strategy specific to an XML schema or a particular bizarre +HTML document. Typically your subclass would just override +SELF_CLOSING_TAGS and/or NESTABLE_TAGS. +""" +from __future__ import generators + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "2.1.1" +__date__ = "$Date: 2004/10/18 00:14:20 $" +__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson" +__license__ = "PSF" + +from sgmllib import SGMLParser, SGMLParseError +import types +import re +import sgmllib + +#This code makes Beautiful Soup able to parse XML with namespaces +sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') + +class NullType(object): + + """Similar to NoneType with a corresponding singleton instance + 'Null' that, unlike None, accepts any message and returns itself. + + Examples: + >>> Null("send", "a", "message")("and one more", + ... "and what you get still") is Null + True + """ + + def __new__(cls): return Null + def __call__(self, *args, **kwargs): return Null +## def __getstate__(self, *args): return Null + def __getattr__(self, attr): return Null + def __getitem__(self, item): return Null + def __setattr__(self, attr, value): pass + def __setitem__(self, item, value): pass + def __len__(self): return 0 + # FIXME: is this a python bug? otherwise ``for x in Null: pass`` + # never terminates... 
+ def __iter__(self): return iter([]) + def __contains__(self, item): return False + def __repr__(self): return "Null" +Null = object.__new__(NullType) + +class PageElement: + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=Null, previous=Null): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous = previous + self.next = Null + self.previousSibling = Null + self.nextSibling = Null + if self.parent and self.parent.contents: + self.previousSibling = self.parent.contents[-1] + self.previousSibling.nextSibling = self + + def findNext(self, name=None, attrs={}, text=None): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._first(self.fetchNext, name, attrs, text) + firstNext = findNext + + def fetchNext(self, name=None, attrs={}, text=None, limit=None): + """Returns all items that match the given criteria and appear + before after Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.nextGenerator) + + def findNextSibling(self, name=None, attrs={}, text=None): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._first(self.fetchNextSiblings, name, attrs, text) + firstNextSibling = findNextSibling + + def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator) + + def findPrevious(self, name=None, attrs={}, text=None): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._first(self.fetchPrevious, name, attrs, text) + + def fetchPrevious(self, name=None, attrs={}, text=None, limit=None): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._fetch(name, attrs, text, limit, self.previousGenerator) + firstPrevious = findPrevious + + def findPreviousSibling(self, name=None, attrs={}, text=None): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._first(self.fetchPreviousSiblings, name, attrs, text) + firstPreviousSibling = findPreviousSibling + + def fetchPreviousSiblings(self, name=None, attrs={}, text=None, + limit=None): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._fetch(name, attrs, text, limit, + self.previousSiblingGenerator) + + def findParent(self, name=None, attrs={}): + """Returns the closest parent of this Tag that matches the given + criteria.""" + r = Null + l = self.fetchParents(name, attrs, 1) + if l: + r = l[0] + return r + firstParent = findParent + + def fetchParents(self, name=None, attrs={}, limit=None): + """Returns the parents of this Tag that match the given + criteria.""" + return self._fetch(name, attrs, None, limit, self.parentGenerator) + + #These methods do the real heavy lifting. + + def _first(self, method, name, attrs, text): + r = Null + l = method(name, attrs, text, 1) + if l: + r = l[0] + return r + + def _fetch(self, name, attrs, text, limit, generator): + "Iterates over a generator looking for things that match." 
+ if not hasattr(attrs, 'items'): + attrs = {'class' : attrs} + + results = [] + g = generator() + while True: + try: + i = g.next() + except StopIteration: + break + found = None + if isinstance(i, Tag): + if not text: + if not name or self._matches(i, name): + match = True + for attr, matchAgainst in attrs.items(): + check = i.get(attr) + if not self._matches(check, matchAgainst): + match = False + break + if match: + found = i + elif text: + if self._matches(i, text): + found = i + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #Generators that can be used to navigate starting from both + #NavigableTexts and Tags. + def nextGenerator(self): + i = self + while i: + i = i.next + yield i + + def nextSiblingGenerator(self): + i = self + while i: + i = i.nextSibling + yield i + + def previousGenerator(self): + i = self + while i: + i = i.previous + yield i + + def previousSiblingGenerator(self): + i = self + while i: + i = i.previousSibling + yield i + + def parentGenerator(self): + i = self + while i: + i = i.parent + yield i + + def _matches(self, chunk, howToMatch): + #print 'looking for %s in %s' % (howToMatch, chunk) + # + # If given a list of items, return true if the list contains a + # text element that matches. + if isList(chunk) and not isinstance(chunk, Tag): + for tag in chunk: + if isinstance(tag, NavigableText) and self._matches(tag, howToMatch): + return True + return False + if callable(howToMatch): + return howToMatch(chunk) + if isinstance(chunk, Tag): + #Custom match methods take the tag as an argument, but all other + #ways of matching match the tag name as a string + chunk = chunk.name + #Now we know that chunk is a string + if not isinstance(chunk, basestring): + chunk = str(chunk) + if hasattr(howToMatch, 'match'): + # It's a regexp object. + return howToMatch.search(chunk) + if isList(howToMatch): + return chunk in howToMatch + if hasattr(howToMatch, 'items'): + return howToMatch.has_key(chunk) + #It's just a string + return str(howToMatch) == chunk + +class NavigableText(PageElement): + + def __getattr__(self, attr): + "For backwards compatibility, text.string gives you text" + if attr == 'string': + return self + else: + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + +class NavigableString(str, NavigableText): + pass + +class NavigableUnicodeString(unicode, NavigableText): + pass + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def __init__(self, name, attrs=None, parent=Null, previous=Null): + "Basic constructor." + self.name = name + if attrs == None: + attrs = [] + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self._getAttrMap().get(key, default) + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self._getAttrMap()[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." 
+ return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. + self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + fetch() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.fetch, args, kwargs) + + def __getattr__(self, tag): + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.first(tag[:-3]) + elif tag.find('__') != 0: + return self.first(tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. Should this be fixed?""" + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + return False + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self): + """Renders this tag as a string.""" + return str(self) + + def __unicode__(self): + return self.__str__(1) + + def __str__(self, needUnicode=None, showStructureIndent=None): + """Returns a string or Unicode representation of this tag and + its contents. 
+
+        NOTE: since Python's HTML parser consumes whitespace, this
+        method is not certain to reproduce the whitespace present in
+        the original string."""
+
+        attrs = []
+        if self.attrs:
+            for key, val in self.attrs:
+                attrs.append('%s="%s"' % (key, val))
+        close = ''
+        closeTag = ''
+        if self.isSelfClosing():
+            close = ' /'
+        else:
+            closeTag = '</%s>' % self.name
+        indentIncrement = None
+        if showStructureIndent != None:
+            indentIncrement = showStructureIndent
+            if not self.hidden:
+                indentIncrement += 1
+        contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
+        if showStructureIndent:
+            space = '\n%s' % (' ' * showStructureIndent)
+        if self.hidden:
+            s = contents
+        else:
+            s = []
+            attributeString = ''
+            if attrs:
+                attributeString = ' ' + ' '.join(attrs)
+            if showStructureIndent:
+                s.append(space)
+            s.append('<%s%s%s>' % (self.name, attributeString, close))
+            s.append(contents)
+            if closeTag and showStructureIndent != None:
+                s.append(space)
+            s.append(closeTag)
+            s = ''.join(s)
+        isUnicode = type(s) == types.UnicodeType
+        if needUnicode and not isUnicode:
+            s = unicode(s)
+        elif isUnicode and needUnicode==False:
+            s = str(s)
+        return s
+
+    def prettify(self, needUnicode=None):
+        return self.__str__(needUnicode, showStructureIndent=True)
+
+    def renderContents(self, showStructureIndent=None, needUnicode=None):
+        """Renders the contents of this tag as a (possibly Unicode)
+        string."""
+        s=[]
+        for c in self:
+            text = None
+            if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
+                text = unicode(c)
+            elif isinstance(c, Tag):
+                s.append(c.__str__(needUnicode, showStructureIndent))
+            elif needUnicode:
+                text = unicode(c)
+            else:
+                text = str(c)
+            if text:
+                if showStructureIndent != None:
+                    if text[-1] == '\n':
+                        text = text[:-1]
+                s.append(text)
+        return ''.join(s)
+
+    #Soup methods
+
+    def firstText(self, text, recursive=True):
+        """Convenience method to retrieve the first piece of text matching the
+        given criteria. 'text' can be a string, a regular expression object,
+        a callable that takes a string and returns whether or not the
+        string 'matches', etc."""
+        return self.first(recursive=recursive, text=text)
+
+    def fetchText(self, text, recursive=True, limit=None):
+        """Convenience method to retrieve all pieces of text matching the
+        given criteria. 'text' can be a string, a regular expression object,
+        a callable that takes a string and returns whether or not the
+        string 'matches', etc."""
+        return self.fetch(recursive=recursive, text=text, limit=limit)
+
+    def first(self, name=None, attrs={}, recursive=True, text=None):
+        """Return only the first child of this
+        Tag matching the given criteria."""
+        r = Null
+        l = self.fetch(name, attrs, recursive, text, 1)
+        if l:
+            r = l[0]
+        return r
+    findChild = first
+
+    def fetch(self, name=None, attrs={}, recursive=True, text=None,
+              limit=None):
+        """Extracts a list of Tag objects that match the given
+        criteria.  You can specify the name of the Tag and any
+        attributes you want the Tag to have.
+
+        The value of a key-value pair in the 'attrs' map can be a
+        string, a list of strings, a regular expression object, or a
+        callable that takes a string and returns whether or not the
+        string matches for some custom definition of 'matches'.
The + same is true of the tag name.""" + generator = self.recursiveChildGenerator + if not recursive: + generator = self.childGenerator + return self._fetch(name, attrs, text, limit, generator) + fetchChildren = fetch + + #Utility methods + + def isSelfClosing(self): + """Returns true iff this is a self-closing tag as defined in the HTML + standard. + + TODO: This is specific to BeautifulSoup and its subclasses, but it's + used by __str__""" + return self.name in BeautifulSoup.SELF_CLOSING_TAGS + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.contents.append(tag) + + #Private methods + + def _getAttrMap(self): + """Initializes a map representation of this tag's attributes, + if not already initialized.""" + if not getattr(self, 'attrMap'): + self.attrMap = {} + for (key, value) in self.attrs: + self.attrMap[key] = value + return self.attrMap + + #Generator methods + def childGenerator(self): + for i in range(0, len(self.contents)): + yield self.contents[i] + raise StopIteration + + def recursiveChildGenerator(self): + stack = [(self, 0)] + while stack: + tag, start = stack.pop() + if isinstance(tag, Tag): + for i in range(start, len(tag.contents)): + a = tag.contents[i] + yield a + if isinstance(a, Tag) and tag.contents: + if i < len(tag.contents) - 1: + stack.append((tag, i+1)) + stack.append((a, 0)) + break + raise StopIteration + + +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return hasattr(l, '__iter__') \ + or (type(l) in (types.ListType, types.TupleType)) + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. + Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out + of lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + +class BeautifulStoneSoup(Tag, SGMLParser): + + """This class contains the basic parser and fetch code. It defines + a parser that knows nothing about tag behavior except for the + following: + + You can't close a tag without closing all the tags it encloses. + That is, "" actually means + "". + + [Another possible explanation is "", but since + this class defines no SELF_CLOSING_TAGS, it will never use that + explanation.] + + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting.""" + + SELF_CLOSING_TAGS = {} + NESTABLE_TAGS = {} + RESET_NESTING_TAGS = {} + QUOTE_TAGS = {} + + #As a public service we will by default silently replace MS smart quotes + #and similar characters with their HTML or ASCII equivalents. 
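+#(A sketch of the effect: a stray Windows-1252 '\x97' in the input is
+#rewritten to '&mdash;' by the last PARSER_MASSAGE rule below, before
+#sgmllib ever sees it.)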
+    MS_CHARS = { '\x80' : '&euro;',
+                 '\x81' : ' ',
+                 '\x82' : '&sbquo;',
+                 '\x83' : '&fnof;',
+                 '\x84' : '&bdquo;',
+                 '\x85' : '&hellip;',
+                 '\x86' : '&dagger;',
+                 '\x87' : '&Dagger;',
+                 '\x88' : '&caret;',
+                 '\x89' : '%',
+                 '\x8A' : '&Scaron;',
+                 '\x8B' : '&lt;',
+                 '\x8C' : '&OElig;',
+                 '\x8D' : '?',
+                 '\x8E' : 'Z',
+                 '\x8F' : '?',
+                 '\x90' : '?',
+                 '\x91' : '&lsquo;',
+                 '\x92' : '&rsquo;',
+                 '\x93' : '&ldquo;',
+                 '\x94' : '&rdquo;',
+                 '\x95' : '&bull;',
+                 '\x96' : '&ndash;',
+                 '\x97' : '&mdash;',
+                 '\x98' : '&tilde;',
+                 '\x99' : '&trade;',
+                 '\x9a' : '&scaron;',
+                 '\x9b' : '&gt;',
+                 '\x9c' : '&oelig;',
+                 '\x9d' : '?',
+                 '\x9e' : 'z',
+                 '\x9f' : '&Yuml;',}
+
+    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+                       lambda(x):x.group(1) + ' />'),
+                      (re.compile('<!\s+([^<>]*)>'),
+                       lambda(x):'<!' + x.group(1) + '>'),
+                      (re.compile("([\x80-\x9f])"),
+                       lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
+                      ]
+
+    ROOT_TAG_NAME = '[document]'
+
+    def __init__(self, text=None, avoidParserProblems=True,
+                 initialTextIsEverything=True):
+        """Initialize this as the 'root tag' and feed in any text to
+        the parser.
+
+        NOTE about avoidParserProblems: sgmllib will process most bad
+        HTML, and BeautifulSoup has tricks for dealing with some HTML
+        that kills sgmllib, but Beautiful Soup can nonetheless choke
+        or lose data if your data uses self-closing tags or
+        declarations incorrectly. By default, Beautiful Soup sanitizes
+        its input to avoid the vast majority of these problems. The
+        problems are relatively rare, even in bad HTML, so feel free
+        to pass in False to avoidParserProblems if they don't apply to
+        you, and you'll get better performance. The only reason I have
+        this turned on by default is so I don't get so many tech
+        support questions.
+
+        The two most common instances of invalid HTML that will choke
+        sgmllib are fixed by the default parser massage techniques:
+
+         <br/>
(No space between name of closing tag and tag close) + (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get Beautiful Soup to scrub your input the way you + want.""" + Tag.__init__(self, self.ROOT_TAG_NAME) + if avoidParserProblems \ + and not isList(avoidParserProblems): + avoidParserProblems = self.PARSER_MASSAGE + self.avoidParserProblems = avoidParserProblems + SGMLParser.__init__(self) + self.quoteStack = [] + self.hidden = 1 + self.reset() + if hasattr(text, 'read'): + #It's a file-type object. + text = text.read() + if text: + self.feed(text) + if initialTextIsEverything: + self.done() + + def __getattr__(self, methodName): + """This method routes method call requests to either the SGMLParser + superclass or the Tag superclass, depending on the method name.""" + if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ + or methodName.find('do_') == 0: + return SGMLParser.__getattr__(self, methodName) + elif methodName.find('__') != 0: + return Tag.__getattr__(self, methodName) + else: + raise AttributeError + + def feed(self, text): + if self.avoidParserProblems: + for fix, m in self.avoidParserProblems: + text = fix.sub(m, text) + SGMLParser.feed(self, text) + + def done(self): + """Called when you're done parsing, so that the unclosed tags can be + correctly processed.""" + self.endData() #NEW + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + SGMLParser.reset(self) + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableText): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self): + currentData = ''.join(self.currentData) + if currentData: + if not currentData.strip(): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + c = NavigableString + if type(currentData) == types.UnicodeType: + c = NavigableUnicodeString + o = c(currentData) + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + self.currentData = [] + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. 
+        If inclusivePop is false, pops the tag
+        stack up to but *not* including the most recent instance of
+        the given tag."""
+        if name == self.ROOT_TAG_NAME:
+            return
+
+        numPops = 0
+        mostRecentTag = None
+        for i in range(len(self.tagStack)-1, 0, -1):
+            if name == self.tagStack[i].name:
+                numPops = len(self.tagStack)-i
+                break
+        if not inclusivePop:
+            numPops = numPops - 1
+
+        for i in range(0, numPops):
+            mostRecentTag = self.popTag()
+        return mostRecentTag
+
+    def _smartPop(self, name):
+
+        """We need to pop up to the previous tag of this type, unless
+        one of this tag's nesting reset triggers comes between this
+        tag and the previous tag of this type, OR unless this tag is a
+        generic nesting trigger and another generic nesting trigger
+        comes between this tag and the previous tag of this type.
+
+        Examples:
+         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
+         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
+         <p>FooBar<p> should pop to 'p', not 'b'.
+
+         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
+         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
+         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+        """
+
+        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
+        isNestable = nestingResetTriggers != None
+        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+        popTo = None
+        inclusive = True
+        for i in range(len(self.tagStack)-1, 0, -1):
+            p = self.tagStack[i]
+            if (not p or p.name == name) and not isNestable:
+                #Non-nestable tags get popped to the top or to their
+                #last occurrence.
+                popTo = name
+                break
+            if (nestingResetTriggers != None
+                and p.name in nestingResetTriggers) \
+                or (nestingResetTriggers == None and isResetNesting
+                    and self.RESET_NESTING_TAGS.has_key(p.name)):
+
+                #If we encounter one of the nesting reset triggers
+                #peculiar to this tag, or we encounter another tag
+                #that causes nesting to reset, pop up to but not
+                #including that tag.
+
+                popTo = p.name
+                inclusive = False
+                break
+            p = p.parent
+        if popTo:
+            self._popToTag(popTo, inclusive)
+
+    def unknown_starttag(self, name, attrs, selfClosing=0):
+        #print "Start tag %s" % name
+        if self.quoteStack:
+            #This is not a real tag.
+            #print "<%s> is not real!" % name
+            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
+            self.handle_data('<%s%s>' % (name, attrs))
+            return
+        self.endData()
+        if not name in self.SELF_CLOSING_TAGS and not selfClosing:
+            self._smartPop(name)
+        tag = Tag(name, attrs, self.currentTag, self.previous)
+        if self.previous:
+            self.previous.next = tag
+        self.previous = tag
+        self.pushTag(tag)
+        if selfClosing or name in self.SELF_CLOSING_TAGS:
+            self.popTag()
+        if name in self.QUOTE_TAGS:
+            #print "Beginning quote (%s)" % name
+            self.quoteStack.append(name)
+            self.literal = 1
+
+    def unknown_endtag(self, name):
+        if self.quoteStack and self.quoteStack[-1] != name:
+            #This is not a real end tag.
+            #print "</%s> is not real!" % name
+            self.handle_data('</%s>' % name)
+            return
+        self.endData()
+        self._popToTag(name)
+        if self.quoteStack and self.quoteStack[-1] == name:
+            self.quoteStack.pop()
+            self.literal = (len(self.quoteStack) > 0)
+
+    def handle_data(self, data):
+        self.currentData.append(data)
+
+    def handle_pi(self, text):
+        "Propagate processing instructions right through."
+        self.handle_data("<?%s>" % text)
+
+    def handle_comment(self, text):
+        "Propagate comments right through."
+        self.handle_data("<!--%s-->" % text)
+
+    def handle_charref(self, ref):
+        "Propagate char refs right through."
+        self.handle_data('&#%s;' % ref)
+
+    def handle_entityref(self, ref):
+        "Propagate entity refs right through."
+        self.handle_data('&%s;' % ref)
+
+    def handle_decl(self, data):
+        "Propagate DOCTYPEs and the like right through."
+        self.handle_data('<!%s>' % data)
+
+    def parse_declaration(self, i):
+        """Treat a bogus SGML declaration as raw data. Treat a CDATA
+        declaration as regular data."""
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+            k = self.rawdata.find(']]>', i)
+            if k == -1:
+                k = len(self.rawdata)
+            self.handle_data(self.rawdata[i+9:k])
+            j = k+3
+        else:
+            try:
+                j = SGMLParser.parse_declaration(self, i)
+            except SGMLParseError:
+                toHandle = self.rawdata[i:]
+                self.handle_data(toHandle)
+                j = i + len(toHandle)
+        return j
+
+class BeautifulSoup(BeautifulStoneSoup):
+
+    """This parser knows the following facts about HTML:
+
+    * Some tags have no closing tag and should be interpreted as being
+      closed as soon as they are encountered.
+
+    * The text inside some tags (ie. 'script') may contain tags which
+      are not really part of the document and which should be parsed
+      as text, not tags. If you want to parse the text as tags, you can
+      always fetch it and parse it explicitly.
+
+    * Tag nesting rules:
+
+      Most tags can't be nested at all. For instance, the occurrence of
+      a <p> tag should implicitly close the previous <p> tag.
+
+       <p>Para1<p>Para2
+      should be transformed into:
+       <p>Para1</p><p>Para2
+
+      Some tags can be nested arbitrarily. For instance, the occurrence
+      of a <blockquote> tag should _not_ implicitly close the previous
+      <blockquote> tag.
+
+       Alice said: <blockquote>Bob said: <blockquote>Blah
+      should NOT be transformed into:
+       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
+
+      Some tags can be nested, but the nesting is reset by the
+      interposition of other tags. For instance, a <tr> tag should
+      implicitly close the previous <tr> tag within the same <table>,
+      but not close a <tr> tag in another table.
+
+       <table><tr>Blah<tr>Blah
+      should be transformed into:
+       <table><tr>Blah</tr><tr>Blah
+      but,
+       <tr>Blah<table><tr>Blah
+      should NOT be transformed into
+       <tr>Blah<table></tr><tr>
    Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try ICantBelieveItsBeautifulSoup before writing your own + subclass.""" + + SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + QUOTE_TAGS = {'script': None} + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + NESTABLE_LIST_TAGS = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + } + + NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. + RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', + NON_NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, + NESTABLE_TABLE_TAGS) + + NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + +class ICantBelieveItsBeautifulSoup(BeautifulSoup): + + """The BeautifulSoup class is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + FooBar + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "FooBar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close (eg.) a 'b' + tag than to actually use nested 'b' tags, and the BeautifulSoup + class handles the common case. This class handles the + not-co-common case: where you can't believe someone wrote what + they did, but it's valid HTML and BeautifulSoup screwed up by + assuming it wouldn't be. 
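+
+    A sketch of the difference:
+
+        ICantBelieveItsBeautifulSoup("<b>Foo<b>Bar</b></b>")
+
+    keeps the second 'b' tag nested inside the first, where BeautifulSoup
+    would have closed the first 'b' early.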
+ + If this doesn't do what you need, try subclassing this class or + BeautifulSoup, and providing your own list of NESTABLE_TAGS.""" + + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] + + NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + +class BeautifulSOAP(BeautifulStoneSoup): + """This class will push a tag with only a single string child into + the tag's parent as an attribute. The attribute's name is the tag + name, and the value is the string child. An example should give + the flavor of the change: + + baz + => + baz + + You can then access fooTag['bar'] instead of fooTag.barTag.string. + + This is, of course, useful for scraping structures that tend to + use subelements instead of attributes, such as SOAP messages. Note + that it modifies its input, so don't print the modified version + out. + + I'm not sure how many people really want to use this class; let me + know if you do. Mainly I like the name.""" + + def popTag(self): + if len(self.tagStack) > 1: + tag = self.tagStack[-1] + parent = self.tagStack[-2] + parent._getAttrMap() + if (isinstance(tag, Tag) and len(tag.contents) == 1 and + isinstance(tag.contents[0], NavigableText) and + not parent.attrMap.has_key(tag.name)): + parent[tag.name] = tag.contents[0] + BeautifulStoneSoup.popTag(self) + +#Enterprise class names! It has come to our attention that some people +#think the names of the Beautiful Soup parser classes are too silly +#and "unprofessional" for use in enterprise screen-scraping. We feel +#your pain! For such-minded folk, the Beautiful Soup Consortium And +#All-Night Kosher Bakery recommends renaming this file to +#"RobustParser.py" (or, in cases of extreme enterprisitude, +#"RobustParserBeanInterface.class") and using the following +#enterprise-friendly class aliases: +class RobustXMLParser(BeautifulStoneSoup): + pass +class RobustHTMLParser(BeautifulSoup): + pass +class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): + pass +class SimplifyingSOAPParser(BeautifulSOAP): + pass + +### + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulStoneSoup(sys.stdin.read()) + print soup.prettify() diff --git a/src/calibre/utils/mechanize/_clientcookie.py b/src/calibre/utils/mechanize/_clientcookie.py new file mode 100644 index 0000000000..e8f0f67d4a --- /dev/null +++ b/src/calibre/utils/mechanize/_clientcookie.py @@ -0,0 +1,1651 @@ +"""HTTP cookie handling for web clients. + +This module originally developed from my port of Gisle Aas' Perl module +HTTP::Cookies, from the libwww-perl library. + +Docstrings, comments and debug strings in this code refer to the +attributes of the HTTP cookie system as cookie-attributes, to distinguish +them clearly from Python attributes. + + CookieJar____ + / \ \ + FileCookieJar \ \ + / | \ \ \ + MozillaCookieJar | LWPCookieJar \ \ + | | \ + | ---MSIEBase | \ + | / | | \ + | / MSIEDBCookieJar BSDDBCookieJar + |/ + MSIECookieJar + +Comments to John J Lee . 
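+
+A minimal round trip (hypothetical URL; names as exported from the
+mechanize package):
+
+    jar = CookieJar()
+    opener = build_opener(HTTPCookieProcessor(jar))
+    opener.open("http://example.com/")
+    for cookie in jar:
+        print cookie.name, cookie.value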
+
+
+Copyright 2002-2006 John J Lee <jjl@pobox.com>
+Copyright 1997-1999 Gisle Aas (original libwww-perl code)
+Copyright 2002-2003 Johnny Lee (original MSIE Perl code)
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import sys, re, copy, time, struct, urllib, types, logging
+try:
+    import threading
+    _threading = threading; del threading
+except ImportError:
+    import dummy_threading
+    _threading = dummy_threading; del dummy_threading
+import httplib  # only for the default HTTP port
+
+MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
+                         "instance initialised with one)")
+DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
+
+from _headersutil import split_header_words, parse_ns_headers
+from _util import isstringlike
+import _rfc3986
+
+debug = logging.getLogger("mechanize.cookies").debug
+
+
+def reraise_unmasked_exceptions(unmasked=()):
+    # There are a few catch-all except: statements in this module, for
+    # catching input that's bad in unexpected ways.
+    # This function re-raises some exceptions we don't want to trap.
+    import mechanize, warnings
+    if not mechanize.USE_BARE_EXCEPT:
+        raise
+    unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
+    etype = sys.exc_info()[0]
+    if issubclass(etype, unmasked):
+        raise
+    # swallowed an exception
+    import traceback, StringIO
+    f = StringIO.StringIO()
+    traceback.print_exc(None, f)
+    msg = f.getvalue()
+    warnings.warn("mechanize bug!\n%s" % msg, stacklevel=2)
+
+
+IPV4_RE = re.compile(r"\.\d+$")
+def is_HDN(text):
+    """Return True if text is a host domain name."""
+    # XXX
+    # This may well be wrong. Which RFC is HDN defined in, if any (for
+    # the purposes of RFC 2965)?
+    # For the current implementation, what about IPv6? Remember to look
+    # at other uses of IPV4_RE also, if you change this.
+    return not (IPV4_RE.search(text) or
+                text == "" or
+                text[0] == "." or text[-1] == ".")
+
+def domain_match(A, B):
+    """Return True if domain A domain-matches domain B, according to RFC 2965.
+
+    A and B may be host domain names or IP addresses.
+
+    RFC 2965, section 1:
+
+    Host names can be specified either as an IP address or a HDN string.
+    Sometimes we compare one host name with another. (Such comparisons SHALL
+    be case-insensitive.) Host A's name domain-matches host B's if
+
+         * their host name strings string-compare equal; or
+
+         * A is a HDN string and has the form NB, where N is a non-empty
+           name string, B has the form .B', and B' is a HDN string. (So,
+           x.y.com domain-matches .Y.com but not Y.com.)
+
+    Note that domain-match is not a commutative operation: a.b.c.com
+    domain-matches .c.com, but not the reverse.
+
+    """
+    # Note that, if A or B are IP addresses, the only relevant part of the
+    # definition of the domain-match algorithm is the direct string-compare.
+    A = A.lower()
+    B = B.lower()
+    if A == B:
+        return True
+    if not is_HDN(A):
+        return False
+    i = A.rfind(B)
+    has_form_nb = not (i == -1 or i == 0)
+    return (
+        has_form_nb and
+        B.startswith(".") and
+        is_HDN(B[1:])
+        )
+
+def liberal_is_HDN(text):
+    """Return True if text is sort-of like a host domain name.
+
+    For accepting/blocking domains.
+
+    """
+    return not IPV4_RE.search(text)
+
+def user_domain_match(A, B):
+    """For blocking/accepting domains.
+
+    A and B may be host domain names or IP addresses.
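+
+    For instance:
+
+    >>> user_domain_match("www.acme.com", ".acme.com")
+    True
+    >>> user_domain_match("acme.com", ".acme.com")
+    False
+    >>> user_domain_match("192.168.1.2", "192.168.1.2")
+    True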
+ + """ + A = A.lower() + B = B.lower() + if not (liberal_is_HDN(A) and liberal_is_HDN(B)): + if A == B: + # equal IP addresses + return True + return False + initial_dot = B.startswith(".") + if initial_dot and A.endswith(B): + return True + if not initial_dot and A == B: + return True + return False + +cut_port_re = re.compile(r":\d+$") +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. + + """ + url = request.get_full_url() + host = _rfc3986.urlsplit(url)[1] + if host is None: + host = request.get_header("Host", "") + + # remove port, if present + host = cut_port_re.sub("", host, 1) + return host.lower() + +def eff_request_host(request): + """Return a tuple (request-host, effective request-host name). + + As defined by RFC 2965, except both are lowercased. + + """ + erhn = req_host = request_host(request) + if req_host.find(".") == -1 and not IPV4_RE.search(req_host): + erhn = req_host + ".local" + return req_host, erhn + +def request_path(request): + """request-URI, as defined by RFC 2965.""" + url = request.get_full_url() + path, query, frag = _rfc3986.urlsplit(url)[2:] + path = escape_path(path) + req_path = _rfc3986.urlunsplit((None, None, path, query, frag)) + if not req_path.startswith("/"): + req_path = "/"+req_path + return req_path + +def request_port(request): + host = request.get_host() + i = host.find(':') + if i >= 0: + port = host[i+1:] + try: + int(port) + except ValueError: + debug("nonnumeric port: '%s'", port) + return None + else: + port = DEFAULT_HTTP_PORT + return port + +# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't +# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). +HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" +ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") +def uppercase_escaped_char(match): + return "%%%s" % match.group(1).upper() +def escape_path(path): + """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" + # There's no knowing what character encoding was used to create URLs + # containing %-escapes, but since we have to pick one to escape invalid + # path characters, we pick UTF-8, as recommended in the HTML 4.0 + # specification: + # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 + # And here, kind of: draft-fielding-uri-rfc2396bis-03 + # (And in draft IRI specification: draft-duerst-iri-05) + # (And here, for new URI schemes: RFC 2718) + if isinstance(path, types.UnicodeType): + path = path.encode("utf-8") + path = urllib.quote(path, HTTP_PATH_SAFE) + path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) + return path + +def reach(h): + """Return reach of host h, as defined by RFC 2965, section 1. + + The reach R of a host name H is defined as follows: + + * If + + - H is the host domain name of a host; and, + + - H has the form A.B; and + + - A has no embedded (that is, interior) dots; and + + - B has at least one embedded dot, or B is the string "local". + then the reach of H is .B. + + * Otherwise, the reach of H is H. 
+
+    >>> reach("www.acme.com")
+    '.acme.com'
+    >>> reach("acme.com")
+    'acme.com'
+    >>> reach("acme.local")
+    '.local'
+
+    """
+    i = h.find(".")
+    if i >= 0:
+        #a = h[:i]  # this line is only here to show what a is
+        b = h[i+1:]
+        i = b.find(".")
+        if is_HDN(h) and (i >= 0 or b == "local"):
+            return "."+b
+    return h
+
+def is_third_party(request):
+    """
+
+    RFC 2965, section 3.3.6:
+
+        An unverifiable transaction is to a third-party host if its request-
+        host U does not domain-match the reach R of the request-host O in the
+        origin transaction.
+
+    """
+    req_host = request_host(request)
+    # the origin request's request-host was stuffed into request by
+    # _urllib2_support.AbstractHTTPHandler
+    return not domain_match(req_host, reach(request.origin_req_host))
+
+
+class Cookie:
+    """HTTP Cookie.
+
+    This class represents both Netscape and RFC 2965 cookies.
+
+    This is deliberately a very simple class. It just holds attributes. It's
+    possible to construct Cookie instances that don't comply with the cookie
+    standards. CookieJar.make_cookies is the factory function for Cookie
+    objects -- it deals with cookie parsing, supplying defaults, and
+    normalising to the representation used in this class. CookiePolicy is
+    responsible for checking them to see whether they should be accepted from
+    and returned to the server.
+
+    version: integer;
+    name: string;
+    value: string (may be None);
+    port: string; None indicates no attribute was supplied (eg. "Port", rather
+     than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list
+     string (eg. "80,8080")
+    port_specified: boolean; true if a value was supplied with the Port
+     cookie-attribute
+    domain: string;
+    domain_specified: boolean; true if Domain was explicitly set
+    domain_initial_dot: boolean; true if Domain as set in HTTP header by server
+     started with a dot (yes, this really is necessary!)
+    path: string;
+    path_specified: boolean; true if Path was explicitly set
+    secure: boolean; true if should only be returned over secure connection
+    expires: integer; seconds since epoch (RFC 2965 cookies should calculate
+     this value from the Max-Age attribute)
+    discard: boolean, true if this is a session cookie; (if no expires value,
+     this should be true)
+    comment: string;
+    comment_url: string;
+    rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not
+     Set-Cookie2:) header, but had a version cookie-attribute of 1
+    rest: mapping of other cookie-attributes
+
+    Note that the port may be present in the headers, but unspecified ("Port"
+    rather than "Port=80", for example); if this is the case, port is None.
+
+    """
+
+    def __init__(self, version, name, value,
+                 port, port_specified,
+                 domain, domain_specified, domain_initial_dot,
+                 path, path_specified,
+                 secure,
+                 expires,
+                 discard,
+                 comment,
+                 comment_url,
+                 rest,
+                 rfc2109=False,
+                 ):
+
+        if version is not None: version = int(version)
+        if expires is not None: expires = int(expires)
+        if port is None and port_specified is True:
+            raise ValueError("if port is None, port_specified must be false")
+
+        self.version = version
+        self.name = name
+        self.value = value
+        self.port = port
+        self.port_specified = port_specified
+        # normalise case, as per RFC 2965 section 3.3.3
+        self.domain = domain.lower()
+        self.domain_specified = domain_specified
+        # Sigh. We need to know whether the domain given in the
+        # cookie-attribute had an initial dot, in order to follow RFC 2965
+        # (as clarified in draft errata). Needed for the returned $Domain
+        # value.
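+        # (Illustrative: a cookie set via "Domain=.acme.com" must be
+        # returned with '$Domain=".acme.com"', dot included, while one
+        # set via "Domain=acme.com" is returned as '$Domain="acme.com"'.)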
+        self.domain_initial_dot = domain_initial_dot
+        self.path = path
+        self.path_specified = path_specified
+        self.secure = secure
+        self.expires = expires
+        self.discard = discard
+        self.comment = comment
+        self.comment_url = comment_url
+        self.rfc2109 = rfc2109
+
+        self._rest = copy.copy(rest)
+
+    def has_nonstandard_attr(self, name):
+        return self._rest.has_key(name)
+    def get_nonstandard_attr(self, name, default=None):
+        return self._rest.get(name, default)
+    def set_nonstandard_attr(self, name, value):
+        self._rest[name] = value
+    def nonstandard_attr_keys(self):
+        return self._rest.keys()
+
+    def is_expired(self, now=None):
+        if now is None: now = time.time()
+        return (self.expires is not None) and (self.expires <= now)
+
+    def __str__(self):
+        if self.port is None: p = ""
+        else: p = ":"+self.port
+        limit = self.domain + p + self.path
+        if self.value is not None:
+            namevalue = "%s=%s" % (self.name, self.value)
+        else:
+            namevalue = self.name
+        return "<Cookie %s for %s>" % (namevalue, limit)
+
+    def __repr__(self):
+        args = []
+        for name in ["version", "name", "value",
+                     "port", "port_specified",
+                     "domain", "domain_specified", "domain_initial_dot",
+                     "path", "path_specified",
+                     "secure", "expires", "discard", "comment", "comment_url",
+                     ]:
+            attr = getattr(self, name)
+            args.append("%s=%s" % (name, repr(attr)))
+        args.append("rest=%s" % repr(self._rest))
+        args.append("rfc2109=%s" % repr(self.rfc2109))
+        return "Cookie(%s)" % ", ".join(args)
+
+
+class CookiePolicy:
+    """Defines which cookies get accepted from and returned to server.
+
+    May also modify cookies.
+
+    The subclass DefaultCookiePolicy defines the standard rules for Netscape
+    and RFC 2965 cookies -- override that if you want a customised policy.
+
+    As well as implementing set_ok and return_ok, implementations of this
+    interface must also supply the following attributes, indicating which
+    protocols should be used, and how. These can be read and set at any time,
+    though whether that makes complete sense from the protocol point of view is
+    doubtful.
+
+    Public attributes:
+
+    netscape: implement netscape protocol
+    rfc2965: implement RFC 2965 protocol
+    rfc2109_as_netscape:
+       WARNING: This argument will change or go away if it is not accepted into
+       the Python standard library in this form!
+       If true, treat RFC 2109 cookies as though they were Netscape cookies. The
+       default is for this attribute to be None, which means treat 2109 cookies
+       as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is,
+       by default), and as Netscape cookies otherwise.
+    hide_cookie2: don't add Cookie2 header to requests (the presence of
+     this header indicates to the server that we understand RFC 2965
+     cookies)
+
+    """
+    def set_ok(self, cookie, request):
+        """Return true if (and only if) cookie should be accepted from server.
+
+        Currently, pre-expired cookies never get this far -- the CookieJar
+        class deletes such cookies itself.
+
+        cookie: mechanize.Cookie object
+        request: object implementing the interface defined by
+         CookieJar.extract_cookies.__doc__
+
+        """
+        raise NotImplementedError()
+
+    def return_ok(self, cookie, request):
+        """Return true if (and only if) cookie should be returned to server.
+
+        cookie: mechanize.Cookie object
+        request: object implementing the interface defined by
+         CookieJar.add_cookie_header.__doc__
+
+        """
+        raise NotImplementedError()
+
+    def domain_return_ok(self, domain, request):
+        """Return false if cookies should not be returned, given cookie domain.
+
+        This is here as an optimization, to remove the need for checking every
+        cookie with a particular domain (which may involve reading many files).
+        The default implementations of domain_return_ok and path_return_ok
+        (return True) leave all the work to return_ok.
+
+        If domain_return_ok returns true for the cookie domain, path_return_ok
+        is called for the cookie path. Otherwise, path_return_ok and return_ok
+        are never called for that cookie domain. If path_return_ok returns
+        true, return_ok is called with the Cookie object itself for a full
+        check. Otherwise, return_ok is never called for that cookie path.
+
+        Note that domain_return_ok is called for every *cookie* domain, not
+        just for the *request* domain. For example, the function might be
+        called with both ".acme.com" and "www.acme.com" if the request domain
+        is "www.acme.com". The same goes for path_return_ok.
+
+        For argument documentation, see the docstring for return_ok.
+
+        """
+        return True
+
+    def path_return_ok(self, path, request):
+        """Return false if cookies should not be returned, given cookie path.
+
+        See the docstring for domain_return_ok.
+
+        """
+        return True
+
+
+class DefaultCookiePolicy(CookiePolicy):
+    """Implements the standard rules for accepting and returning cookies.
+
+    Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is
+    switched off by default.
+
+    The easiest way to provide your own policy is to override this class and
+    call its methods in your overridden implementations before adding your own
+    additional checks.
+
+    import mechanize
+    class MyCookiePolicy(mechanize.DefaultCookiePolicy):
+        def set_ok(self, cookie, request):
+            if not mechanize.DefaultCookiePolicy.set_ok(
+                self, cookie, request):
+                return False
+            if i_dont_want_to_store_this_cookie():
+                return False
+            return True
+
+    In addition to the features required to implement the CookiePolicy
+    interface, this class allows you to block and allow domains from setting
+    and receiving cookies. There are also some strictness switches that allow
+    you to tighten up the rather loose Netscape protocol rules a little bit
+    (at the cost of blocking some benign cookies).
+
+    A domain blacklist and whitelist are provided (both off by default). Only
+    domains not in the blacklist and present in the whitelist (if the whitelist
+    is active) participate in cookie setting and returning. Use the
+    blocked_domains constructor argument, and blocked_domains and
+    set_blocked_domains methods (and the corresponding argument and methods for
+    allowed_domains). If you set a whitelist, you can turn it off again by
+    setting it to None.
+
+    Domains in block or allow lists that do not start with a dot must
+    string-compare equal. For example, "acme.com" matches a blacklist entry of
+    "acme.com", but "www.acme.com" does not. Domains that do start with a dot
+    are matched by more specific domains too. For example, both "www.acme.com"
+    and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does
+    not). IP addresses are an exception, and must match exactly. For example,
+    if blocked_domains contains "192.168.1.2" and ".168.1.2", 192.168.1.2 is
+    blocked, but 193.168.1.2 is not.
+
+    Additional Public Attributes:
+
+    General strictness switches
+
+    strict_domain: don't allow sites to set two-component domains with
+     country-code top-level domains like .co.uk, .gov.uk, .co.nz, etc.
+     This is far from perfect and isn't guaranteed to work!
+
+    RFC 2965 protocol strictness switches
+
+    strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable
+     transactions (usually, an unverifiable transaction is one resulting from
+     a redirect or an image hosted on another site); if this is false, cookies
+     are NEVER blocked on the basis of verifiability
+
+    Netscape protocol strictness switches
+
+    strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions
+     even to Netscape cookies
+    strict_ns_domain: flags indicating how strict to be with domain-matching
+     rules for Netscape cookies:
+     DomainStrictNoDots: when setting cookies, host prefix must not contain a
+      dot (eg. www.foo.bar.com can't set a cookie for .bar.com, because
+      www.foo contains a dot)
+     DomainStrictNonDomain: cookies that did not explicitly specify a Domain
+      cookie-attribute can only be returned to a domain that string-compares
+      equal to the domain that set the cookie (eg. rockets.acme.com won't
+      be returned cookies from acme.com that had no Domain cookie-attribute)
+     DomainRFC2965Match: when setting cookies, require a full RFC 2965
+      domain-match
+     DomainLiberal and DomainStrict are the most useful combinations of the
+      above flags, for convenience
+    strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that
+     have names starting with '$'
+    strict_ns_set_path: don't allow setting cookies whose path doesn't
+     path-match request URI
+
+    """
+
+    DomainStrictNoDots = 1
+    DomainStrictNonDomain = 2
+    DomainRFC2965Match = 4
+
+    DomainLiberal = 0
+    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
+
+    def __init__(self,
+                 blocked_domains=None, allowed_domains=None,
+                 netscape=True, rfc2965=False,
+                 # WARNING: this argument will change or go away if it is not
+                 # accepted into the Python standard library in this form!
+                 # default, ie. treat 2109 as netscape iff not rfc2965
+                 rfc2109_as_netscape=None,
+                 hide_cookie2=False,
+                 strict_domain=False,
+                 strict_rfc2965_unverifiable=True,
+                 strict_ns_unverifiable=False,
+                 strict_ns_domain=DomainLiberal,
+                 strict_ns_set_initial_dollar=False,
+                 strict_ns_set_path=False,
+                 ):
+        """
+        Constructor arguments should be used as keyword arguments only.
+
+        blocked_domains: sequence of domain names that we never accept cookies
+         from, nor return cookies to
+        allowed_domains: if not None, this is a sequence of the only domains
+         for which we accept and return cookies
+
+        For other arguments, see CookiePolicy.__doc__ and
+        DefaultCookiePolicy.__doc__.
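+
+        A sketch of typical construction (the domains are made up):
+
+            policy = DefaultCookiePolicy(
+                rfc2965=True,
+                blocked_domains=["ads.example.com", ".ads.example.com"],
+                strict_ns_domain=DefaultCookiePolicy.DomainStrict)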
+ + """ + self.netscape = netscape + self.rfc2965 = rfc2965 + self.rfc2109_as_netscape = rfc2109_as_netscape + self.hide_cookie2 = hide_cookie2 + self.strict_domain = strict_domain + self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable + self.strict_ns_unverifiable = strict_ns_unverifiable + self.strict_ns_domain = strict_ns_domain + self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar + self.strict_ns_set_path = strict_ns_set_path + + if blocked_domains is not None: + self._blocked_domains = tuple(blocked_domains) + else: + self._blocked_domains = () + + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def blocked_domains(self): + """Return the sequence of blocked domains (as a tuple).""" + return self._blocked_domains + def set_blocked_domains(self, blocked_domains): + """Set the sequence of blocked domains.""" + self._blocked_domains = tuple(blocked_domains) + + def is_blocked(self, domain): + for blocked_domain in self._blocked_domains: + if user_domain_match(domain, blocked_domain): + return True + return False + + def allowed_domains(self): + """Return None, or the sequence of allowed domains (as a tuple).""" + return self._allowed_domains + def set_allowed_domains(self, allowed_domains): + """Set the sequence of allowed domains, or None.""" + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def is_not_allowed(self, domain): + if self._allowed_domains is None: + return False + for allowed_domain in self._allowed_domains: + if user_domain_match(domain, allowed_domain): + return False + return True + + def set_ok(self, cookie, request): + """ + If you override set_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to accept). + + """ + debug(" - checking cookie %s", cookie) + + assert cookie.name is not None + + for n in "version", "verifiability", "name", "path", "domain", "port": + fn_name = "set_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + + return True + + def set_ok_version(self, cookie, request): + if cookie.version is None: + # Version is always set to 0 by parse_ns_headers if it's a Netscape + # cookie, so this must be an invalid RFC 2965 cookie. + debug(" Set-Cookie2 without version attribute (%s)", cookie) + return False + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def set_ok_verifiability(self, cookie, request): + if request.unverifiable and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during " + "unverifiable transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during " + "unverifiable transaction") + return False + return True + + def set_ok_name(self, cookie, request): + # Try and stop servers setting V0 cookies designed to hack other + # servers that know both V0 and V1 protocols. 
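+        # (e.g. a Netscape cookie named "$Version" could masquerade as an
+        # RFC 2965 protocol attribute to a server that speaks both.)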
+        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
+            cookie.name.startswith("$")):
+            debug("   illegal name (starts with '$'): '%s'", cookie.name)
+            return False
+        return True
+
+    def set_ok_path(self, cookie, request):
+        if cookie.path_specified:
+            req_path = request_path(request)
+            if ((cookie.version > 0 or
+                 (cookie.version == 0 and self.strict_ns_set_path)) and
+                not req_path.startswith(cookie.path)):
+                debug("   path attribute %s is not a prefix of request "
+                      "path %s", cookie.path, req_path)
+                return False
+        return True
+
+    def set_ok_countrycode_domain(self, cookie, request):
+        """Return False if explicit cookie domain is not acceptable.
+
+        Called by set_ok_domain, for convenience of overriding by
+        subclasses.
+
+        """
+        if cookie.domain_specified and self.strict_domain:
+            domain = cookie.domain
+            # since domain was specified, we know that:
+            assert domain.startswith(".")
+            if domain.count(".") == 2:
+                # domain like .foo.bar
+                i = domain.rfind(".")
+                tld = domain[i+1:]
+                sld = domain[1:i]
+                if (sld.lower() in [
+                    "co", "ac",
+                    "com", "edu", "org", "net", "gov", "mil", "int",
+                    "aero", "biz", "cat", "coop", "info", "jobs", "mobi",
+                    "museum", "name", "pro", "travel",
+                    ] and
+                    len(tld) == 2):
+                    # domain like .co.uk
+                    return False
+        return True
+
+    def set_ok_domain(self, cookie, request):
+        if self.is_blocked(cookie.domain):
+            debug("   domain %s is in user block-list", cookie.domain)
+            return False
+        if self.is_not_allowed(cookie.domain):
+            debug("   domain %s is not in user allow-list", cookie.domain)
+            return False
+        if not self.set_ok_countrycode_domain(cookie, request):
+            debug("   country-code second level domain %s", cookie.domain)
+            return False
+        if cookie.domain_specified:
+            req_host, erhn = eff_request_host(request)
+            domain = cookie.domain
+            if domain.startswith("."):
+                undotted_domain = domain[1:]
+            else:
+                undotted_domain = domain
+            embedded_dots = (undotted_domain.find(".") >= 0)
+            if not embedded_dots and domain != ".local":
+                debug("   non-local domain %s contains no embedded dot",
+                      domain)
+                return False
+            if cookie.version == 0:
+                if (not erhn.endswith(domain) and
+                    (not erhn.startswith(".") and
+                     not ("."+erhn).endswith(domain))):
+                    debug("   effective request-host %s (even with added "
+                          "initial dot) does not end with %s",
+                          erhn, domain)
+                    return False
+            if (cookie.version > 0 or
+                (self.strict_ns_domain & self.DomainRFC2965Match)):
+                if not domain_match(erhn, domain):
+                    debug("   effective request-host %s does not domain-match "
+                          "%s", erhn, domain)
+                    return False
+            if (cookie.version > 0 or
+                (self.strict_ns_domain & self.DomainStrictNoDots)):
+                host_prefix = req_host[:-len(domain)]
+                if (host_prefix.find(".") >= 0 and
+                    not IPV4_RE.search(req_host)):
+                    debug("   host prefix %s for domain %s contains a dot",
+                          host_prefix, domain)
+                    return False
+        return True
+
+    def set_ok_port(self, cookie, request):
+        if cookie.port_specified:
+            req_port = request_port(request)
+            if req_port is None:
+                req_port = "80"
+            else:
+                req_port = str(req_port)
+            for p in cookie.port.split(","):
+                try:
+                    int(p)
+                except ValueError:
+                    debug("   bad port %s (not numeric)", p)
+                    return False
+                if p == req_port:
+                    break
+            else:
+                debug("   request port (%s) not found in %s",
+                      req_port, cookie.port)
+                return False
+        return True
+
+    def return_ok(self, cookie, request):
+        """
+        If you override return_ok, be sure to call this method. If it returns
+        false, so should your subclass (assuming your subclass wants to be more
+        strict about which cookies to return).
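+
+        A sketch of a stricter subclass (the cookie name is made up):
+
+            class MyCookiePolicy(DefaultCookiePolicy):
+                def return_ok(self, cookie, request):
+                    if not DefaultCookiePolicy.return_ok(
+                        self, cookie, request):
+                        return False
+                    return cookie.name != "tracker"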
+ + """ + # Path has already been checked by path_return_ok, and domain blocking + # done by domain_return_ok. + debug(" - checking cookie %s", cookie) + + for n in "version", "verifiability", "secure", "expires", "port", "domain": + fn_name = "return_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + return True + + def return_ok_version(self, cookie, request): + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def return_ok_verifiability(self, cookie, request): + if request.unverifiable and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during unverifiable " + "transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during unverifiable " + "transaction") + return False + return True + + def return_ok_secure(self, cookie, request): + if cookie.secure and request.get_type() != "https": + debug(" secure cookie with non-secure request") + return False + return True + + def return_ok_expires(self, cookie, request): + if cookie.is_expired(self._now): + debug(" cookie expired") + return False + return True + + def return_ok_port(self, cookie, request): + if cookie.port: + req_port = request_port(request) + if req_port is None: + req_port = "80" + for p in cookie.port.split(","): + if p == req_port: + break + else: + debug(" request port %s does not match cookie port %s", + req_port, cookie.port) + return False + return True + + def return_ok_domain(self, cookie, request): + req_host, erhn = eff_request_host(request) + domain = cookie.domain + + # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't + if (cookie.version == 0 and + (self.strict_ns_domain & self.DomainStrictNonDomain) and + not cookie.domain_specified and domain != erhn): + debug(" cookie with unspecified domain does not string-compare " + "equal to request domain") + return False + + if cookie.version > 0 and not domain_match(erhn, domain): + debug(" effective request-host name %s does not domain-match " + "RFC 2965 cookie domain %s", erhn, domain) + return False + if cookie.version == 0 and not ("."+erhn).endswith(domain): + debug(" request-host %s does not match Netscape cookie domain " + "%s", req_host, domain) + return False + return True + + def domain_return_ok(self, domain, request): + # Liberal check of domain. This is here as an optimization to avoid + # having to load lots of MSIE cookie files unless necessary. + + # Munge req_host and erhn to always start with a dot, so as to err on + # the side of letting cookies through. 
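+        # For example, a request host of "www.acme.com" becomes
+        # ".www.acme.com", which endswith both ".www.acme.com" and
+        # ".acme.com", so cookies for either domain get the full check.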
+ dotted_req_host, dotted_erhn = eff_request_host(request) + if not dotted_req_host.startswith("."): + dotted_req_host = "."+dotted_req_host + if not dotted_erhn.startswith("."): + dotted_erhn = "."+dotted_erhn + if not (dotted_req_host.endswith(domain) or + dotted_erhn.endswith(domain)): + #debug(" request domain %s does not match cookie domain %s", + # req_host, domain) + return False + + if self.is_blocked(domain): + debug(" domain %s is in user block-list", domain) + return False + if self.is_not_allowed(domain): + debug(" domain %s is not in user allow-list", domain) + return False + + return True + + def path_return_ok(self, path, request): + debug("- checking cookie path=%s", path) + req_path = request_path(request) + if not req_path.startswith(path): + debug(" %s does not path-match %s", req_path, path) + return False + return True + + +def vals_sorted_by_key(adict): + keys = adict.keys() + keys.sort() + return map(adict.get, keys) + +class MappingIterator: + """Iterates over nested mapping, depth-first, in sorted order by key.""" + def __init__(self, mapping): + self._s = [(vals_sorted_by_key(mapping), 0, None)] # LIFO stack + + def __iter__(self): return self + + def next(self): + # this is hairy because of lack of generators + while 1: + try: + vals, i, prev_item = self._s.pop() + except IndexError: + raise StopIteration() + if i < len(vals): + item = vals[i] + i = i + 1 + self._s.append((vals, i, prev_item)) + try: + item.items + except AttributeError: + # non-mapping + break + else: + # mapping + self._s.append((vals_sorted_by_key(item), 0, item)) + continue + return item + + +# Used as second parameter to dict.get method, to distinguish absent +# dict key from one with a None value. +class Absent: pass + +class CookieJar: + """Collection of HTTP cookies. + + You may not need to know about this class: try mechanize.urlopen(). + + The major methods are extract_cookies and add_cookie_header; these are all + you are likely to need. + + CookieJar supports the iterator protocol: + + for cookie in cookiejar: + # do something with cookie + + Methods: + + add_cookie_header(request) + extract_cookies(response, request) + make_cookies(response, request) + set_cookie_if_ok(cookie, request) + set_cookie(cookie) + clear_session_cookies() + clear_expired_cookies() + clear(domain=None, path=None, name=None) + + Public attributes + + policy: CookiePolicy object + + """ + + non_word_re = re.compile(r"\W") + quote_re = re.compile(r"([\"\\])") + strict_domain_re = re.compile(r"\.?[^.]*") + domain_re = re.compile(r"[^.]*") + dots_re = re.compile(r"^\.+") + + def __init__(self, policy=None): + """ + See CookieJar.__doc__ for argument documentation. 
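+
+        For example (a sketch):
+
+            jar = CookieJar(policy=DefaultCookiePolicy(rfc2965=True))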
+ + """ + if policy is None: + policy = DefaultCookiePolicy() + self._policy = policy + + self._cookies = {} + + # for __getitem__ iteration in pre-2.2 Pythons + self._prev_getitem_index = 0 + + def set_policy(self, policy): + self._policy = policy + + def _cookies_for_domain(self, domain, request): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + cookies_by_path = self._cookies[domain] + for path in cookies_by_path.keys(): + if not self._policy.path_return_ok(path, request): + continue + cookies_by_name = cookies_by_path[path] + for cookie in cookies_by_name.values(): + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + cookies.append(cookie) + return cookies + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + cookies = [] + for domain in self._cookies.keys(): + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookie_attrs(self, cookies): + """Return a list of cookie-attributes to be returned to server. + + like ['foo="bar"; $Path="/"', ...] + + The $Version attribute is also added when appropriate (currently only + once per request). + + """ + # add cookies in order of most specific (ie. longest) path first + def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) + cookies.sort(decreasing_size) + + version_set = False + + attrs = [] + for cookie in cookies: + # set version of Cookie header + # XXX + # What should it be if multiple matching Set-Cookie headers have + # different versions themselves? + # Answer: there is no answer; was supposed to be settled by + # RFC 2965 errata, but that may never appear... + version = cookie.version + if not version_set: + version_set = True + if version > 0: + attrs.append("$Version=%s" % version) + + # quote cookie value if necessary + # (not for Netscape protocol, which already has any quotes + # intact, due to the poorly-specified Netscape Cookie: syntax) + if ((cookie.value is not None) and + self.non_word_re.search(cookie.value) and version > 0): + value = self.quote_re.sub(r"\\\1", cookie.value) + else: + value = cookie.value + + # add cookie-attributes to be returned in Cookie header + if cookie.value is None: + attrs.append(cookie.name) + else: + attrs.append("%s=%s" % (cookie.name, value)) + if version > 0: + if cookie.path_specified: + attrs.append('$Path="%s"' % cookie.path) + if cookie.domain.startswith("."): + domain = cookie.domain + if (not cookie.domain_initial_dot and + domain.startswith(".")): + domain = domain[1:] + attrs.append('$Domain="%s"' % domain) + if cookie.port is not None: + p = "$Port" + if cookie.port_specified: + p = p + ('="%s"' % cookie.port) + attrs.append(p) + + return attrs + + def add_cookie_header(self, request): + """Add correct Cookie: header to request (urllib2.Request object). + + The Cookie2 header is also added unless policy.hide_cookie2 is true. + + The request object (usually a urllib2.Request instance) must support + the methods get_full_url, get_host, get_type, has_header, get_header, + header_items and add_unredirected_header, as documented by urllib2, and + the port attribute (the port number). Actually, + RequestUpgradeProcessor will automatically upgrade your Request object + to one with has_header, get_header, header_items and + add_unredirected_header, if it lacks those methods, for compatibility + with pre-2.4 versions of urllib2. 
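+
+        A minimal usage sketch (the URL is made up, and jar is assumed to
+        be a CookieJar instance):
+
+            import urllib2
+            request = urllib2.Request("http://www.example.com/")
+            jar.add_cookie_header(request)
+            # any matching cookies are now in the request's Cookie: header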
+
+        """
+        debug("add_cookie_header")
+        self._policy._now = self._now = int(time.time())
+
+        req_host, erhn = eff_request_host(request)
+        strict_non_domain = (
+            self._policy.strict_ns_domain & self._policy.DomainStrictNonDomain)
+
+        cookies = self._cookies_for_request(request)
+
+        attrs = self._cookie_attrs(cookies)
+        if attrs:
+            if not request.has_header("Cookie"):
+                request.add_unredirected_header("Cookie", "; ".join(attrs))
+
+        # if necessary, advertise that we know RFC 2965
+        if self._policy.rfc2965 and not self._policy.hide_cookie2:
+            for cookie in cookies:
+                if cookie.version != 1 and not request.has_header("Cookie2"):
+                    request.add_unredirected_header("Cookie2", '$Version="1"')
+                    break
+
+        self.clear_expired_cookies()
+
+    def _normalized_cookie_tuples(self, attrs_set):
+        """Return list of tuples containing normalised cookie information.
+
+        attrs_set is the list of lists of key,value pairs extracted from
+        the Set-Cookie or Set-Cookie2 headers.
+
+        Tuples are name, value, standard, rest, where name and value are the
+        cookie name and value, standard is a dictionary containing the standard
+        cookie-attributes (discard, secure, version, expires or max-age,
+        domain, path and port) and rest is a dictionary containing the rest of
+        the cookie-attributes.
+
+        """
+        cookie_tuples = []
+
+        boolean_attrs = "discard", "secure"
+        value_attrs = ("version",
+                       "expires", "max-age",
+                       "domain", "path", "port",
+                       "comment", "commenturl")
+
+        for cookie_attrs in attrs_set:
+            name, value = cookie_attrs[0]
+
+            # Build dictionary of standard cookie-attributes (standard) and
+            # dictionary of other cookie-attributes (rest).
+
+            # Note: expiry time is normalised to seconds since epoch. V0
+            # cookies should have the Expires cookie-attribute, and V1 cookies
+            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
+            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
+            # accept either (but prefer Max-Age).
+            max_age_set = False
+
+            bad_cookie = False
+
+            standard = {}
+            rest = {}
+            for k, v in cookie_attrs[1:]:
+                lc = k.lower()
+                # don't lose case distinction for unknown fields
+                if lc in value_attrs or lc in boolean_attrs:
+                    k = lc
+                if k in boolean_attrs and v is None:
+                    # boolean cookie-attribute is present, but has no value
+                    # (like "discard", rather than "port=80")
+                    v = True
+                if standard.has_key(k):
+                    # only first value is significant
+                    continue
+                if k == "domain":
+                    if v is None:
+                        debug("   missing value for domain attribute")
+                        bad_cookie = True
+                        break
+                    # RFC 2965 section 3.3.3
+                    v = v.lower()
+                if k == "expires":
+                    if max_age_set:
+                        # Prefer max-age to expires (like Mozilla)
+                        continue
+                    if v is None:
+                        debug("   missing or invalid value for expires "
+                              "attribute: treating as session cookie")
+                        continue
+                if k == "max-age":
+                    max_age_set = True
+                    try:
+                        v = int(v)
+                    except ValueError:
+                        debug("   missing or invalid (non-numeric) value for "
+                              "max-age attribute")
+                        bad_cookie = True
+                        break
+                    # convert RFC 2965 Max-Age to seconds since epoch
+                    # XXX Strictly you're supposed to follow RFC 2616
+                    # age-calculation rules. Remember that zero Max-Age
+                    # is a request to discard (old and new) cookie, though.
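+                    # (For example, "Max-Age=3600" received now becomes an
+                    # absolute expiry time of self._now + 3600.)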
+ k = "expires" + v = self._now + v + if (k in value_attrs) or (k in boolean_attrs): + if (v is None and + k not in ["port", "comment", "commenturl"]): + debug(" missing value for %s attribute" % k) + bad_cookie = True + break + standard[k] = v + else: + rest[k] = v + + if bad_cookie: + continue + + cookie_tuples.append((name, value, standard, rest)) + + return cookie_tuples + + def _cookie_from_cookie_tuple(self, tup, request): + # standard is dict of standard cookie-attributes, rest is dict of the + # rest of them + name, value, standard, rest = tup + + domain = standard.get("domain", Absent) + path = standard.get("path", Absent) + port = standard.get("port", Absent) + expires = standard.get("expires", Absent) + + # set the easy defaults + version = standard.get("version", None) + if version is not None: version = int(version) + secure = standard.get("secure", False) + # (discard is also set if expires is Absent) + discard = standard.get("discard", False) + comment = standard.get("comment", None) + comment_url = standard.get("commenturl", None) + + # set default path + if path is not Absent and path != "": + path_specified = True + path = escape_path(path) + else: + path_specified = False + path = request_path(request) + i = path.rfind("/") + if i != -1: + if version == 0: + # Netscape spec parts company from reality here + path = path[:i] + else: + path = path[:i+1] + if len(path) == 0: path = "/" + + # set default domain + domain_specified = domain is not Absent + # but first we have to remember whether it starts with a dot + domain_initial_dot = False + if domain_specified: + domain_initial_dot = bool(domain.startswith(".")) + if domain is Absent: + req_host, erhn = eff_request_host(request) + domain = erhn + elif not domain.startswith("."): + domain = "."+domain + + # set default port + port_specified = False + if port is not Absent: + if port is None: + # Port attr present, but has no value: default to request port. + # Cookie should then only be sent back on that port. + port = request_port(request) + else: + port_specified = True + port = re.sub(r"\s+", "", port) + else: + # No port attr present. Cookie can be sent back on any port. + port = None + + # set default expires and discard + if expires is Absent: + expires = None + discard = True + elif expires <= self._now: + # Expiry date in past is request to delete cookie. This can't be + # in DefaultCookiePolicy, because can't delete cookies there. 
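+            # (e.g. a server sends "Set-Cookie: foo=; Max-Age=0", or an
+            # Expires date in the past, to delete an existing cookie "foo".)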
+ try: + self.clear(domain, path, name) + except KeyError: + pass + debug("Expiring cookie, domain='%s', path='%s', name='%s'", + domain, path, name) + return None + + return Cookie(version, + name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest) + + def _cookies_from_attrs_set(self, attrs_set, request): + cookie_tuples = self._normalized_cookie_tuples(attrs_set) + + cookies = [] + for tup in cookie_tuples: + cookie = self._cookie_from_cookie_tuple(tup, request) + if cookie: cookies.append(cookie) + return cookies + + def _process_rfc2109_cookies(self, cookies): + if self._policy.rfc2109_as_netscape is None: + rfc2109_as_netscape = not self._policy.rfc2965 + else: + rfc2109_as_netscape = self._policy.rfc2109_as_netscape + for cookie in cookies: + if cookie.version == 1: + cookie.rfc2109 = True + if rfc2109_as_netscape: + # treat 2109 cookies as Netscape cookies rather than + # as RFC2965 cookies + cookie.version = 0 + + def make_cookies(self, response, request): + """Return sequence of Cookie objects extracted from response object. + + See extract_cookies.__doc__ for the interfaces required of the + response and request arguments. + + """ + # get cookie-attributes for RFC 2965 and Netscape protocols + headers = response.info() + rfc2965_hdrs = headers.getheaders("Set-Cookie2") + ns_hdrs = headers.getheaders("Set-Cookie") + + rfc2965 = self._policy.rfc2965 + netscape = self._policy.netscape + + if ((not rfc2965_hdrs and not ns_hdrs) or + (not ns_hdrs and not rfc2965) or + (not rfc2965_hdrs and not netscape) or + (not netscape and not rfc2965)): + return [] # no relevant cookie headers: quick exit + + try: + cookies = self._cookies_from_attrs_set( + split_header_words(rfc2965_hdrs), request) + except: + reraise_unmasked_exceptions() + cookies = [] + + if ns_hdrs and netscape: + try: + # RFC 2109 and Netscape cookies + ns_cookies = self._cookies_from_attrs_set( + parse_ns_headers(ns_hdrs), request) + except: + reraise_unmasked_exceptions() + ns_cookies = [] + self._process_rfc2109_cookies(ns_cookies) + + # Look for Netscape cookies (from Set-Cookie headers) that match + # corresponding RFC 2965 cookies (from Set-Cookie2 headers). + # For each match, keep the RFC 2965 cookie and ignore the Netscape + # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are + # bundled in with the Netscape cookies for this purpose, which is + # reasonable behaviour. + if rfc2965: + lookup = {} + for cookie in cookies: + lookup[(cookie.domain, cookie.path, cookie.name)] = None + + def no_matching_rfc2965(ns_cookie, lookup=lookup): + key = ns_cookie.domain, ns_cookie.path, ns_cookie.name + return not lookup.has_key(key) + ns_cookies = filter(no_matching_rfc2965, ns_cookies) + + if ns_cookies: + cookies.extend(ns_cookies) + + return cookies + + def set_cookie_if_ok(self, cookie, request): + """Set a cookie if policy says it's OK to do so. + + cookie: mechanize.Cookie instance + request: see extract_cookies.__doc__ for the required interface + + """ + self._policy._now = self._now = int(time.time()) + + if self._policy.set_ok(cookie, request): + self.set_cookie(cookie) + + def set_cookie(self, cookie): + """Set a cookie, without checking whether or not it should be set. 
+
+        cookie: mechanize.Cookie instance
+        """
+        c = self._cookies
+        if not c.has_key(cookie.domain): c[cookie.domain] = {}
+        c2 = c[cookie.domain]
+        if not c2.has_key(cookie.path): c2[cookie.path] = {}
+        c3 = c2[cookie.path]
+        c3[cookie.name] = cookie
+
+    def extract_cookies(self, response, request):
+        """Extract cookies from response, where allowable given the request.
+
+        Look for allowable Set-Cookie: and Set-Cookie2: headers in the response
+        object passed as argument. Any of these headers that are found are
+        used to update the state of the object (subject to the policy.set_ok
+        method's approval).
+
+        The response object (usually the result of a call to
+        mechanize.urlopen, or similar) should support an info method, which
+        returns a mimetools.Message object (in fact, the 'mimetools.Message
+        object' may be any object that provides a getallmatchingheaders
+        method).
+
+        The request object (usually a urllib2.Request instance) must support
+        the methods get_full_url and get_host, as documented by urllib2, and
+        the port attribute (the port number). The request is used to set
+        default values for cookie-attributes as well as for checking that the
+        cookie is OK to be set.
+
+        """
+        debug("extract_cookies: %s", response.info())
+        self._policy._now = self._now = int(time.time())
+
+        for cookie in self.make_cookies(response, request):
+            if self._policy.set_ok(cookie, request):
+                debug(" setting cookie: %s", cookie)
+                self.set_cookie(cookie)
+
+    def clear(self, domain=None, path=None, name=None):
+        """Clear some cookies.
+
+        Invoking this method without arguments will clear all cookies. If
+        given a single argument, only cookies belonging to that domain will be
+        removed. If given two arguments, cookies belonging to the specified
+        path within that domain are removed. If given three arguments, then
+        the cookie with the specified name, path and domain is removed.
+
+        Raises KeyError if no matching cookie exists.
+
+        """
+        if name is not None:
+            if (domain is None) or (path is None):
+                raise ValueError(
+                    "domain and path must be given to remove a cookie by name")
+            del self._cookies[domain][path][name]
+        elif path is not None:
+            if domain is None:
+                raise ValueError(
+                    "domain must be given to remove cookies by path")
+            del self._cookies[domain][path]
+        elif domain is not None:
+            del self._cookies[domain]
+        else:
+            self._cookies = {}
+
+    def clear_session_cookies(self):
+        """Discard all session cookies.
+
+        Discards all cookies held by the object which had either no Max-Age or
+        Expires cookie-attribute or an explicit Discard cookie-attribute, or
+        which otherwise have ended up with a true discard attribute. For
+        interactive browsers, the end of a session usually corresponds to
+        closing the browser window.
+
+        Note that the save method won't save session cookies anyway, unless you
+        ask otherwise by passing a true ignore_discard argument.
+
+        """
+        for cookie in self:
+            if cookie.discard:
+                self.clear(cookie.domain, cookie.path, cookie.name)
+
+    def clear_expired_cookies(self):
+        """Discard all expired cookies.
+
+        You probably don't need to call this method: expired cookies are never
+        sent back to the server (provided you're using DefaultCookiePolicy),
+        this method is called by CookieJar itself every so often, and the save
+        method won't save expired cookies anyway (unless you ask otherwise by
+        passing a true ignore_expires argument).
+ + """ + now = time.time() + for cookie in self: + if cookie.is_expired(now): + self.clear(cookie.domain, cookie.path, cookie.name) + + def __getitem__(self, i): + if i == 0: + self._getitem_iterator = self.__iter__() + elif self._prev_getitem_index != i-1: raise IndexError( + "CookieJar.__getitem__ only supports sequential iteration") + self._prev_getitem_index = i + try: + return self._getitem_iterator.next() + except StopIteration: + raise IndexError() + + def __iter__(self): + return MappingIterator(self._cookies) + + def __len__(self): + """Return number of contained cookies.""" + i = 0 + for cookie in self: i = i + 1 + return i + + def __repr__(self): + r = [] + for cookie in self: r.append(repr(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + def __str__(self): + r = [] + for cookie in self: r.append(str(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + +class LoadError(Exception): pass + +class FileCookieJar(CookieJar): + """CookieJar that can be loaded from and saved to a file. + + Additional methods + + save(filename=None, ignore_discard=False, ignore_expires=False) + load(filename=None, ignore_discard=False, ignore_expires=False) + revert(filename=None, ignore_discard=False, ignore_expires=False) + + Additional public attributes + + filename: filename for loading and saving cookies + + Additional public readable attributes + + delayload: request that cookies are lazily loaded from disk; this is only + a hint since this only affects performance, not behaviour (unless the + cookies on disk are changing); a CookieJar object may ignore it (in fact, + only MSIECookieJar lazily loads cookies at the moment) + + """ + + def __init__(self, filename=None, delayload=False, policy=None): + """ + See FileCookieJar.__doc__ for argument documentation. + + Cookies are NOT loaded from the named file until either the load or + revert method is called. + + """ + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self.delayload = bool(delayload) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """Save cookies to a file. + + filename: name of file in which to save cookies + ignore_discard: save even cookies set to be discarded + ignore_expires: save even cookies that have expired + + The file is overwritten if it already exists, thus wiping all its + cookies. Saved cookies can be restored later using the load or revert + methods. If filename is not specified, self.filename is used; if + self.filename is None, ValueError is raised. + + """ + raise NotImplementedError() + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file. + + Old cookies are kept unless overwritten by newly loaded ones. + + Arguments are as for .save(). + + If filename is not specified, self.filename is used; if self.filename + is None, ValueError is raised. The named file must be in the format + understood by the class, or LoadError will be raised. This format will + be identical to that written by the save method, unless the load format + is not sufficiently well understood (as is the case for MSIECookieJar). 
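+
+        A sketch using one of the concrete subclasses (the filename is
+        made up):
+
+            jar = LWPCookieJar("cookies.lwp")
+            jar.save()                     # writes cookies.lwp
+            jar.load(ignore_discard=True)  # reads session cookies too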
+ + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename) + try: + self._really_load(f, filename, ignore_discard, ignore_expires) + finally: + f.close() + + def revert(self, filename=None, + ignore_discard=False, ignore_expires=False): + """Clear all cookies and reload cookies from a saved file. + + Raises LoadError (or IOError) if reversion is not successful; the + object's state will not be altered if this happens. + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + old_state = copy.deepcopy(self._cookies) + self._cookies = {} + try: + self.load(filename, ignore_discard, ignore_expires) + except (LoadError, IOError): + self._cookies = old_state + raise diff --git a/src/calibre/utils/mechanize/_debug.py b/src/calibre/utils/mechanize/_debug.py new file mode 100644 index 0000000000..596b11477e --- /dev/null +++ b/src/calibre/utils/mechanize/_debug.py @@ -0,0 +1,28 @@ +import logging + +from urllib2 import BaseHandler +from _response import response_seek_wrapper + + +class HTTPResponseDebugProcessor(BaseHandler): + handler_order = 900 # before redirections, after everything else + + def http_response(self, request, response): + if not hasattr(response, "seek"): + response = response_seek_wrapper(response) + info = logging.getLogger("mechanize.http_responses").info + try: + info(response.read()) + finally: + response.seek(0) + info("*****************************************************") + return response + + https_response = http_response + +class HTTPRedirectDebugProcessor(BaseHandler): + def http_request(self, request): + if hasattr(request, "redirect_dict"): + info = logging.getLogger("mechanize.http_redirects").info + info("redirecting to %s", request.get_full_url()) + return request diff --git a/src/calibre/utils/mechanize/_gzip.py b/src/calibre/utils/mechanize/_gzip.py new file mode 100644 index 0000000000..46a98a3858 --- /dev/null +++ b/src/calibre/utils/mechanize/_gzip.py @@ -0,0 +1,103 @@ +import urllib2 +from cStringIO import StringIO +import _response + +# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library +class GzipConsumer: + + def __init__(self, consumer): + self.__consumer = consumer + self.__decoder = None + self.__data = "" + + def __getattr__(self, key): + return getattr(self.__consumer, key) + + def feed(self, data): + if self.__decoder is None: + # check if we have a full gzip header + data = self.__data + data + try: + i = 10 + flag = ord(data[3]) + if flag & 4: # extra + x = ord(data[i]) + 256*ord(data[i+1]) + i = i + 2 + x + if flag & 8: # filename + while ord(data[i]): + i = i + 1 + i = i + 1 + if flag & 16: # comment + while ord(data[i]): + i = i + 1 + i = i + 1 + if flag & 2: # crc + i = i + 2 + if len(data) < i: + raise IndexError("not enough data") + if data[:3] != "\x1f\x8b\x08": + raise IOError("invalid gzip data") + data = data[i:] + except IndexError: + self.__data = data + return # need more data + import zlib + self.__data = "" + self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS) + data = self.__decoder.decompress(data) + if data: + self.__consumer.feed(data) + + def close(self): + if self.__decoder: + data = self.__decoder.flush() + if data: + self.__consumer.feed(data) + self.__consumer.close() + + +# -------------------------------------------------------------------- + +# the rest of this module is John Lee's stupid code, not +# Fredrik's 
nice code :-)
+
+class stupid_gzip_consumer:
    def __init__(self): self.data = []
+    def feed(self, data): self.data.append(data)
+
+class stupid_gzip_wrapper(_response.closeable_response):
+    def __init__(self, response):
+        self._response = response
+
+        c = stupid_gzip_consumer()
+        gzc = GzipConsumer(c)
+        gzc.feed(response.read())
+        self.__data = StringIO("".join(c.data))
+
+    def read(self, size=-1):
+        return self.__data.read(size)
+    def readline(self, size=-1):
+        return self.__data.readline(size)
+    def readlines(self, sizehint=-1):
+        return self.__data.readlines(sizehint)
+
+    def __getattr__(self, name):
+        # delegate unknown methods/attributes
+        return getattr(self._response, name)
+
+class HTTPGzipProcessor(urllib2.BaseHandler):
+    handler_order = 200  # response processing before HTTPEquivProcessor
+
+    def http_request(self, request):
+        request.add_header("Accept-Encoding", "gzip")
+        return request
+
+    def http_response(self, request, response):
+        # post-process response
+        enc_hdrs = response.info().getheaders("Content-encoding")
+        for enc_hdr in enc_hdrs:
+            if ("gzip" in enc_hdr) or ("compress" in enc_hdr):
+                return stupid_gzip_wrapper(response)
+        return response
+
+    https_response = http_response
diff --git a/src/calibre/utils/mechanize/_headersutil.py b/src/calibre/utils/mechanize/_headersutil.py
new file mode 100644
index 0000000000..d8fe47a0e7
--- /dev/null
+++ b/src/calibre/utils/mechanize/_headersutil.py
@@ -0,0 +1,226 @@
+"""Utility functions for HTTP header value parsing and construction.
+
+Copyright 1997-1998, Gisle Aas
+Copyright 2002-2006, John J. Lee <jjl@pobox.com>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import os, re
+from types import StringType
+from types import UnicodeType
+STRING_TYPES = StringType, UnicodeType
+
+from _util import http2time
+import _rfc3986
+
+def is_html(ct_headers, url, allow_xhtml=False):
+    """
+    ct_headers: Sequence of Content-Type headers
+    url: Response URL
+
+    """
+    if not ct_headers:
+        # guess
+        ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
+        html_exts = [".htm", ".html"]
+        if allow_xhtml:
+            html_exts += [".xhtml"]
+        return ext in html_exts
+    # use first header
+    ct = split_header_words(ct_headers)[0][0][0]
+    html_types = ["text/html"]
+    if allow_xhtml:
+        html_types += [
+            "text/xhtml", "text/xml",
+            "application/xml", "application/xhtml+xml",
+            ]
+    return ct in html_types
+
+def unmatched(match):
+    """Return unmatched part of re.Match object."""
+    start, end = match.span(0)
+    return match.string[:start]+match.string[end:]
+
+token_re = re.compile(r"^\s*([^=\s;,]+)")
+quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
+value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
+escape_re = re.compile(r"\\(.)")
+def split_header_words(header_values):
+    r"""Parse header values into a list of lists containing key,value pairs.
+
+    The function knows how to deal with ",", ";" and "=" as well as quoted
+    values after "=". A list of space separated tokens are parsed as if they
+    were separated by ";".
+
+    If the header_values passed as argument contains multiple values, then they
+    are treated as if they were a single value separated by comma ",".
+
+    This means that this function is useful for parsing header fields that
+    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
+    the requirement for tokens).
+
+      headers       = #header
+      header        = (token | parameter) *( [";"] (token | parameter))
+
+      token         = 1*<any CHAR except CTLs or separators>
+      separators    = "(" | ")" | "<" | ">" | "@"
+                    | "," | ";" | ":" | "\" | <">
+                    | "/" | "[" | "]" | "?" | "="
+                    | "{" | "}" | SP | HT
+
+      quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
+      qdtext        = <any TEXT except <">>
+      quoted-pair   = "\" CHAR
+
+      parameter     = attribute "=" value
+      attribute     = token
+      value         = token | quoted-string
+
+    Each header is represented by a list of key/value pairs. The value for a
+    simple token (not part of a parameter) is None. Syntactically incorrect
+    headers will not necessarily be parsed as you would want.
+
+    This is easier to describe with some examples:
+
+    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
+    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
+    >>> split_header_words(['text/html; charset="iso-8859-1"'])
+    [[('text/html', None), ('charset', 'iso-8859-1')]]
+    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
+    [[('Basic', None), ('realm', '"foobar"')]]
+
+    """
+    assert type(header_values) not in STRING_TYPES
+    result = []
+    for text in header_values:
+        orig_text = text
+        pairs = []
+        while text:
+            m = token_re.search(text)
+            if m:
+                text = unmatched(m)
+                name = m.group(1)
+                m = quoted_value_re.search(text)
+                if m:  # quoted value
+                    text = unmatched(m)
+                    value = m.group(1)
+                    value = escape_re.sub(r"\1", value)
+                else:
+                    m = value_re.search(text)
+                    if m:  # unquoted value
+                        text = unmatched(m)
+                        value = m.group(1)
+                        value = value.rstrip()
+                    else:
+                        # no value, a lone token
+                        value = None
+                pairs.append((name, value))
+            elif text.lstrip().startswith(","):
+                # concatenated headers, as per RFC 2616 section 4.2
+                text = text.lstrip()[1:]
+                if pairs: result.append(pairs)
+                pairs = []
+            else:
+                # skip junk
+                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
+                assert nr_junk_chars > 0, (
+                    "split_header_words bug: '%s', '%s', %s" %
+                    (orig_text, text, pairs))
+                text = non_junk
+        if pairs: result.append(pairs)
+    return result
+
+join_escape_re = re.compile(r"([\"\\])")
+def join_header_words(lists):
+    """Do the inverse of the conversion done by split_header_words.
+
+    Takes a list of lists of (key, value) pairs and produces a single header
+    value. Attribute values are quoted if needed.
+
+    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
+    'text/plain; charset="iso-8859/1"'
+    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
+    'text/plain, charset="iso-8859/1"'
+
+    """
+    headers = []
+    for pairs in lists:
+        attr = []
+        for k, v in pairs:
+            if v is not None:
+                if not re.search(r"^\w+$", v):
+                    v = join_escape_re.sub(r"\\\1", v)  # escape " and \
+                    v = '"%s"' % v
+                if k is None:  # Netscape cookies may have no name
+                    k = v
+                else:
+                    k = "%s=%s" % (k, v)
+            attr.append(k)
+        if attr: headers.append("; ".join(attr))
+    return ", ".join(headers)
+
+def parse_ns_headers(ns_headers):
+    """Ad-hoc parser for Netscape protocol cookie-attributes.
+
+    The old Netscape cookie format for Set-Cookie can for instance contain
+    an unquoted "," in the expires field, so we have to use this ad-hoc
+    parser instead of split_header_words.
+
+    XXX This may not make the best possible effort to parse all the crap
+    that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
+    parser is probably better, so could do worse than following that if
+    this ever gives any trouble.
+
+    Currently, this is also used for parsing RFC 2109 cookies.
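+
+    A minimal illustrative example (expires is omitted here, since its
+    parsed value is a seconds-since-epoch number produced by http2time):
+
+    >>> parse_ns_headers(["foo=bar; path=/; secure"])
+    [[('foo', 'bar'), ('path', '/'), ('secure', None), ('version', '0')]]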
+ + """ + known_attrs = ("expires", "domain", "path", "secure", + # RFC 2109 attrs (may turn up in Netscape cookies, too) + "port", "max-age") + + result = [] + for ns_header in ns_headers: + pairs = [] + version_set = False + params = re.split(r";\s*", ns_header) + for ii in range(len(params)): + param = params[ii] + param = param.rstrip() + if param == "": continue + if "=" not in param: + k, v = param, None + else: + k, v = re.split(r"\s*=\s*", param, 1) + k = k.lstrip() + if ii != 0: + lc = k.lower() + if lc in known_attrs: + k = lc + if k == "version": + # This is an RFC 2109 cookie. + version_set = True + if k == "expires": + # convert expires date to seconds since epoch + if v.startswith('"'): v = v[1:] + if v.endswith('"'): v = v[:-1] + v = http2time(v) # None if invalid + pairs.append((k, v)) + + if pairs: + if not version_set: + pairs.append(("version", "0")) + result.append(pairs) + + return result + + +def _test(): + import doctest, _headersutil + return doctest.testmod(_headersutil) + +if __name__ == "__main__": + _test() diff --git a/src/calibre/utils/mechanize/_html.py b/src/calibre/utils/mechanize/_html.py new file mode 100644 index 0000000000..2d562c98bf --- /dev/null +++ b/src/calibre/utils/mechanize/_html.py @@ -0,0 +1,607 @@ +"""HTML handling. + +Copyright 2003-2006 John J. Lee + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +import re, copy, htmlentitydefs +import sgmllib, HTMLParser, ClientForm + +import _request +from _headersutil import split_header_words, is_html as _is_html +import _rfc3986 + +DEFAULT_ENCODING = "latin-1" + + +# the base classe is purely for backwards compatibility +class ParseError(ClientForm.ParseError): pass + + +class CachingGeneratorFunction(object): + """Caching wrapper around a no-arguments iterable.""" + + def __init__(self, iterable): + self._cache = [] + # wrap iterable to make it non-restartable (otherwise, repeated + # __call__ would give incorrect results) + self._iterator = iter(iterable) + + def __call__(self): + cache = self._cache + for item in cache: + yield item + for item in self._iterator: + cache.append(item) + yield item + + +class EncodingFinder: + def __init__(self, default_encoding): + self._default_encoding = default_encoding + def encoding(self, response): + # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV + # headers may be in the response. HTTP-EQUIV headers come last, + # so try in order from first to last. 
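+        # For example, given a Content-Type value of
+        #     text/html; charset=utf-8
+        # split_header_words returns [[("text/html", None), ("charset", "utf-8")]]
+        # and the loop below picks out "utf-8".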
+ for ct in response.info().getheaders("content-type"): + for k, v in split_header_words([ct])[0]: + if k == "charset": + return v + return self._default_encoding + +class ResponseTypeFinder: + def __init__(self, allow_xhtml): + self._allow_xhtml = allow_xhtml + def is_html(self, response, encoding): + ct_hdrs = response.info().getheaders("content-type") + url = response.geturl() + # XXX encoding + return _is_html(ct_hdrs, url, self._allow_xhtml) + + +# idea for this argument-processing trick is from Peter Otten +class Args: + def __init__(self, args_map): + self.dictionary = dict(args_map) + def __getattr__(self, key): + try: + return self.dictionary[key] + except KeyError: + return getattr(self.__class__, key) + +def form_parser_args( + select_default=False, + form_parser_class=None, + request_class=None, + backwards_compat=False, + ): + return Args(locals()) + + +class Link: + def __init__(self, base_url, url, text, tag, attrs): + assert None not in [url, tag, attrs] + self.base_url = base_url + self.absolute_url = _rfc3986.urljoin(base_url, url) + self.url, self.text, self.tag, self.attrs = url, text, tag, attrs + def __cmp__(self, other): + try: + for name in "url", "text", "tag", "attrs": + if getattr(self, name) != getattr(other, name): + return -1 + except AttributeError: + return -1 + return 0 + def __repr__(self): + return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % ( + self.base_url, self.url, self.text, self.tag, self.attrs) + + +class LinksFactory: + + def __init__(self, + link_parser_class=None, + link_class=Link, + urltags=None, + ): + import _pullparser + if link_parser_class is None: + link_parser_class = _pullparser.TolerantPullParser + self.link_parser_class = link_parser_class + self.link_class = link_class + if urltags is None: + urltags = { + "a": "href", + "area": "href", + "frame": "src", + "iframe": "src", + } + self.urltags = urltags + self._response = None + self._encoding = None + + def set_response(self, response, base_url, encoding): + self._response = response + self._encoding = encoding + self._base_url = base_url + + def links(self): + """Return an iterator that provides links of the document.""" + response = self._response + encoding = self._encoding + base_url = self._base_url + p = self.link_parser_class(response, encoding=encoding) + + try: + for token in p.tags(*(self.urltags.keys()+["base"])): + if token.type == "endtag": + continue + if token.data == "base": + base_href = dict(token.attrs).get("href") + if base_href is not None: + base_url = base_href + continue + attrs = dict(token.attrs) + tag = token.data + name = attrs.get("name") + text = None + # XXX use attr_encoding for ref'd doc if that doc does not + # provide one by other means + #attr_encoding = attrs.get("charset") + url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL? + if not url: + # Probably an link or . + # For our purposes a link is something with a URL, so + # ignore this. + continue + + url = _rfc3986.clean_url(url, encoding) + if tag == "a": + if token.type != "startendtag": + # hmm, this'd break if end tag is missing + text = p.get_compressed_text(("endtag", tag)) + # but this doesn't work for eg. + # Andy + #text = p.get_compressed_text() + + yield Link(base_url, url, text, tag, token.attrs) + except sgmllib.SGMLParseError, exc: + raise ParseError(exc) + +class FormsFactory: + + """Makes a sequence of objects satisfying ClientForm.HTMLForm interface. 
+
+    After calling .forms(), the .global_form attribute is a form object
+    containing all controls that are not descendants of any FORM element.
+
+    For constructor argument docs, see ClientForm.ParseResponse
+    argument docs.
+
+    """
+
+    def __init__(self,
+                 select_default=False,
+                 form_parser_class=None,
+                 request_class=None,
+                 backwards_compat=False,
+                 ):
+        import ClientForm
+        self.select_default = select_default
+        if form_parser_class is None:
+            form_parser_class = ClientForm.FormParser
+        self.form_parser_class = form_parser_class
+        if request_class is None:
+            request_class = _request.Request
+        self.request_class = request_class
+        self.backwards_compat = backwards_compat
+        self._response = None
+        self.encoding = None
+        self.global_form = None
+
+    def set_response(self, response, encoding):
+        self._response = response
+        self.encoding = encoding
+        self.global_form = None
+
+    def forms(self):
+        import ClientForm
+        encoding = self.encoding
+        try:
+            forms = ClientForm.ParseResponseEx(
+                self._response,
+                select_default=self.select_default,
+                form_parser_class=self.form_parser_class,
+                request_class=self.request_class,
+                encoding=encoding,
+                _urljoin=_rfc3986.urljoin,
+                _urlparse=_rfc3986.urlsplit,
+                _urlunparse=_rfc3986.urlunsplit,
+                )
+        except ClientForm.ParseError, exc:
+            raise ParseError(exc)
+        self.global_form = forms[0]
+        return forms[1:]
+
+class TitleFactory:
+    def __init__(self):
+        self._response = self._encoding = None
+
+    def set_response(self, response, encoding):
+        self._response = response
+        self._encoding = encoding
+
+    def title(self):
+        import _pullparser
+        p = _pullparser.TolerantPullParser(
+            self._response, encoding=self._encoding)
+        try:
+            try:
+                p.get_tag("title")
+            except _pullparser.NoMoreTokensError:
+                return None
+            else:
+                return p.get_text()
+        except sgmllib.SGMLParseError, exc:
+            raise ParseError(exc)
+
+
+def unescape(data, entities, encoding):
+    if data is None or "&" not in data:
+        return data
+
+    def replace_entities(match):
+        ent = match.group()
+        if ent[1] == "#":
+            return unescape_charref(ent[2:-1], encoding)
+
+        repl = entities.get(ent[1:-1])
+        if repl is not None:
+            repl = unichr(repl)
+            if type(repl) != type(""):
+                try:
+                    repl = repl.encode(encoding)
+                except UnicodeError:
+                    repl = ent
+        else:
+            repl = ent
+        return repl
+
+    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
+
+def unescape_charref(data, encoding):
+    name, base = data, 10
+    if name.startswith("x"):
+        name, base = name[1:], 16
+    uc = unichr(int(name, base))
+    if encoding is None:
+        return uc
+    else:
+        try:
+            repl = uc.encode(encoding)
+        except UnicodeError:
+            repl = "&#%s;" % data
+        return repl
+
+
+# bizarre import gymnastics for bundled BeautifulSoup
+import _beautifulsoup
+import ClientForm
+RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
+    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
+    )
+# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
+import sgmllib
+sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
+
+class MechanizeBs(_beautifulsoup.BeautifulSoup):
+    _entitydefs = htmlentitydefs.name2codepoint
+    # don't want the magic Microsoft-char workaround
+    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+                       lambda(x):x.group(1) + ' />'),
+                      (re.compile('<!\s+([^<>]*)>'),
+                       lambda(x):'<!' + x.group(1) + '>')
+                      ]
+
+    def __init__(self, encoding, text=None, avoidParserProblems=True,
+                 initialTextIsEverything=True):
+        self._encoding = encoding
+        _beautifulsoup.BeautifulSoup.__init__(
+            self, text, avoidParserProblems, initialTextIsEverything)
+
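+    # The two handlers below decode character and entity references to text
+    # in the target encoding while the soup is parsed; for example
+    # (illustrative), unescape("&eacute;", self._entitydefs, "latin-1")
+    # returns "\xe9".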
def handle_charref(self, ref): + t = unescape("&#%s;"%ref, self._entitydefs, self._encoding) + self.handle_data(t) + def handle_entityref(self, ref): + t = unescape("&%s;"%ref, self._entitydefs, self._encoding) + self.handle_data(t) + def unescape_attrs(self, attrs): + escaped_attrs = [] + for key, val in attrs: + val = unescape(val, self._entitydefs, self._encoding) + escaped_attrs.append((key, val)) + return escaped_attrs + +class RobustLinksFactory: + + compress_re = re.compile(r"\s+") + + def __init__(self, + link_parser_class=None, + link_class=Link, + urltags=None, + ): + import _beautifulsoup + if link_parser_class is None: + link_parser_class = MechanizeBs + self.link_parser_class = link_parser_class + self.link_class = link_class + if urltags is None: + urltags = { + "a": "href", + "area": "href", + "frame": "src", + "iframe": "src", + } + self.urltags = urltags + self._bs = None + self._encoding = None + self._base_url = None + + def set_soup(self, soup, base_url, encoding): + self._bs = soup + self._base_url = base_url + self._encoding = encoding + + def links(self): + import _beautifulsoup + bs = self._bs + base_url = self._base_url + encoding = self._encoding + gen = bs.recursiveChildGenerator() + for ch in bs.recursiveChildGenerator(): + if (isinstance(ch, _beautifulsoup.Tag) and + ch.name in self.urltags.keys()+["base"]): + link = ch + attrs = bs.unescape_attrs(link.attrs) + attrs_dict = dict(attrs) + if link.name == "base": + base_href = attrs_dict.get("href") + if base_href is not None: + base_url = base_href + continue + url_attr = self.urltags[link.name] + url = attrs_dict.get(url_attr) + if not url: + continue + url = _rfc3986.clean_url(url, encoding) + text = link.firstText(lambda t: True) + if text is _beautifulsoup.Null: + # follow _pullparser's weird behaviour rigidly + if link.name == "a": + text = "" + else: + text = None + else: + text = self.compress_re.sub(" ", text.strip()) + yield Link(base_url, url, text, link.name, attrs) + + +class RobustFormsFactory(FormsFactory): + def __init__(self, *args, **kwds): + import ClientForm + args = form_parser_args(*args, **kwds) + if args.form_parser_class is None: + args.form_parser_class = RobustFormParser + FormsFactory.__init__(self, **args.dictionary) + + def set_response(self, response, encoding): + self._response = response + self.encoding = encoding + + +class RobustTitleFactory: + def __init__(self): + self._bs = self._encoding = None + + def set_soup(self, soup, encoding): + self._bs = soup + self._encoding = encoding + + def title(self): + import _beautifulsoup + title = self._bs.first("title") + if title == _beautifulsoup.Null: + return None + else: + return title.firstText(lambda t: True) + + +class Factory: + """Factory for forms, links, etc. + + This interface may expand in future. + + Public methods: + + set_request_class(request_class) + set_response(response) + forms() + links() + + Public attributes: + + Note that accessing these attributes may raise ParseError. + + encoding: string specifying the encoding of response if it contains a text + document (this value is left unspecified for documents that do not have + an encoding, e.g. 
an image file) + is_html: true if response contains an HTML document (XHTML may be + regarded as HTML too) + title: page title, or None if no title or not HTML + global_form: form object containing all controls that are not descendants + of any FORM element, or None if the forms_factory does not support + supplying a global form + + """ + + LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"] + + def __init__(self, forms_factory, links_factory, title_factory, + encoding_finder=EncodingFinder(DEFAULT_ENCODING), + response_type_finder=ResponseTypeFinder(allow_xhtml=False), + ): + """ + + Pass keyword arguments only. + + default_encoding: character encoding to use if encoding cannot be + determined (or guessed) from the response. You should turn on + HTTP-EQUIV handling if you want the best chance of getting this right + without resorting to this default. The default value of this + parameter (currently latin-1) may change in future. + + """ + self._forms_factory = forms_factory + self._links_factory = links_factory + self._title_factory = title_factory + self._encoding_finder = encoding_finder + self._response_type_finder = response_type_finder + + self.set_response(None) + + def set_request_class(self, request_class): + """Set urllib2.Request class. + + ClientForm.HTMLForm instances returned by .forms() will return + instances of this class when .click()ed. + + """ + self._forms_factory.request_class = request_class + + def set_response(self, response): + """Set response. + + The response must either be None or implement the same interface as + objects returned by urllib2.urlopen(). + + """ + self._response = response + self._forms_genf = self._links_genf = None + self._get_title = None + for name in self.LAZY_ATTRS: + try: + delattr(self, name) + except AttributeError: + pass + + def __getattr__(self, name): + if name not in self.LAZY_ATTRS: + return getattr(self.__class__, name) + + if name == "encoding": + self.encoding = self._encoding_finder.encoding( + copy.copy(self._response)) + return self.encoding + elif name == "is_html": + self.is_html = self._response_type_finder.is_html( + copy.copy(self._response), self.encoding) + return self.is_html + elif name == "title": + if self.is_html: + self.title = self._title_factory.title() + else: + self.title = None + return self.title + elif name == "global_form": + self.forms() + return self.global_form + + def forms(self): + """Return iterable over ClientForm.HTMLForm-like objects. + + Raises mechanize.ParseError on failure. + """ + # this implementation sets .global_form as a side-effect, for benefit + # of __getattr__ impl + if self._forms_genf is None: + try: + self._forms_genf = CachingGeneratorFunction( + self._forms_factory.forms()) + except: # XXXX define exception! + self.set_response(self._response) + raise + self.global_form = getattr( + self._forms_factory, "global_form", None) + return self._forms_genf() + + def links(self): + """Return iterable over mechanize.Link-like objects. + + Raises mechanize.ParseError on failure. + """ + if self._links_genf is None: + try: + self._links_genf = CachingGeneratorFunction( + self._links_factory.links()) + except: # XXXX define exception! 
+ self.set_response(self._response) + raise + return self._links_genf() + +class DefaultFactory(Factory): + """Based on sgmllib.""" + def __init__(self, i_want_broken_xhtml_support=False): + Factory.__init__( + self, + forms_factory=FormsFactory(), + links_factory=LinksFactory(), + title_factory=TitleFactory(), + response_type_finder=ResponseTypeFinder( + allow_xhtml=i_want_broken_xhtml_support), + ) + + def set_response(self, response): + Factory.set_response(self, response) + if response is not None: + self._forms_factory.set_response( + copy.copy(response), self.encoding) + self._links_factory.set_response( + copy.copy(response), response.geturl(), self.encoding) + self._title_factory.set_response( + copy.copy(response), self.encoding) + +class RobustFactory(Factory): + """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is + DefaultFactory. + + """ + def __init__(self, i_want_broken_xhtml_support=False, + soup_class=None): + Factory.__init__( + self, + forms_factory=RobustFormsFactory(), + links_factory=RobustLinksFactory(), + title_factory=RobustTitleFactory(), + response_type_finder=ResponseTypeFinder( + allow_xhtml=i_want_broken_xhtml_support), + ) + if soup_class is None: + soup_class = MechanizeBs + self._soup_class = soup_class + + def set_response(self, response): + import _beautifulsoup + Factory.set_response(self, response) + if response is not None: + data = response.read() + soup = self._soup_class(self.encoding, data) + self._forms_factory.set_response( + copy.copy(response), self.encoding) + self._links_factory.set_soup( + soup, response.geturl(), self.encoding) + self._title_factory.set_soup(soup, self.encoding) diff --git a/src/calibre/utils/mechanize/_http.py b/src/calibre/utils/mechanize/_http.py new file mode 100644 index 0000000000..d73f3f44e5 --- /dev/null +++ b/src/calibre/utils/mechanize/_http.py @@ -0,0 +1,729 @@ +"""HTTP related handlers. + +Note that some other HTTP handlers live in more specific modules: _auth.py, +_gzip.py, etc. + + +Copyright 2002-2006 John J Lee + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import copy, time, tempfile, htmlentitydefs, re, logging, socket, \ + urllib2, urllib, httplib, sgmllib +from urllib2 import URLError, HTTPError, BaseHandler +from cStringIO import StringIO + +from _request import Request +from _util import isstringlike +from _response import closeable_response, response_seek_wrapper +from _html import unescape, unescape_charref +from _headersutil import is_html +from _clientcookie import CookieJar, request_host +import _rfc3986 + +debug = logging.getLogger("mechanize").debug + +# monkeypatch urllib2.HTTPError to show URL +## def urllib2_str(self): +## return 'HTTP Error %s: %s (%s)' % ( +## self.code, self.msg, self.geturl()) +## urllib2.HTTPError.__str__ = urllib2_str + + +CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes +DEFAULT_ENCODING = 'latin-1' + + +# This adds "refresh" to the list of redirectables and provides a redirection +# algorithm that doesn't go into a loop in the presence of cookies +# (Python 2.4 has this new algorithm, 2.3 doesn't). 
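+# The loop detection implemented below, roughly sketched: each redirected
+# Request carries a redirect_dict mapping URL -> visit count, and
+# redirection is abandoned with an HTTPError once any single URL has been
+# seen max_repeats times, or max_redirections distinct URLs have been
+# visited in one chain.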
+class HTTPRedirectHandler(BaseHandler): + # maximum number of redirections to any single URL + # this is needed because of the state that cookies introduce + max_repeats = 4 + # maximum total number of redirections (regardless of URL) before + # assuming we're in a loop + max_redirections = 10 + + # Implementation notes: + + # To avoid the server sending us into an infinite loop, the request + # object needs to track what URLs we have already seen. Do this by + # adding a handler-specific attribute to the Request object. The value + # of the dict is used to count the number of times the same URL has + # been visited. This is needed because visiting the same URL twice + # does not necessarily imply a loop, thanks to state introduced by + # cookies. + + # Always unhandled redirection codes: + # 300 Multiple Choices: should not handle this here. + # 304 Not Modified: no need to handle here: only of interest to caches + # that do conditional GETs + # 305 Use Proxy: probably not worth dealing with here + # 306 Unused: what was this for in the previous versions of protocol?? + + def redirect_request(self, newurl, req, fp, code, msg, headers): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a redirection + response is received. If a redirection should take place, return a + new Request to allow http_error_30x to perform the redirect; + otherwise, return None to indicate that an HTTPError should be + raised. + + """ + if code in (301, 302, 303, "refresh") or \ + (code == 307 and not req.has_data()): + # Strictly (according to RFC 2616), 301 or 302 in response to + # a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib2, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + # XXX really refresh redirections should be visiting; tricky to + # fix, so this will wait until post-stable release + new = Request(newurl, + headers=req.headers, + origin_req_host=req.get_origin_req_host(), + unverifiable=True, + visit=False, + ) + new._origin_req = getattr(req, "_origin_req", req) + return new + else: + raise HTTPError(req.get_full_url(), code, msg, headers, fp) + + def http_error_302(self, req, fp, code, msg, headers): + # Some servers (incorrectly) return multiple Location headers + # (so probably same goes for URI). Use first header. + if headers.has_key('location'): + newurl = headers.getheaders('location')[0] + elif headers.has_key('uri'): + newurl = headers.getheaders('uri')[0] + else: + return + newurl = _rfc3986.clean_url(newurl, "latin-1") + newurl = _rfc3986.urljoin(req.get_full_url(), newurl) + + # XXX Probably want to forget about the state of the current + # request, although that might interact poorly with other + # handlers that also use handler-specific request attributes + new = self.redirect_request(newurl, req, fp, code, msg, headers) + if new is None: + return + + # loop detection + # .redirect_dict has a key url if url was previously visited. + if hasattr(req, 'redirect_dict'): + visited = new.redirect_dict = req.redirect_dict + if (visited.get(newurl, 0) >= self.max_repeats or + len(visited) >= self.max_redirections): + raise HTTPError(req.get_full_url(), code, + self.inf_msg + msg, headers, fp) + else: + visited = new.redirect_dict = req.redirect_dict = {} + visited[newurl] = visited.get(newurl, 0) + 1 + + # Don't close the fp until we are sure that we won't use it + # with HTTPError. 
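+        # (the read() below drains any remaining body data, so the
+        # connection is not left with unread data when it is closed)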
+ fp.read() + fp.close() + + return self.parent.open(new) + + http_error_301 = http_error_303 = http_error_307 = http_error_302 + http_error_refresh = http_error_302 + + inf_msg = "The HTTP server returned a redirect error that would " \ + "lead to an infinite loop.\n" \ + "The last 30x error message was:\n" + + +# XXX would self.reset() work, instead of raising this exception? +class EndOfHeadError(Exception): pass +class AbstractHeadParser: + # only these elements are allowed in or before HEAD of document + head_elems = ("html", "head", + "title", "base", + "script", "style", "meta", "link", "object") + _entitydefs = htmlentitydefs.name2codepoint + _encoding = DEFAULT_ENCODING + + def __init__(self): + self.http_equiv = [] + + def start_meta(self, attrs): + http_equiv = content = None + for key, value in attrs: + if key == "http-equiv": + http_equiv = self.unescape_attr_if_required(value) + elif key == "content": + content = self.unescape_attr_if_required(value) + if http_equiv is not None and content is not None: + self.http_equiv.append((http_equiv, content)) + + def end_head(self): + raise EndOfHeadError() + + def handle_entityref(self, name): + #debug("%s", name) + self.handle_data(unescape( + '&%s;' % name, self._entitydefs, self._encoding)) + + def handle_charref(self, name): + #debug("%s", name) + self.handle_data(unescape_charref(name, self._encoding)) + + def unescape_attr(self, name): + #debug("%s", name) + return unescape(name, self._entitydefs, self._encoding) + + def unescape_attrs(self, attrs): + #debug("%s", attrs) + escaped_attrs = {} + for key, val in attrs.items(): + escaped_attrs[key] = self.unescape_attr(val) + return escaped_attrs + + def unknown_entityref(self, ref): + self.handle_data("&%s;" % ref) + + def unknown_charref(self, ref): + self.handle_data("&#%s;" % ref) + + +try: + import HTMLParser +except ImportError: + pass +else: + class XHTMLCompatibleHeadParser(AbstractHeadParser, + HTMLParser.HTMLParser): + def __init__(self): + HTMLParser.HTMLParser.__init__(self) + AbstractHeadParser.__init__(self) + + def handle_starttag(self, tag, attrs): + if tag not in self.head_elems: + raise EndOfHeadError() + try: + method = getattr(self, 'start_' + tag) + except AttributeError: + try: + method = getattr(self, 'do_' + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + if tag not in self.head_elems: + raise EndOfHeadError() + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + pass # unknown tag + else: + method() + + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. 
+ return self.unescape_attr(name) + + def unescape_attr_if_required(self, name): + return name # HTMLParser.HTMLParser already did it + +class HeadParser(AbstractHeadParser, sgmllib.SGMLParser): + + def _not_called(self): + assert False + + def __init__(self): + sgmllib.SGMLParser.__init__(self) + AbstractHeadParser.__init__(self) + + def handle_starttag(self, tag, method, attrs): + if tag not in self.head_elems: + raise EndOfHeadError() + if tag == "meta": + method(attrs) + + def unknown_starttag(self, tag, attrs): + self.handle_starttag(tag, self._not_called, attrs) + + def handle_endtag(self, tag, method): + if tag in self.head_elems: + method() + else: + raise EndOfHeadError() + + def unescape_attr_if_required(self, name): + return self.unescape_attr(name) + +def parse_head(fileobj, parser): + """Return a list of key, value pairs.""" + while 1: + data = fileobj.read(CHUNK) + try: + parser.feed(data) + except EndOfHeadError: + break + if len(data) != CHUNK: + # this should only happen if there is no HTML body, or if + # CHUNK is big + break + return parser.http_equiv + +class HTTPEquivProcessor(BaseHandler): + """Append META HTTP-EQUIV headers to regular HTTP headers.""" + + handler_order = 300 # before handlers that look at HTTP headers + + def __init__(self, head_parser_class=HeadParser, + i_want_broken_xhtml_support=False, + ): + self.head_parser_class = head_parser_class + self._allow_xhtml = i_want_broken_xhtml_support + + def http_response(self, request, response): + if not hasattr(response, "seek"): + response = response_seek_wrapper(response) + http_message = response.info() + url = response.geturl() + ct_hdrs = http_message.getheaders("content-type") + if is_html(ct_hdrs, url, self._allow_xhtml): + try: + try: + html_headers = parse_head(response, self.head_parser_class()) + finally: + response.seek(0) + except (HTMLParser.HTMLParseError, + sgmllib.SGMLParseError): + pass + else: + for hdr, val in html_headers: + # add a header + http_message.dict[hdr.lower()] = val + text = hdr + ": " + val + for line in text.split("\n"): + http_message.headers.append(line + "\n") + return response + + https_response = http_response + +class HTTPCookieProcessor(BaseHandler): + """Handle HTTP cookies. 
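+
+    A minimal usage sketch (the URL is illustrative):
+
+      import urllib2
+      from _clientcookie import CookieJar
+      jar = CookieJar()
+      opener = urllib2.build_opener(HTTPCookieProcessor(jar))
+      response = opener.open("http://example.com/")
+      # cookies set by the response are now in jar and will be sent
+      # back on later requests made through this opener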
+ + Public attributes: + + cookiejar: CookieJar instance + + """ + def __init__(self, cookiejar=None): + if cookiejar is None: + cookiejar = CookieJar() + self.cookiejar = cookiejar + + def http_request(self, request): + self.cookiejar.add_cookie_header(request) + return request + + def http_response(self, request, response): + self.cookiejar.extract_cookies(response, request) + return response + + https_request = http_request + https_response = http_response + +try: + import robotparser +except ImportError: + pass +else: + class MechanizeRobotFileParser(robotparser.RobotFileParser): + + def __init__(self, url='', opener=None): + import _opener + robotparser.RobotFileParser.__init__(self, url) + self._opener = opener + + def set_opener(self, opener=None): + if opener is None: + opener = _opener.OpenerDirector() + self._opener = opener + + def read(self): + """Reads the robots.txt URL and feeds it to the parser.""" + if self._opener is None: + self.set_opener() + req = Request(self.url, unverifiable=True, visit=False) + try: + f = self._opener.open(req) + except HTTPError, f: + pass + except (IOError, socket.error, OSError), exc: + robotparser._debug("ignoring error opening %r: %s" % + (self.url, exc)) + return + lines = [] + line = f.readline() + while line: + lines.append(line.strip()) + line = f.readline() + status = f.code + if status == 401 or status == 403: + self.disallow_all = True + robotparser._debug("disallow all") + elif status >= 400: + self.allow_all = True + robotparser._debug("allow all") + elif status == 200 and lines: + robotparser._debug("parse lines") + self.parse(lines) + + class RobotExclusionError(urllib2.HTTPError): + def __init__(self, request, *args): + apply(urllib2.HTTPError.__init__, (self,)+args) + self.request = request + + class HTTPRobotRulesProcessor(BaseHandler): + # before redirections, after everything else + handler_order = 800 + + try: + from httplib import HTTPMessage + except: + from mimetools import Message + http_response_class = Message + else: + http_response_class = HTTPMessage + + def __init__(self, rfp_class=MechanizeRobotFileParser): + self.rfp_class = rfp_class + self.rfp = None + self._host = None + + def http_request(self, request): + scheme = request.get_type() + if scheme not in ["http", "https"]: + # robots exclusion only applies to HTTP + return request + + if request.get_selector() == "/robots.txt": + # /robots.txt is always OK to fetch + return request + + host = request.get_host() + + # robots.txt requests don't need to be allowed by robots.txt :-) + origin_req = getattr(request, "_origin_req", None) + if (origin_req is not None and + origin_req.get_selector() == "/robots.txt" and + origin_req.get_host() == host + ): + return request + + if host != self._host: + self.rfp = self.rfp_class() + try: + self.rfp.set_opener(self.parent) + except AttributeError: + debug("%r instance does not support set_opener" % + self.rfp.__class__) + self.rfp.set_url(scheme+"://"+host+"/robots.txt") + self.rfp.read() + self._host = host + + ua = request.get_header("User-agent", "") + if self.rfp.can_fetch(ua, request.get_full_url()): + return request + else: + # XXX This should really have raised URLError. Too late now... + msg = "request disallowed by robots.txt" + raise RobotExclusionError( + request, + request.get_full_url(), + 403, msg, + self.http_response_class(StringIO()), StringIO(msg)) + + https_request = http_request + +class HTTPRefererProcessor(BaseHandler): + """Add Referer header to requests. 
+ + This only makes sense if you use each RefererProcessor for a single + chain of requests only (so, for example, if you use a single + HTTPRefererProcessor to fetch a series of URLs extracted from a single + page, this will break). + + There's a proper implementation of this in mechanize.Browser. + + """ + def __init__(self): + self.referer = None + + def http_request(self, request): + if ((self.referer is not None) and + not request.has_header("Referer")): + request.add_unredirected_header("Referer", self.referer) + return request + + def http_response(self, request, response): + self.referer = response.geturl() + return response + + https_request = http_request + https_response = http_response + + +def clean_refresh_url(url): + # e.g. Firefox 1.5 does (something like) this + if ((url.startswith('"') and url.endswith('"')) or + (url.startswith("'") and url.endswith("'"))): + url = url[1:-1] + return _rfc3986.clean_url(url, "latin-1") # XXX encoding + +def parse_refresh_header(refresh): + """ + >>> parse_refresh_header("1; url=http://example.com/") + (1.0, 'http://example.com/') + >>> parse_refresh_header("1; url='http://example.com/'") + (1.0, 'http://example.com/') + >>> parse_refresh_header("1") + (1.0, None) + >>> parse_refresh_header("blah") + Traceback (most recent call last): + ValueError: invalid literal for float(): blah + + """ + + ii = refresh.find(";") + if ii != -1: + pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:] + jj = newurl_spec.find("=") + key = None + if jj != -1: + key, newurl = newurl_spec[:jj], newurl_spec[jj+1:] + newurl = clean_refresh_url(newurl) + if key is None or key.strip().lower() != "url": + raise ValueError() + else: + pause, newurl = float(refresh), None + return pause, newurl + +class HTTPRefreshProcessor(BaseHandler): + """Perform HTTP Refresh redirections. + + Note that if a non-200 HTTP code has occurred (for example, a 30x + redirect), this processor will do nothing. + + By default, only zero-time Refresh headers are redirected. Use the + max_time attribute / constructor argument to allow Refresh with longer + pauses. Use the honor_time attribute / constructor argument to control + whether the requested pause is honoured (with a time.sleep()) or + skipped in favour of immediate redirection. + + Public attributes: + + max_time: see above + honor_time: see above + + """ + handler_order = 1000 + + def __init__(self, max_time=0, honor_time=True): + self.max_time = max_time + self.honor_time = honor_time + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + if code == 200 and hdrs.has_key("refresh"): + refresh = hdrs.getheaders("refresh")[0] + try: + pause, newurl = parse_refresh_header(refresh) + except ValueError: + debug("bad Refresh header: %r" % refresh) + return response + if newurl is None: + newurl = response.geturl() + if (self.max_time is None) or (pause <= self.max_time): + if pause > 1E-3 and self.honor_time: + time.sleep(pause) + hdrs["location"] = newurl + # hardcoded http is NOT a bug + response = self.parent.error( + "http", request, response, + "refresh", msg, hdrs) + + return response + + https_response = http_response + +class HTTPErrorProcessor(BaseHandler): + """Process HTTP error responses. + + The purpose of this handler is to to allow other response processors a + look-in by removing the call to parent.error() from + AbstractHTTPHandler. 
+ + For non-200 error codes, this just passes the job on to the + Handler._error_ methods, via the OpenerDirector.error + method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an + HTTPError if no other handler handles the error. + + """ + handler_order = 1000 # after all other processors + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + if code != 200: + # hardcoded http is NOT a bug + response = self.parent.error( + "http", request, response, code, msg, hdrs) + + return response + + https_response = http_response + + +class HTTPDefaultErrorHandler(BaseHandler): + def http_error_default(self, req, fp, code, msg, hdrs): + # why these error methods took the code, msg, headers args in the first + # place rather than a response object, I don't know, but to avoid + # multiple wrapping, we're discarding them + + if isinstance(fp, urllib2.HTTPError): + response = fp + else: + response = urllib2.HTTPError( + req.get_full_url(), code, msg, hdrs, fp) + assert code == response.code + assert msg == response.msg + assert hdrs == response.hdrs + raise response + + +class AbstractHTTPHandler(BaseHandler): + + def __init__(self, debuglevel=0): + self._debuglevel = debuglevel + + def set_http_debuglevel(self, level): + self._debuglevel = level + + def do_request_(self, request): + host = request.get_host() + if not host: + raise URLError('no host given') + + if request.has_data(): # POST + data = request.get_data() + if not request.has_header('Content-type'): + request.add_unredirected_header( + 'Content-type', + 'application/x-www-form-urlencoded') + + scheme, sel = urllib.splittype(request.get_selector()) + sel_host, sel_path = urllib.splithost(sel) + if not request.has_header('Host'): + request.add_unredirected_header('Host', sel_host or host) + for name, value in self.parent.addheaders: + name = name.capitalize() + if not request.has_header(name): + request.add_unredirected_header(name, value) + + return request + + def do_open(self, http_class, req): + """Return an addinfourl object for the request, using http_class. + + http_class must implement the HTTPConnection API from httplib. + The addinfourl return value is a file-like object. It also + has methods and attributes including: + - info(): return a mimetools.Message object for the headers + - geturl(): return the original request URL + - code: HTTP status code + """ + host = req.get_host() + if not host: + raise URLError('no host given') + + h = http_class(host) # will parse host:port + h.set_debuglevel(self._debuglevel) + + headers = dict(req.headers) + headers.update(req.unredirected_hdrs) + # We want to make an HTTP/1.1 request, but the addinfourl + # class isn't prepared to deal with a persistent connection. + # It will try to read all remaining data from the socket, + # which will block while the server waits for the next request. + # So make sure the connection gets closed after the (only) + # request. + headers["Connection"] = "close" + headers = dict( + [(name.title(), val) for name, val in headers.items()]) + try: + h.request(req.get_method(), req.get_selector(), req.data, headers) + r = h.getresponse() + except socket.error, err: # XXX what error? + raise URLError(err) + + # Pick apart the HTTPResponse object to get the addinfourl + # object initialized properly. + + # Wrap the HTTPResponse object in socket's file object adapter + # for Windows. That adapter calls recv(), so delegate recv() + # to read(). 
This weird wrapping allows the returned object to + # have readline() and readlines() methods. + + # XXX It might be better to extract the read buffering code + # out of socket._fileobject() and into a base class. + + r.recv = r.read + fp = socket._fileobject(r) + + resp = closeable_response(fp, r.msg, req.get_full_url(), + r.status, r.reason) + return resp + + +class HTTPHandler(AbstractHTTPHandler): + def http_open(self, req): + return self.do_open(httplib.HTTPConnection, req) + + http_request = AbstractHTTPHandler.do_request_ + +if hasattr(httplib, 'HTTPS'): + + class HTTPSConnectionFactory: + def __init__(self, key_file, cert_file): + self._key_file = key_file + self._cert_file = cert_file + def __call__(self, hostport): + return httplib.HTTPSConnection( + hostport, + key_file=self._key_file, cert_file=self._cert_file) + + class HTTPSHandler(AbstractHTTPHandler): + def __init__(self, client_cert_manager=None): + AbstractHTTPHandler.__init__(self) + self.client_cert_manager = client_cert_manager + + def https_open(self, req): + if self.client_cert_manager is not None: + key_file, cert_file = self.client_cert_manager.find_key_cert( + req.get_full_url()) + conn_factory = HTTPSConnectionFactory(key_file, cert_file) + else: + conn_factory = httplib.HTTPSConnection + return self.do_open(conn_factory, req) + + https_request = AbstractHTTPHandler.do_request_ diff --git a/src/calibre/utils/mechanize/_lwpcookiejar.py b/src/calibre/utils/mechanize/_lwpcookiejar.py new file mode 100644 index 0000000000..f8d49cf2d4 --- /dev/null +++ b/src/calibre/utils/mechanize/_lwpcookiejar.py @@ -0,0 +1,185 @@ +"""Load / save to libwww-perl (LWP) format files. + +Actually, the format is slightly extended from that used by LWP's +(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information +not recorded by LWP. + +It uses the version string "2.0", though really there isn't an LWP Cookies +2.0 format. This indicates that there is extra information in here +(domain_dot and port_spec) while still being compatible with libwww-perl, +I hope. + +Copyright 2002-2006 John J Lee +Copyright 1997-1999 Gisle Aas (original libwww-perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import time, re, logging + +from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError +from _headersutil import join_header_words, split_header_words +from _util import iso2time, time2isoz + +debug = logging.getLogger("mechanize").debug + + +def lwp_cookie_str(cookie): + """Return string representation of Cookie in an the LWP cookie file format. + + Actually, the format is extended a bit -- see module docstring. 
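+
+    For a typical session cookie the result looks something like this
+    (illustrative; the attribute order is fixed by the code below):
+
+        foo=bar; path="/"; domain="example.com"; path_spec; discard; version=0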
+
+    """
+    h = [(cookie.name, cookie.value),
+         ("path", cookie.path),
+         ("domain", cookie.domain)]
+    if cookie.port is not None: h.append(("port", cookie.port))
+    if cookie.path_specified: h.append(("path_spec", None))
+    if cookie.port_specified: h.append(("port_spec", None))
+    if cookie.domain_initial_dot: h.append(("domain_dot", None))
+    if cookie.secure: h.append(("secure", None))
+    if cookie.expires: h.append(("expires",
+                                 time2isoz(float(cookie.expires))))
+    if cookie.discard: h.append(("discard", None))
+    if cookie.comment: h.append(("comment", cookie.comment))
+    if cookie.comment_url: h.append(("commenturl", cookie.comment_url))
+    if cookie.rfc2109: h.append(("rfc2109", None))
+
+    keys = cookie.nonstandard_attr_keys()
+    keys.sort()
+    for k in keys:
+        h.append((k, str(cookie.get_nonstandard_attr(k))))
+
+    h.append(("version", str(cookie.version)))
+
+    return join_header_words([h])
+
+class LWPCookieJar(FileCookieJar):
+    """
+    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
+    "Set-Cookie3" is the format used by the libwww-perl library, not known
+    to be compatible with any browser, but which is easy to read and
+    doesn't lose information about RFC 2965 cookies.
+
+    Additional methods
+
+    as_lwp_str(ignore_discard=True, ignore_expired=True)
+
+    """
+
+    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
+
+    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
+        """Return cookies as a string of "\n"-separated "Set-Cookie3" headers.
+
+        ignore_discard and ignore_expires: see docstring for FileCookieJar.save
+
+        """
+        now = time.time()
+        r = []
+        for cookie in self:
+            if not ignore_discard and cookie.discard:
+                debug("   Not saving %s: marked for discard", cookie.name)
+                continue
+            if not ignore_expires and cookie.is_expired(now):
+                debug("   Not saving %s: expired", cookie.name)
+                continue
+            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
+        return "\n".join(r+[""])
+
+    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
+        if filename is None:
+            if self.filename is not None: filename = self.filename
+            else: raise ValueError(MISSING_FILENAME_TEXT)
+
+        f = open(filename, "w")
+        try:
+            debug("Saving LWP cookies file")
+            # There really isn't an LWP Cookies 2.0 format, but this indicates
+            # that there is extra information in here (domain_dot and
+            # port_spec) while still being compatible with libwww-perl, I hope.
+ f.write("#LWP-Cookies-2.0\n") + f.write(self.as_lwp_str(ignore_discard, ignore_expires)) + finally: + f.close() + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + magic = f.readline() + if not re.search(self.magic_re, magic): + msg = "%s does not seem to contain cookies" % filename + raise LoadError(msg) + + now = time.time() + + header = "Set-Cookie3:" + boolean_attrs = ("port_spec", "path_spec", "domain_dot", + "secure", "discard", "rfc2109") + value_attrs = ("version", + "port", "path", "domain", + "expires", + "comment", "commenturl") + + try: + while 1: + line = f.readline() + if line == "": break + if not line.startswith(header): + continue + line = line[len(header):].strip() + + for data in split_header_words([line]): + name, value = data[0] + standard = {} + rest = {} + for k in boolean_attrs: + standard[k] = False + for k, v in data[1:]: + if k is not None: + lc = k.lower() + else: + lc = None + # don't lose case distinction for unknown fields + if (lc in value_attrs) or (lc in boolean_attrs): + k = lc + if k in boolean_attrs: + if v is None: v = True + standard[k] = v + elif k in value_attrs: + standard[k] = v + else: + rest[k] = v + + h = standard.get + expires = h("expires") + discard = h("discard") + if expires is not None: + expires = iso2time(expires) + if expires is None: + discard = True + domain = h("domain") + domain_specified = domain.startswith(".") + c = Cookie(h("version"), name, value, + h("port"), h("port_spec"), + domain, domain_specified, h("domain_dot"), + h("path"), h("path_spec"), + h("secure"), + expires, + discard, + h("comment"), + h("commenturl"), + rest, + h("rfc2109"), + ) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + except: + reraise_unmasked_exceptions((IOError,)) + raise LoadError("invalid Set-Cookie3 format file %s" % filename) + diff --git a/src/calibre/utils/mechanize/_mechanize.py b/src/calibre/utils/mechanize/_mechanize.py new file mode 100644 index 0000000000..a9b8d9e0b5 --- /dev/null +++ b/src/calibre/utils/mechanize/_mechanize.py @@ -0,0 +1,656 @@ +"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize. + +Copyright 2003-2006 John J. Lee +Copyright 2003 Andy Lester (original Perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +import urllib2, sys, copy, re + +from _useragent import UserAgentBase +from _html import DefaultFactory +import _response +import _request +import _rfc3986 + +__version__ = (0, 1, 7, "b", None) # 0.1.7b + +class BrowserStateError(Exception): pass +class LinkNotFoundError(Exception): pass +class FormNotFoundError(Exception): pass + + +class History: + """ + + Though this will become public, the implied interface is not yet stable. + + """ + def __init__(self): + self._history = [] # LIFO + def add(self, request, response): + self._history.append((request, response)) + def back(self, n, _response): + response = _response # XXX move Browser._response into this class? 
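+        # pop n entries off the history, and keep popping past any entry
+        # whose response is None (a visit that never produced a response)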
+ while n > 0 or response is None: + try: + request, response = self._history.pop() + except IndexError: + raise BrowserStateError("already at start of history") + n -= 1 + return request, response + def clear(self): + del self._history[:] + def close(self): + for request, response in self._history: + if response is not None: + response.close() + del self._history[:] + + +class HTTPRefererProcessor(urllib2.BaseHandler): + def http_request(self, request): + # See RFC 2616 14.36. The only times we know the source of the + # request URI has a URI associated with it are redirect, and + # Browser.click() / Browser.submit() / Browser.follow_link(). + # Otherwise, it's the user's job to add any Referer header before + # .open()ing. + if hasattr(request, "redirect_dict"): + request = self.parent._add_referer_header( + request, origin_request=False) + return request + + https_request = http_request + + +class Browser(UserAgentBase): + """Browser-like class with support for history, forms and links. + + BrowserStateError is raised whenever the browser is in the wrong state to + complete the requested operation - eg., when .back() is called when the + browser history is empty, or when .follow_link() is called when the current + response does not contain HTML data. + + Public attributes: + + request: current request (mechanize.Request or urllib2.Request) + form: currently selected form (see .select_form()) + + """ + + handler_classes = copy.copy(UserAgentBase.handler_classes) + handler_classes["_referer"] = HTTPRefererProcessor + default_features = copy.copy(UserAgentBase.default_features) + default_features.append("_referer") + + def __init__(self, + factory=None, + history=None, + request_class=None, + ): + """ + + Only named arguments should be passed to this constructor. + + factory: object implementing the mechanize.Factory interface. + history: object implementing the mechanize.History interface. Note + this interface is still experimental and may change in future. + request_class: Request class to use. Defaults to mechanize.Request + by default for Pythons older than 2.4, urllib2.Request otherwise. + + The Factory and History objects passed in are 'owned' by the Browser, + so they should not be shared across Browsers. In particular, + factory.set_response() should not be called except by the owning + Browser itself. + + Note that the supplied factory's request_class is overridden by this + constructor, to ensure only one Request class is used. 
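+
+        A minimal usage sketch (the URL is illustrative and the page is
+        assumed to be HTML):
+
+          br = Browser()
+          br.open("http://example.com/")
+          print br.title()
+          for link in br.links():
+              print link.absolute_url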
+ + """ + self._handle_referer = True + + if history is None: + history = History() + self._history = history + + if request_class is None: + if not hasattr(urllib2.Request, "add_unredirected_header"): + request_class = _request.Request + else: + request_class = urllib2.Request # Python >= 2.4 + + if factory is None: + factory = DefaultFactory() + factory.set_request_class(request_class) + self._factory = factory + self.request_class = request_class + + self.request = None + self._set_response(None, False) + + # do this last to avoid __getattr__ problems + UserAgentBase.__init__(self) + + def close(self): + UserAgentBase.close(self) + if self._response is not None: + self._response.close() + if self._history is not None: + self._history.close() + self._history = None + + # make use after .close easy to spot + self.form = None + self.request = self._response = None + self.request = self.response = self.set_response = None + self.geturl = self.reload = self.back = None + self.clear_history = self.set_cookie = self.links = self.forms = None + self.viewing_html = self.encoding = self.title = None + self.select_form = self.click = self.submit = self.click_link = None + self.follow_link = self.find_link = None + + def set_handle_referer(self, handle): + """Set whether to add Referer header to each request. + + This base class does not implement this feature (so don't turn this on + if you're using this base class directly), but the subclass + mechanize.Browser does. + + """ + self._set_handler("_referer", handle) + self._handle_referer = bool(handle) + + def _add_referer_header(self, request, origin_request=True): + if self.request is None: + return request + scheme = request.get_type() + original_scheme = self.request.get_type() + if scheme not in ["http", "https"]: + return request + if not origin_request and not self.request.has_header("Referer"): + return request + + if (self._handle_referer and + original_scheme in ["http", "https"] and + not (original_scheme == "https" and scheme != "https")): + # strip URL fragment (RFC 2616 14.36) + parts = _rfc3986.urlsplit(self.request.get_full_url()) + parts = parts[:-1]+(None,) + referer = _rfc3986.urlunsplit(parts) + request.add_unredirected_header("Referer", referer) + return request + + def open_novisit(self, url, data=None): + """Open a URL without visiting it. + + The browser state (including .request, .response(), history, forms and + links) are all left unchanged by calling this function. + + The interface is the same as for .open(). + + This is useful for things like fetching images. + + See also .retrieve(). 
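+
+        For example (a sketch; the URL is illustrative), fetching an image
+        referenced by the current page without disturbing the browser state:
+
+          data = browser.open_novisit("http://example.com/logo.png").read()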
+ + """ + return self._mech_open(url, data, visit=False) + + def open(self, url, data=None): + return self._mech_open(url, data) + + def _mech_open(self, url, data=None, update_history=True, visit=None): + try: + url.get_full_url + except AttributeError: + # string URL -- convert to absolute URL if required + scheme, authority = _rfc3986.urlsplit(url)[:2] + if scheme is None: + # relative URL + if self._response is None: + raise BrowserStateError( + "can't fetch relative reference: " + "not viewing any document") + url = _rfc3986.urljoin(self._response.geturl(), url) + + request = self._request(url, data, visit) + visit = request.visit + if visit is None: + visit = True + + if visit: + self._visit_request(request, update_history) + + success = True + try: + response = UserAgentBase.open(self, request, data) + except urllib2.HTTPError, error: + success = False + if error.fp is None: # not a response + raise + response = error +## except (IOError, socket.error, OSError), error: +## # Yes, urllib2 really does raise all these :-(( +## # See test_urllib2.py for examples of socket.gaierror and OSError, +## # plus note that FTPHandler raises IOError. +## # XXX I don't seem to have an example of exactly socket.error being +## # raised, only socket.gaierror... +## # I don't want to start fixing these here, though, since this is a +## # subclass of OpenerDirector, and it would break old code. Even in +## # Python core, a fix would need some backwards-compat. hack to be +## # acceptable. +## raise + + if visit: + self._set_response(response, False) + response = copy.copy(self._response) + elif response is not None: + response = _response.upgrade_response(response) + + if not success: + raise response + return response + + def __str__(self): + text = [] + text.append("<%s " % self.__class__.__name__) + if self._response: + text.append("visiting %s" % self._response.geturl()) + else: + text.append("(not visiting a URL)") + if self.form: + text.append("\n selected form:\n %s\n" % str(self.form)) + text.append(">") + return "".join(text) + + def response(self): + """Return a copy of the current response. + + The returned object has the same interface as the object returned by + .open() (or urllib2.urlopen()). + + """ + return copy.copy(self._response) + + def set_response(self, response): + """Replace current response with (a copy of) response. + + response may be None. + + This is intended mostly for HTML-preprocessing. + """ + self._set_response(response, True) + + def _set_response(self, response, close_current): + # sanity check, necessary but far from sufficient + if not (response is None or + (hasattr(response, "info") and hasattr(response, "geturl") and + hasattr(response, "read") + ) + ): + raise ValueError("not a response object") + + self.form = None + if response is not None: + response = _response.upgrade_response(response) + if close_current and self._response is not None: + self._response.close() + self._response = response + self._factory.set_response(response) + + def visit_response(self, response, request=None): + """Visit the response, as if it had been .open()ed. + + Unlike .set_response(), this updates history rather than replacing the + current response. 
+ """ + if request is None: + request = _request.Request(response.geturl()) + self._visit_request(request, True) + self._set_response(response, False) + + def _visit_request(self, request, update_history): + if self._response is not None: + self._response.close() + if self.request is not None and update_history: + self._history.add(self.request, self._response) + self._response = None + # we want self.request to be assigned even if UserAgentBase.open + # fails + self.request = request + + def geturl(self): + """Get URL of current document.""" + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._response.geturl() + + def reload(self): + """Reload current document, and return response object.""" + if self.request is None: + raise BrowserStateError("no URL has yet been .open()ed") + if self._response is not None: + self._response.close() + return self._mech_open(self.request, update_history=False) + + def back(self, n=1): + """Go back n steps in history, and return response object. + + n: go back this number of steps (default 1 step) + + """ + if self._response is not None: + self._response.close() + self.request, response = self._history.back(n, self._response) + self.set_response(response) + if not response.read_complete: + return self.reload() + return copy.copy(response) + + def clear_history(self): + self._history.clear() + + def set_cookie(self, cookie_string): + """Request to set a cookie. + + Note that it is NOT necessary to call this method under ordinary + circumstances: cookie handling is normally entirely automatic. The + intended use case is rather to simulate the setting of a cookie by + client script in a web page (e.g. JavaScript). In that case, use of + this method is necessary because mechanize currently does not support + JavaScript, VBScript, etc. + + The cookie is added in the same way as if it had arrived with the + current response, as a result of the current request. This means that, + for example, it is not appropriate to set the cookie based on the + current request, no cookie will be set. + + The cookie will be returned automatically with subsequent responses + made by the Browser instance whenever that's appropriate. + + cookie_string should be a valid value of the Set-Cookie header. + + For example: + + browser.set_cookie( + "sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT") + + Currently, this method does not allow for adding RFC 2986 cookies. + This limitation will be lifted if anybody requests it. + + """ + if self._response is None: + raise BrowserStateError("not viewing any document") + if self.request.get_type() not in ["http", "https"]: + raise BrowserStateError("can't set cookie for non-HTTP/HTTPS " + "transactions") + cookiejar = self._ua_handlers["_cookies"].cookiejar + response = self.response() # copy + headers = response.info() + headers["Set-cookie"] = cookie_string + cookiejar.extract_cookies(response, self.request) + + def links(self, **kwds): + """Return iterable over links (mechanize.Link objects).""" + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + links = self._factory.links() + if kwds: + return self._filter_links(links, **kwds) + else: + return links + + def forms(self): + """Return iterable over forms. + + The returned form objects implement the ClientForm.HTMLForm interface. 
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ return self._factory.forms()
+
+ def global_form(self):
+ """Return the global form object, or None if the factory implementation
+ did not supply one.
+
+ The "global" form object contains all controls that are not descendants of
+ any FORM element.
+
+ The returned form object implements the ClientForm.HTMLForm interface.
+
+ This is a separate method since the global form is not regarded as part
+ of the sequence of forms in the document -- mostly for
+ backwards-compatibility.
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ return self._factory.global_form
+
+ def viewing_html(self):
+ """Return whether the current response contains HTML data."""
+ if self._response is None:
+ raise BrowserStateError("not viewing any document")
+ return self._factory.is_html
+
+ def encoding(self):
+ """Return the encoding of the current response document."""
+ if self._response is None:
+ raise BrowserStateError("not viewing any document")
+ return self._factory.encoding
+
+ def title(self):
+ """Return title, or None if there is no title element in the document.
+
+ Tags are stripped or textified as described in the docs for the
+ PullParser.get_text() method of the pullparser module.
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ return self._factory.title
+
+ def select_form(self, name=None, predicate=None, nr=None):
+ """Select an HTML form for input.
+
+ This is a bit like giving a form the "input focus" in a browser.
+
+ If a form is selected, the Browser object supports the HTMLForm
+ interface, so you can call methods like .set_value(), .set(), and
+ .click().
+
+ Another way to select a form is to assign to the .form attribute. The
+ form assigned should be one of the objects returned by the .forms()
+ method.
+
+ At least one of the name, predicate and nr arguments must be supplied.
+ If no matching form is found, mechanize.FormNotFoundError is raised.
+
+ If name is specified, then the form must have the indicated name.
+
+ If predicate is specified, then the form must match that function. The
+ predicate function is passed the HTMLForm as its single argument, and
+ should return a boolean value indicating whether the form matched.
+
+ nr, if supplied, is the sequence number of the form (where 0 is the
+ first). Note that form 0 is the first form that matches all the other
+ arguments (if supplied); it is not necessarily the first form in the
+ document.
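+
+ Illustrative sketches (the form name and id are made up):
+
+ br.select_form(name="login")
+ br.select_form(predicate=lambda f: f.attrs.get("id") == "search")
+ br.select_form(nr=0) # first form in the document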
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ if (name is None) and (predicate is None) and (nr is None):
+ raise ValueError(
+ "at least one argument must be supplied to specify form")
+
+ orig_nr = nr
+ for form in self.forms():
+ if name is not None and name != form.name:
+ continue
+ if predicate is not None and not predicate(form):
+ continue
+ if nr:
+ nr -= 1
+ continue
+ self.form = form
+ break # success
+ else:
+ # failure
+ description = []
+ if name is not None: description.append("name '%s'" % name)
+ if predicate is not None:
+ description.append("predicate %s" % predicate)
+ if orig_nr is not None: description.append("nr %d" % orig_nr)
+ description = ", ".join(description)
+ raise FormNotFoundError("no form matching "+description)
+
+ def click(self, *args, **kwds):
+ """See ClientForm.HTMLForm.click for documentation."""
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ request = self.form.click(*args, **kwds)
+ return self._add_referer_header(request)
+
+ def submit(self, *args, **kwds):
+ """Submit current form.
+
+ Arguments are as for ClientForm.HTMLForm.click().
+
+ Return value is same as for Browser.open().
+
+ """
+ return self.open(self.click(*args, **kwds))
+
+ def click_link(self, link=None, **kwds):
+ """Find a link and return a Request object for it.
+
+ Arguments are as for .find_link(), except that a link may be supplied
+ as the first argument.
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ if not link:
+ link = self.find_link(**kwds)
+ else:
+ if kwds:
+ raise ValueError(
+ "either pass a Link, or keyword arguments, not both")
+ request = self.request_class(link.absolute_url)
+ return self._add_referer_header(request)
+
+ def follow_link(self, link=None, **kwds):
+ """Find a link and .open() it.
+
+ Arguments are as for .click_link().
+
+ Return value is same as for Browser.open().
+
+ """
+ return self.open(self.click_link(link, **kwds))
+
+ def find_link(self, **kwds):
+ """Find a link in current page.
+
+ Links are returned as mechanize.Link objects.
+
+ # Return third link that .search()-matches the regexp "python"
+ # (by ".search()-matches", I mean that the regular expression method
+ # .search() is used, rather than .match()).
+ find_link(text_regex=re.compile("python"), nr=2)
+
+ # Return first http link in the current page that points to somewhere
+ # on python.org whose link text (after tags have been removed) is
+ # exactly "monty python".
+ find_link(text="monty python",
+ url_regex=re.compile("http.*python.org"))
+
+ # Return first link with exactly three HTML attributes.
+ find_link(predicate=lambda link: len(link.attrs) == 3)
+
+ Links include anchors (<a>), image maps (<area>), and frames (<frame>,