mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
IGN:Use patched mechanize implementation that correctly closes connections
This commit is contained in:
parent
e7c7cc64eb
commit
6fee09b9d2
@ -2,7 +2,7 @@
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
import sys, os, re, logging, time, subprocess, mechanize, atexit
|
import sys, os, re, logging, time, subprocess, atexit
|
||||||
from htmlentitydefs import name2codepoint
|
from htmlentitydefs import name2codepoint
|
||||||
from math import floor
|
from math import floor
|
||||||
from logging import Formatter
|
from logging import Formatter
|
||||||
@ -14,7 +14,7 @@ from calibre.constants import iswindows, isosx, islinux, isfrozen, \
|
|||||||
terminal_controller, preferred_encoding, \
|
terminal_controller, preferred_encoding, \
|
||||||
__appname__, __version__, __author__, \
|
__appname__, __version__, __author__, \
|
||||||
win32event, win32api, winerror, fcntl
|
win32event, win32api, winerror, fcntl
|
||||||
|
from calibre.utils import mechanize
|
||||||
|
|
||||||
def unicode_path(path, abs=False):
|
def unicode_path(path, abs=False):
|
||||||
if not isinstance(path, unicode):
|
if not isinstance(path, unicode):
|
||||||
|
125
src/calibre/utils/mechanize/__init__.py
Normal file
125
src/calibre/utils/mechanize/__init__.py
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
__all__ = [
|
||||||
|
'AbstractBasicAuthHandler',
|
||||||
|
'AbstractDigestAuthHandler',
|
||||||
|
'BaseHandler',
|
||||||
|
'Browser',
|
||||||
|
'BrowserStateError',
|
||||||
|
'CacheFTPHandler',
|
||||||
|
'ContentTooShortError',
|
||||||
|
'Cookie',
|
||||||
|
'CookieJar',
|
||||||
|
'CookiePolicy',
|
||||||
|
'DefaultCookiePolicy',
|
||||||
|
'DefaultFactory',
|
||||||
|
'FTPHandler',
|
||||||
|
'Factory',
|
||||||
|
'FileCookieJar',
|
||||||
|
'FileHandler',
|
||||||
|
'FormNotFoundError',
|
||||||
|
'FormsFactory',
|
||||||
|
'GopherError',
|
||||||
|
'GopherHandler',
|
||||||
|
'HTTPBasicAuthHandler',
|
||||||
|
'HTTPCookieProcessor',
|
||||||
|
'HTTPDefaultErrorHandler',
|
||||||
|
'HTTPDigestAuthHandler',
|
||||||
|
'HTTPEquivProcessor',
|
||||||
|
'HTTPError',
|
||||||
|
'HTTPErrorProcessor',
|
||||||
|
'HTTPHandler',
|
||||||
|
'HTTPPasswordMgr',
|
||||||
|
'HTTPPasswordMgrWithDefaultRealm',
|
||||||
|
'HTTPProxyPasswordMgr',
|
||||||
|
'HTTPRedirectDebugProcessor',
|
||||||
|
'HTTPRedirectHandler',
|
||||||
|
'HTTPRefererProcessor',
|
||||||
|
'HTTPRefreshProcessor',
|
||||||
|
'HTTPRequestUpgradeProcessor',
|
||||||
|
'HTTPResponseDebugProcessor',
|
||||||
|
'HTTPRobotRulesProcessor',
|
||||||
|
'HTTPSClientCertMgr',
|
||||||
|
'HTTPSHandler',
|
||||||
|
'HeadParser',
|
||||||
|
'History',
|
||||||
|
'LWPCookieJar',
|
||||||
|
'Link',
|
||||||
|
'LinkNotFoundError',
|
||||||
|
'LinksFactory',
|
||||||
|
'LoadError',
|
||||||
|
'MSIECookieJar',
|
||||||
|
'MozillaCookieJar',
|
||||||
|
'OpenerDirector',
|
||||||
|
'OpenerFactory',
|
||||||
|
'ParseError',
|
||||||
|
'ProxyBasicAuthHandler',
|
||||||
|
'ProxyDigestAuthHandler',
|
||||||
|
'ProxyHandler',
|
||||||
|
'Request',
|
||||||
|
'ResponseUpgradeProcessor',
|
||||||
|
'RobotExclusionError',
|
||||||
|
'RobustFactory',
|
||||||
|
'RobustFormsFactory',
|
||||||
|
'RobustLinksFactory',
|
||||||
|
'RobustTitleFactory',
|
||||||
|
'SeekableProcessor',
|
||||||
|
'SeekableResponseOpener',
|
||||||
|
'TitleFactory',
|
||||||
|
'URLError',
|
||||||
|
'USE_BARE_EXCEPT',
|
||||||
|
'UnknownHandler',
|
||||||
|
'UserAgent',
|
||||||
|
'UserAgentBase',
|
||||||
|
'XHTMLCompatibleHeadParser',
|
||||||
|
'__version__',
|
||||||
|
'build_opener',
|
||||||
|
'install_opener',
|
||||||
|
'lwp_cookie_str',
|
||||||
|
'make_response',
|
||||||
|
'request_host',
|
||||||
|
'response_seek_wrapper', # XXX deprecate in public interface?
|
||||||
|
'seek_wrapped_response' # XXX should probably use this internally in place of response_seek_wrapper()
|
||||||
|
'str2time',
|
||||||
|
'urlopen',
|
||||||
|
'urlretrieve']
|
||||||
|
|
||||||
|
from _mechanize import __version__
|
||||||
|
|
||||||
|
# high-level stateful browser-style interface
|
||||||
|
from _mechanize import \
|
||||||
|
Browser, History, \
|
||||||
|
BrowserStateError, LinkNotFoundError, FormNotFoundError
|
||||||
|
|
||||||
|
# configurable URL-opener interface
|
||||||
|
from _useragent import UserAgentBase, UserAgent
|
||||||
|
from _html import \
|
||||||
|
ParseError, \
|
||||||
|
Link, \
|
||||||
|
Factory, DefaultFactory, RobustFactory, \
|
||||||
|
FormsFactory, LinksFactory, TitleFactory, \
|
||||||
|
RobustFormsFactory, RobustLinksFactory, RobustTitleFactory
|
||||||
|
|
||||||
|
# urllib2 work-alike interface (part from mechanize, part from urllib2)
|
||||||
|
# This is a superset of the urllib2 interface.
|
||||||
|
from _urllib2 import *
|
||||||
|
|
||||||
|
# misc
|
||||||
|
from _opener import ContentTooShortError, OpenerFactory, urlretrieve
|
||||||
|
from _util import http2time as str2time
|
||||||
|
from _response import \
|
||||||
|
response_seek_wrapper, seek_wrapped_response, make_response
|
||||||
|
from _http import HeadParser
|
||||||
|
try:
|
||||||
|
from _http import XHTMLCompatibleHeadParser
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# cookies
|
||||||
|
from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \
|
||||||
|
CookieJar, FileCookieJar, LoadError, request_host
|
||||||
|
from _lwpcookiejar import LWPCookieJar, lwp_cookie_str
|
||||||
|
from _mozillacookiejar import MozillaCookieJar
|
||||||
|
from _msiecookiejar import MSIECookieJar
|
||||||
|
|
||||||
|
# If you hate the idea of turning bugs into warnings, do:
|
||||||
|
# import mechanize; mechanize.USE_BARE_EXCEPT = False
|
||||||
|
USE_BARE_EXCEPT = True
|
500
src/calibre/utils/mechanize/_auth.py
Normal file
500
src/calibre/utils/mechanize/_auth.py
Normal file
@ -0,0 +1,500 @@
|
|||||||
|
"""HTTP Authentication and Proxy support.
|
||||||
|
|
||||||
|
All but HTTPProxyPasswordMgr come from Python 2.5.
|
||||||
|
|
||||||
|
|
||||||
|
Copyright 2006 John J. Lee <jjl@pobox.com>
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it under
|
||||||
|
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
||||||
|
included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re, base64, urlparse, posixpath, md5, sha, sys, copy
|
||||||
|
|
||||||
|
from urllib2 import BaseHandler
|
||||||
|
from urllib import getproxies, unquote, splittype, splituser, splitpasswd, \
|
||||||
|
splitport
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_proxy(proxy):
|
||||||
|
"""Return (scheme, user, password, host/port) given a URL or an authority.
|
||||||
|
|
||||||
|
If a URL is supplied, it must have an authority (host:port) component.
|
||||||
|
According to RFC 3986, having an authority component means the URL must
|
||||||
|
have two slashes after the scheme:
|
||||||
|
|
||||||
|
>>> _parse_proxy('file:/ftp.example.com/')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
|
||||||
|
|
||||||
|
The first three items of the returned tuple may be None.
|
||||||
|
|
||||||
|
Examples of authority parsing:
|
||||||
|
|
||||||
|
>>> _parse_proxy('proxy.example.com')
|
||||||
|
(None, None, None, 'proxy.example.com')
|
||||||
|
>>> _parse_proxy('proxy.example.com:3128')
|
||||||
|
(None, None, None, 'proxy.example.com:3128')
|
||||||
|
|
||||||
|
The authority component may optionally include userinfo (assumed to be
|
||||||
|
username:password):
|
||||||
|
|
||||||
|
>>> _parse_proxy('joe:password@proxy.example.com')
|
||||||
|
(None, 'joe', 'password', 'proxy.example.com')
|
||||||
|
>>> _parse_proxy('joe:password@proxy.example.com:3128')
|
||||||
|
(None, 'joe', 'password', 'proxy.example.com:3128')
|
||||||
|
|
||||||
|
Same examples, but with URLs instead:
|
||||||
|
|
||||||
|
>>> _parse_proxy('http://proxy.example.com/')
|
||||||
|
('http', None, None, 'proxy.example.com')
|
||||||
|
>>> _parse_proxy('http://proxy.example.com:3128/')
|
||||||
|
('http', None, None, 'proxy.example.com:3128')
|
||||||
|
>>> _parse_proxy('http://joe:password@proxy.example.com/')
|
||||||
|
('http', 'joe', 'password', 'proxy.example.com')
|
||||||
|
>>> _parse_proxy('http://joe:password@proxy.example.com:3128')
|
||||||
|
('http', 'joe', 'password', 'proxy.example.com:3128')
|
||||||
|
|
||||||
|
Everything after the authority is ignored:
|
||||||
|
|
||||||
|
>>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
|
||||||
|
('ftp', 'joe', 'password', 'proxy.example.com')
|
||||||
|
|
||||||
|
Test for no trailing '/' case:
|
||||||
|
|
||||||
|
>>> _parse_proxy('http://joe:password@proxy.example.com')
|
||||||
|
('http', 'joe', 'password', 'proxy.example.com')
|
||||||
|
|
||||||
|
"""
|
||||||
|
scheme, r_scheme = splittype(proxy)
|
||||||
|
if not r_scheme.startswith("/"):
|
||||||
|
# authority
|
||||||
|
scheme = None
|
||||||
|
authority = proxy
|
||||||
|
else:
|
||||||
|
# URL
|
||||||
|
if not r_scheme.startswith("//"):
|
||||||
|
raise ValueError("proxy URL with no authority: %r" % proxy)
|
||||||
|
# We have an authority, so for RFC 3986-compliant URLs (by ss 3.
|
||||||
|
# and 3.3.), path is empty or starts with '/'
|
||||||
|
end = r_scheme.find("/", 2)
|
||||||
|
if end == -1:
|
||||||
|
end = None
|
||||||
|
authority = r_scheme[2:end]
|
||||||
|
userinfo, hostport = splituser(authority)
|
||||||
|
if userinfo is not None:
|
||||||
|
user, password = splitpasswd(userinfo)
|
||||||
|
else:
|
||||||
|
user = password = None
|
||||||
|
return scheme, user, password, hostport
|
||||||
|
|
||||||
|
class ProxyHandler(BaseHandler):
|
||||||
|
# Proxies must be in front
|
||||||
|
handler_order = 100
|
||||||
|
|
||||||
|
def __init__(self, proxies=None):
|
||||||
|
if proxies is None:
|
||||||
|
proxies = getproxies()
|
||||||
|
assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
|
||||||
|
self.proxies = proxies
|
||||||
|
for type, url in proxies.items():
|
||||||
|
setattr(self, '%s_open' % type,
|
||||||
|
lambda r, proxy=url, type=type, meth=self.proxy_open: \
|
||||||
|
meth(r, proxy, type))
|
||||||
|
|
||||||
|
def proxy_open(self, req, proxy, type):
|
||||||
|
orig_type = req.get_type()
|
||||||
|
proxy_type, user, password, hostport = _parse_proxy(proxy)
|
||||||
|
if proxy_type is None:
|
||||||
|
proxy_type = orig_type
|
||||||
|
if user and password:
|
||||||
|
user_pass = '%s:%s' % (unquote(user), unquote(password))
|
||||||
|
creds = base64.encodestring(user_pass).strip()
|
||||||
|
req.add_header('Proxy-authorization', 'Basic ' + creds)
|
||||||
|
hostport = unquote(hostport)
|
||||||
|
req.set_proxy(hostport, proxy_type)
|
||||||
|
if orig_type == proxy_type:
|
||||||
|
# let other handlers take care of it
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
# need to start over, because the other handlers don't
|
||||||
|
# grok the proxy's URL type
|
||||||
|
# e.g. if we have a constructor arg proxies like so:
|
||||||
|
# {'http': 'ftp://proxy.example.com'}, we may end up turning
|
||||||
|
# a request for http://acme.example.com/a into one for
|
||||||
|
# ftp://proxy.example.com/a
|
||||||
|
return self.parent.open(req)
|
||||||
|
|
||||||
|
class HTTPPasswordMgr:
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.passwd = {}
|
||||||
|
|
||||||
|
def add_password(self, realm, uri, user, passwd):
|
||||||
|
# uri could be a single URI or a sequence
|
||||||
|
if isinstance(uri, basestring):
|
||||||
|
uri = [uri]
|
||||||
|
if not realm in self.passwd:
|
||||||
|
self.passwd[realm] = {}
|
||||||
|
for default_port in True, False:
|
||||||
|
reduced_uri = tuple(
|
||||||
|
[self.reduce_uri(u, default_port) for u in uri])
|
||||||
|
self.passwd[realm][reduced_uri] = (user, passwd)
|
||||||
|
|
||||||
|
def find_user_password(self, realm, authuri):
|
||||||
|
domains = self.passwd.get(realm, {})
|
||||||
|
for default_port in True, False:
|
||||||
|
reduced_authuri = self.reduce_uri(authuri, default_port)
|
||||||
|
for uris, authinfo in domains.iteritems():
|
||||||
|
for uri in uris:
|
||||||
|
if self.is_suburi(uri, reduced_authuri):
|
||||||
|
return authinfo
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
def reduce_uri(self, uri, default_port=True):
|
||||||
|
"""Accept authority or URI and extract only the authority and path."""
|
||||||
|
# note HTTP URLs do not have a userinfo component
|
||||||
|
parts = urlparse.urlsplit(uri)
|
||||||
|
if parts[1]:
|
||||||
|
# URI
|
||||||
|
scheme = parts[0]
|
||||||
|
authority = parts[1]
|
||||||
|
path = parts[2] or '/'
|
||||||
|
else:
|
||||||
|
# host or host:port
|
||||||
|
scheme = None
|
||||||
|
authority = uri
|
||||||
|
path = '/'
|
||||||
|
host, port = splitport(authority)
|
||||||
|
if default_port and port is None and scheme is not None:
|
||||||
|
dport = {"http": 80,
|
||||||
|
"https": 443,
|
||||||
|
}.get(scheme)
|
||||||
|
if dport is not None:
|
||||||
|
authority = "%s:%d" % (host, dport)
|
||||||
|
return authority, path
|
||||||
|
|
||||||
|
def is_suburi(self, base, test):
|
||||||
|
"""Check if test is below base in a URI tree
|
||||||
|
|
||||||
|
Both args must be URIs in reduced form.
|
||||||
|
"""
|
||||||
|
if base == test:
|
||||||
|
return True
|
||||||
|
if base[0] != test[0]:
|
||||||
|
return False
|
||||||
|
common = posixpath.commonprefix((base[1], test[1]))
|
||||||
|
if len(common) == len(base[1]):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
|
||||||
|
|
||||||
|
def find_user_password(self, realm, authuri):
|
||||||
|
user, password = HTTPPasswordMgr.find_user_password(self, realm,
|
||||||
|
authuri)
|
||||||
|
if user is not None:
|
||||||
|
return user, password
|
||||||
|
return HTTPPasswordMgr.find_user_password(self, None, authuri)
|
||||||
|
|
||||||
|
|
||||||
|
class AbstractBasicAuthHandler:
|
||||||
|
|
||||||
|
rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
|
||||||
|
|
||||||
|
# XXX there can actually be multiple auth-schemes in a
|
||||||
|
# www-authenticate header. should probably be a lot more careful
|
||||||
|
# in parsing them to extract multiple alternatives
|
||||||
|
|
||||||
|
def __init__(self, password_mgr=None):
|
||||||
|
if password_mgr is None:
|
||||||
|
password_mgr = HTTPPasswordMgr()
|
||||||
|
self.passwd = password_mgr
|
||||||
|
self.add_password = self.passwd.add_password
|
||||||
|
|
||||||
|
def http_error_auth_reqed(self, authreq, host, req, headers):
|
||||||
|
# host may be an authority (without userinfo) or a URL with an
|
||||||
|
# authority
|
||||||
|
# XXX could be multiple headers
|
||||||
|
authreq = headers.get(authreq, None)
|
||||||
|
if authreq:
|
||||||
|
mo = AbstractBasicAuthHandler.rx.search(authreq)
|
||||||
|
if mo:
|
||||||
|
scheme, realm = mo.groups()
|
||||||
|
if scheme.lower() == 'basic':
|
||||||
|
return self.retry_http_basic_auth(host, req, realm)
|
||||||
|
|
||||||
|
def retry_http_basic_auth(self, host, req, realm):
|
||||||
|
user, pw = self.passwd.find_user_password(realm, host)
|
||||||
|
if pw is not None:
|
||||||
|
raw = "%s:%s" % (user, pw)
|
||||||
|
auth = 'Basic %s' % base64.encodestring(raw).strip()
|
||||||
|
if req.headers.get(self.auth_header, None) == auth:
|
||||||
|
return None
|
||||||
|
newreq = copy.copy(req)
|
||||||
|
newreq.add_header(self.auth_header, auth)
|
||||||
|
newreq.visit = False
|
||||||
|
return self.parent.open(newreq)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
|
||||||
|
|
||||||
|
auth_header = 'Authorization'
|
||||||
|
|
||||||
|
def http_error_401(self, req, fp, code, msg, headers):
|
||||||
|
url = req.get_full_url()
|
||||||
|
return self.http_error_auth_reqed('www-authenticate',
|
||||||
|
url, req, headers)
|
||||||
|
|
||||||
|
|
||||||
|
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
|
||||||
|
|
||||||
|
auth_header = 'Proxy-authorization'
|
||||||
|
|
||||||
|
def http_error_407(self, req, fp, code, msg, headers):
|
||||||
|
# http_error_auth_reqed requires that there is no userinfo component in
|
||||||
|
# authority. Assume there isn't one, since urllib2 does not (and
|
||||||
|
# should not, RFC 3986 s. 3.2.1) support requests for URLs containing
|
||||||
|
# userinfo.
|
||||||
|
authority = req.get_host()
|
||||||
|
return self.http_error_auth_reqed('proxy-authenticate',
|
||||||
|
authority, req, headers)
|
||||||
|
|
||||||
|
|
||||||
|
def randombytes(n):
|
||||||
|
"""Return n random bytes."""
|
||||||
|
# Use /dev/urandom if it is available. Fall back to random module
|
||||||
|
# if not. It might be worthwhile to extend this function to use
|
||||||
|
# other platform-specific mechanisms for getting random bytes.
|
||||||
|
if os.path.exists("/dev/urandom"):
|
||||||
|
f = open("/dev/urandom")
|
||||||
|
s = f.read(n)
|
||||||
|
f.close()
|
||||||
|
return s
|
||||||
|
else:
|
||||||
|
L = [chr(random.randrange(0, 256)) for i in range(n)]
|
||||||
|
return "".join(L)
|
||||||
|
|
||||||
|
class AbstractDigestAuthHandler:
|
||||||
|
# Digest authentication is specified in RFC 2617.
|
||||||
|
|
||||||
|
# XXX The client does not inspect the Authentication-Info header
|
||||||
|
# in a successful response.
|
||||||
|
|
||||||
|
# XXX It should be possible to test this implementation against
|
||||||
|
# a mock server that just generates a static set of challenges.
|
||||||
|
|
||||||
|
# XXX qop="auth-int" supports is shaky
|
||||||
|
|
||||||
|
def __init__(self, passwd=None):
|
||||||
|
if passwd is None:
|
||||||
|
passwd = HTTPPasswordMgr()
|
||||||
|
self.passwd = passwd
|
||||||
|
self.add_password = self.passwd.add_password
|
||||||
|
self.retried = 0
|
||||||
|
self.nonce_count = 0
|
||||||
|
|
||||||
|
def reset_retry_count(self):
|
||||||
|
self.retried = 0
|
||||||
|
|
||||||
|
def http_error_auth_reqed(self, auth_header, host, req, headers):
|
||||||
|
authreq = headers.get(auth_header, None)
|
||||||
|
if self.retried > 5:
|
||||||
|
# Don't fail endlessly - if we failed once, we'll probably
|
||||||
|
# fail a second time. Hm. Unless the Password Manager is
|
||||||
|
# prompting for the information. Crap. This isn't great
|
||||||
|
# but it's better than the current 'repeat until recursion
|
||||||
|
# depth exceeded' approach <wink>
|
||||||
|
raise HTTPError(req.get_full_url(), 401, "digest auth failed",
|
||||||
|
headers, None)
|
||||||
|
else:
|
||||||
|
self.retried += 1
|
||||||
|
if authreq:
|
||||||
|
scheme = authreq.split()[0]
|
||||||
|
if scheme.lower() == 'digest':
|
||||||
|
return self.retry_http_digest_auth(req, authreq)
|
||||||
|
|
||||||
|
def retry_http_digest_auth(self, req, auth):
|
||||||
|
token, challenge = auth.split(' ', 1)
|
||||||
|
chal = parse_keqv_list(parse_http_list(challenge))
|
||||||
|
auth = self.get_authorization(req, chal)
|
||||||
|
if auth:
|
||||||
|
auth_val = 'Digest %s' % auth
|
||||||
|
if req.headers.get(self.auth_header, None) == auth_val:
|
||||||
|
return None
|
||||||
|
newreq = copy.copy(req)
|
||||||
|
newreq.add_unredirected_header(self.auth_header, auth_val)
|
||||||
|
newreq.visit = False
|
||||||
|
return self.parent.open(newreq)
|
||||||
|
|
||||||
|
def get_cnonce(self, nonce):
|
||||||
|
# The cnonce-value is an opaque
|
||||||
|
# quoted string value provided by the client and used by both client
|
||||||
|
# and server to avoid chosen plaintext attacks, to provide mutual
|
||||||
|
# authentication, and to provide some message integrity protection.
|
||||||
|
# This isn't a fabulous effort, but it's probably Good Enough.
|
||||||
|
dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
|
||||||
|
randombytes(8))).hexdigest()
|
||||||
|
return dig[:16]
|
||||||
|
|
||||||
|
def get_authorization(self, req, chal):
|
||||||
|
try:
|
||||||
|
realm = chal['realm']
|
||||||
|
nonce = chal['nonce']
|
||||||
|
qop = chal.get('qop')
|
||||||
|
algorithm = chal.get('algorithm', 'MD5')
|
||||||
|
# mod_digest doesn't send an opaque, even though it isn't
|
||||||
|
# supposed to be optional
|
||||||
|
opaque = chal.get('opaque', None)
|
||||||
|
except KeyError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
H, KD = self.get_algorithm_impls(algorithm)
|
||||||
|
if H is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
user, pw = self.passwd.find_user_password(realm, req.get_full_url())
|
||||||
|
if user is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# XXX not implemented yet
|
||||||
|
if req.has_data():
|
||||||
|
entdig = self.get_entity_digest(req.get_data(), chal)
|
||||||
|
else:
|
||||||
|
entdig = None
|
||||||
|
|
||||||
|
A1 = "%s:%s:%s" % (user, realm, pw)
|
||||||
|
A2 = "%s:%s" % (req.get_method(),
|
||||||
|
# XXX selector: what about proxies and full urls
|
||||||
|
req.get_selector())
|
||||||
|
if qop == 'auth':
|
||||||
|
self.nonce_count += 1
|
||||||
|
ncvalue = '%08x' % self.nonce_count
|
||||||
|
cnonce = self.get_cnonce(nonce)
|
||||||
|
noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
|
||||||
|
respdig = KD(H(A1), noncebit)
|
||||||
|
elif qop is None:
|
||||||
|
respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
|
||||||
|
else:
|
||||||
|
# XXX handle auth-int.
|
||||||
|
pass
|
||||||
|
|
||||||
|
# XXX should the partial digests be encoded too?
|
||||||
|
|
||||||
|
base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
|
||||||
|
'response="%s"' % (user, realm, nonce, req.get_selector(),
|
||||||
|
respdig)
|
||||||
|
if opaque:
|
||||||
|
base += ', opaque="%s"' % opaque
|
||||||
|
if entdig:
|
||||||
|
base += ', digest="%s"' % entdig
|
||||||
|
base += ', algorithm="%s"' % algorithm
|
||||||
|
if qop:
|
||||||
|
base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
|
||||||
|
return base
|
||||||
|
|
||||||
|
def get_algorithm_impls(self, algorithm):
|
||||||
|
# lambdas assume digest modules are imported at the top level
|
||||||
|
if algorithm == 'MD5':
|
||||||
|
H = lambda x: md5.new(x).hexdigest()
|
||||||
|
elif algorithm == 'SHA':
|
||||||
|
H = lambda x: sha.new(x).hexdigest()
|
||||||
|
# XXX MD5-sess
|
||||||
|
KD = lambda s, d: H("%s:%s" % (s, d))
|
||||||
|
return H, KD
|
||||||
|
|
||||||
|
def get_entity_digest(self, data, chal):
|
||||||
|
# XXX not implemented yet
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
|
||||||
|
"""An authentication protocol defined by RFC 2069
|
||||||
|
|
||||||
|
Digest authentication improves on basic authentication because it
|
||||||
|
does not transmit passwords in the clear.
|
||||||
|
"""
|
||||||
|
|
||||||
|
auth_header = 'Authorization'
|
||||||
|
handler_order = 490
|
||||||
|
|
||||||
|
def http_error_401(self, req, fp, code, msg, headers):
|
||||||
|
host = urlparse.urlparse(req.get_full_url())[1]
|
||||||
|
retry = self.http_error_auth_reqed('www-authenticate',
|
||||||
|
host, req, headers)
|
||||||
|
self.reset_retry_count()
|
||||||
|
return retry
|
||||||
|
|
||||||
|
|
||||||
|
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
|
||||||
|
|
||||||
|
auth_header = 'Proxy-Authorization'
|
||||||
|
handler_order = 490
|
||||||
|
|
||||||
|
def http_error_407(self, req, fp, code, msg, headers):
|
||||||
|
host = req.get_host()
|
||||||
|
retry = self.http_error_auth_reqed('proxy-authenticate',
|
||||||
|
host, req, headers)
|
||||||
|
self.reset_retry_count()
|
||||||
|
return retry
|
||||||
|
|
||||||
|
|
||||||
|
# XXX ugly implementation, should probably not bother deriving
|
||||||
|
class HTTPProxyPasswordMgr(HTTPPasswordMgr):
|
||||||
|
# has default realm and host/port
|
||||||
|
def add_password(self, realm, uri, user, passwd):
|
||||||
|
# uri could be a single URI or a sequence
|
||||||
|
if uri is None or isinstance(uri, basestring):
|
||||||
|
uris = [uri]
|
||||||
|
else:
|
||||||
|
uris = uri
|
||||||
|
passwd_by_domain = self.passwd.setdefault(realm, {})
|
||||||
|
for uri in uris:
|
||||||
|
for default_port in True, False:
|
||||||
|
reduced_uri = self.reduce_uri(uri, default_port)
|
||||||
|
passwd_by_domain[reduced_uri] = (user, passwd)
|
||||||
|
|
||||||
|
def find_user_password(self, realm, authuri):
|
||||||
|
attempts = [(realm, authuri), (None, authuri)]
|
||||||
|
# bleh, want default realm to take precedence over default
|
||||||
|
# URI/authority, hence this outer loop
|
||||||
|
for default_uri in False, True:
|
||||||
|
for realm, authuri in attempts:
|
||||||
|
authinfo_by_domain = self.passwd.get(realm, {})
|
||||||
|
for default_port in True, False:
|
||||||
|
reduced_authuri = self.reduce_uri(authuri, default_port)
|
||||||
|
for uri, authinfo in authinfo_by_domain.iteritems():
|
||||||
|
if uri is None and not default_uri:
|
||||||
|
continue
|
||||||
|
if self.is_suburi(uri, reduced_authuri):
|
||||||
|
return authinfo
|
||||||
|
user, password = None, None
|
||||||
|
|
||||||
|
if user is not None:
|
||||||
|
break
|
||||||
|
return user, password
|
||||||
|
|
||||||
|
def reduce_uri(self, uri, default_port=True):
|
||||||
|
if uri is None:
|
||||||
|
return None
|
||||||
|
return HTTPPasswordMgr.reduce_uri(self, uri, default_port)
|
||||||
|
|
||||||
|
def is_suburi(self, base, test):
|
||||||
|
if base is None:
|
||||||
|
# default to the proxy's host/port
|
||||||
|
hostport, path = test
|
||||||
|
base = (hostport, "/")
|
||||||
|
return HTTPPasswordMgr.is_suburi(self, base, test)
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPSClientCertMgr(HTTPPasswordMgr):
|
||||||
|
# implementation inheritance: this is not a proper subclass
|
||||||
|
def add_key_cert(self, uri, key_file, cert_file):
|
||||||
|
self.add_password(None, uri, key_file, cert_file)
|
||||||
|
def find_key_cert(self, authuri):
|
||||||
|
return HTTPPasswordMgr.find_user_password(self, None, authuri)
|
1080
src/calibre/utils/mechanize/_beautifulsoup.py
Normal file
1080
src/calibre/utils/mechanize/_beautifulsoup.py
Normal file
File diff suppressed because it is too large
Load Diff
1651
src/calibre/utils/mechanize/_clientcookie.py
Normal file
1651
src/calibre/utils/mechanize/_clientcookie.py
Normal file
File diff suppressed because it is too large
Load Diff
28
src/calibre/utils/mechanize/_debug.py
Normal file
28
src/calibre/utils/mechanize/_debug.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
from urllib2 import BaseHandler
|
||||||
|
from _response import response_seek_wrapper
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPResponseDebugProcessor(BaseHandler):
|
||||||
|
handler_order = 900 # before redirections, after everything else
|
||||||
|
|
||||||
|
def http_response(self, request, response):
|
||||||
|
if not hasattr(response, "seek"):
|
||||||
|
response = response_seek_wrapper(response)
|
||||||
|
info = logging.getLogger("mechanize.http_responses").info
|
||||||
|
try:
|
||||||
|
info(response.read())
|
||||||
|
finally:
|
||||||
|
response.seek(0)
|
||||||
|
info("*****************************************************")
|
||||||
|
return response
|
||||||
|
|
||||||
|
https_response = http_response
|
||||||
|
|
||||||
|
class HTTPRedirectDebugProcessor(BaseHandler):
|
||||||
|
def http_request(self, request):
|
||||||
|
if hasattr(request, "redirect_dict"):
|
||||||
|
info = logging.getLogger("mechanize.http_redirects").info
|
||||||
|
info("redirecting to %s", request.get_full_url())
|
||||||
|
return request
|
103
src/calibre/utils/mechanize/_gzip.py
Normal file
103
src/calibre/utils/mechanize/_gzip.py
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
import urllib2
|
||||||
|
from cStringIO import StringIO
|
||||||
|
import _response
|
||||||
|
|
||||||
|
# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library
|
||||||
|
class GzipConsumer:
|
||||||
|
|
||||||
|
def __init__(self, consumer):
|
||||||
|
self.__consumer = consumer
|
||||||
|
self.__decoder = None
|
||||||
|
self.__data = ""
|
||||||
|
|
||||||
|
def __getattr__(self, key):
|
||||||
|
return getattr(self.__consumer, key)
|
||||||
|
|
||||||
|
def feed(self, data):
|
||||||
|
if self.__decoder is None:
|
||||||
|
# check if we have a full gzip header
|
||||||
|
data = self.__data + data
|
||||||
|
try:
|
||||||
|
i = 10
|
||||||
|
flag = ord(data[3])
|
||||||
|
if flag & 4: # extra
|
||||||
|
x = ord(data[i]) + 256*ord(data[i+1])
|
||||||
|
i = i + 2 + x
|
||||||
|
if flag & 8: # filename
|
||||||
|
while ord(data[i]):
|
||||||
|
i = i + 1
|
||||||
|
i = i + 1
|
||||||
|
if flag & 16: # comment
|
||||||
|
while ord(data[i]):
|
||||||
|
i = i + 1
|
||||||
|
i = i + 1
|
||||||
|
if flag & 2: # crc
|
||||||
|
i = i + 2
|
||||||
|
if len(data) < i:
|
||||||
|
raise IndexError("not enough data")
|
||||||
|
if data[:3] != "\x1f\x8b\x08":
|
||||||
|
raise IOError("invalid gzip data")
|
||||||
|
data = data[i:]
|
||||||
|
except IndexError:
|
||||||
|
self.__data = data
|
||||||
|
return # need more data
|
||||||
|
import zlib
|
||||||
|
self.__data = ""
|
||||||
|
self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS)
|
||||||
|
data = self.__decoder.decompress(data)
|
||||||
|
if data:
|
||||||
|
self.__consumer.feed(data)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
if self.__decoder:
|
||||||
|
data = self.__decoder.flush()
|
||||||
|
if data:
|
||||||
|
self.__consumer.feed(data)
|
||||||
|
self.__consumer.close()
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------
|
||||||
|
|
||||||
|
# the rest of this module is John Lee's stupid code, not
|
||||||
|
# Fredrik's nice code :-)
|
||||||
|
|
||||||
|
class stupid_gzip_consumer:
|
||||||
|
def __init__(self): self.data = []
|
||||||
|
def feed(self, data): self.data.append(data)
|
||||||
|
|
||||||
|
class stupid_gzip_wrapper(_response.closeable_response):
|
||||||
|
def __init__(self, response):
|
||||||
|
self._response = response
|
||||||
|
|
||||||
|
c = stupid_gzip_consumer()
|
||||||
|
gzc = GzipConsumer(c)
|
||||||
|
gzc.feed(response.read())
|
||||||
|
self.__data = StringIO("".join(c.data))
|
||||||
|
|
||||||
|
def read(self, size=-1):
|
||||||
|
return self.__data.read(size)
|
||||||
|
def readline(self, size=-1):
|
||||||
|
return self.__data.readline(size)
|
||||||
|
def readlines(self, sizehint=-1):
|
||||||
|
return self.__data.readlines(size)
|
||||||
|
|
||||||
|
def __getattr__(self, name):
|
||||||
|
# delegate unknown methods/attributes
|
||||||
|
return getattr(self._response, name)
|
||||||
|
|
||||||
|
class HTTPGzipProcessor(urllib2.BaseHandler):
|
||||||
|
handler_order = 200 # response processing before HTTPEquivProcessor
|
||||||
|
|
||||||
|
def http_request(self, request):
|
||||||
|
request.add_header("Accept-Encoding", "gzip")
|
||||||
|
return request
|
||||||
|
|
||||||
|
def http_response(self, request, response):
|
||||||
|
# post-process response
|
||||||
|
enc_hdrs = response.info().getheaders("Content-encoding")
|
||||||
|
for enc_hdr in enc_hdrs:
|
||||||
|
if ("gzip" in enc_hdr) or ("compress" in enc_hdr):
|
||||||
|
return stupid_gzip_wrapper(response)
|
||||||
|
return response
|
||||||
|
|
||||||
|
https_response = http_response
|
226
src/calibre/utils/mechanize/_headersutil.py
Normal file
226
src/calibre/utils/mechanize/_headersutil.py
Normal file
@ -0,0 +1,226 @@
|
|||||||
|
"""Utility functions for HTTP header value parsing and construction.
|
||||||
|
|
||||||
|
Copyright 1997-1998, Gisle Aas
|
||||||
|
Copyright 2002-2006, John J. Lee
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses (see the file
|
||||||
|
COPYING.txt included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, re
|
||||||
|
from types import StringType
|
||||||
|
from types import UnicodeType
|
||||||
|
STRING_TYPES = StringType, UnicodeType
|
||||||
|
|
||||||
|
from _util import http2time
|
||||||
|
import _rfc3986
|
||||||
|
|
||||||
|
def is_html(ct_headers, url, allow_xhtml=False):
|
||||||
|
"""
|
||||||
|
ct_headers: Sequence of Content-Type headers
|
||||||
|
url: Response URL
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not ct_headers:
|
||||||
|
# guess
|
||||||
|
ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
|
||||||
|
html_exts = [".htm", ".html"]
|
||||||
|
if allow_xhtml:
|
||||||
|
html_exts += [".xhtml"]
|
||||||
|
return ext in html_exts
|
||||||
|
# use first header
|
||||||
|
ct = split_header_words(ct_headers)[0][0][0]
|
||||||
|
html_types = ["text/html"]
|
||||||
|
if allow_xhtml:
|
||||||
|
html_types += [
|
||||||
|
"text/xhtml", "text/xml",
|
||||||
|
"application/xml", "application/xhtml+xml",
|
||||||
|
]
|
||||||
|
return ct in html_types
|
||||||
|
|
||||||
|
def unmatched(match):
|
||||||
|
"""Return unmatched part of re.Match object."""
|
||||||
|
start, end = match.span(0)
|
||||||
|
return match.string[:start]+match.string[end:]
|
||||||
|
|
||||||
|
token_re = re.compile(r"^\s*([^=\s;,]+)")
|
||||||
|
quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
|
||||||
|
value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
|
||||||
|
escape_re = re.compile(r"\\(.)")
|
||||||
|
def split_header_words(header_values):
|
||||||
|
r"""Parse header values into a list of lists containing key,value pairs.
|
||||||
|
|
||||||
|
The function knows how to deal with ",", ";" and "=" as well as quoted
|
||||||
|
values after "=". A list of space separated tokens are parsed as if they
|
||||||
|
were separated by ";".
|
||||||
|
|
||||||
|
If the header_values passed as argument contains multiple values, then they
|
||||||
|
are treated as if they were a single value separated by comma ",".
|
||||||
|
|
||||||
|
This means that this function is useful for parsing header fields that
|
||||||
|
follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
|
||||||
|
the requirement for tokens).
|
||||||
|
|
||||||
|
headers = #header
|
||||||
|
header = (token | parameter) *( [";"] (token | parameter))
|
||||||
|
|
||||||
|
token = 1*<any CHAR except CTLs or separators>
|
||||||
|
separators = "(" | ")" | "<" | ">" | "@"
|
||||||
|
| "," | ";" | ":" | "\" | <">
|
||||||
|
| "/" | "[" | "]" | "?" | "="
|
||||||
|
| "{" | "}" | SP | HT
|
||||||
|
|
||||||
|
quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
|
||||||
|
qdtext = <any TEXT except <">>
|
||||||
|
quoted-pair = "\" CHAR
|
||||||
|
|
||||||
|
parameter = attribute "=" value
|
||||||
|
attribute = token
|
||||||
|
value = token | quoted-string
|
||||||
|
|
||||||
|
Each header is represented by a list of key/value pairs. The value for a
|
||||||
|
simple token (not part of a parameter) is None. Syntactically incorrect
|
||||||
|
headers will not necessarily be parsed as you would want.
|
||||||
|
|
||||||
|
This is easier to describe with some examples:
|
||||||
|
|
||||||
|
>>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
|
||||||
|
[[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
|
||||||
|
>>> split_header_words(['text/html; charset="iso-8859-1"'])
|
||||||
|
[[('text/html', None), ('charset', 'iso-8859-1')]]
|
||||||
|
>>> split_header_words([r'Basic realm="\"foo\bar\""'])
|
||||||
|
[[('Basic', None), ('realm', '"foobar"')]]
|
||||||
|
|
||||||
|
"""
|
||||||
|
assert type(header_values) not in STRING_TYPES
|
||||||
|
result = []
|
||||||
|
for text in header_values:
|
||||||
|
orig_text = text
|
||||||
|
pairs = []
|
||||||
|
while text:
|
||||||
|
m = token_re.search(text)
|
||||||
|
if m:
|
||||||
|
text = unmatched(m)
|
||||||
|
name = m.group(1)
|
||||||
|
m = quoted_value_re.search(text)
|
||||||
|
if m: # quoted value
|
||||||
|
text = unmatched(m)
|
||||||
|
value = m.group(1)
|
||||||
|
value = escape_re.sub(r"\1", value)
|
||||||
|
else:
|
||||||
|
m = value_re.search(text)
|
||||||
|
if m: # unquoted value
|
||||||
|
text = unmatched(m)
|
||||||
|
value = m.group(1)
|
||||||
|
value = value.rstrip()
|
||||||
|
else:
|
||||||
|
# no value, a lone token
|
||||||
|
value = None
|
||||||
|
pairs.append((name, value))
|
||||||
|
elif text.lstrip().startswith(","):
|
||||||
|
# concatenated headers, as per RFC 2616 section 4.2
|
||||||
|
text = text.lstrip()[1:]
|
||||||
|
if pairs: result.append(pairs)
|
||||||
|
pairs = []
|
||||||
|
else:
|
||||||
|
# skip junk
|
||||||
|
non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
|
||||||
|
assert nr_junk_chars > 0, (
|
||||||
|
"split_header_words bug: '%s', '%s', %s" %
|
||||||
|
(orig_text, text, pairs))
|
||||||
|
text = non_junk
|
||||||
|
if pairs: result.append(pairs)
|
||||||
|
return result
|
||||||
|
|
||||||
|
join_escape_re = re.compile(r"([\"\\])")
|
||||||
|
def join_header_words(lists):
|
||||||
|
"""Do the inverse of the conversion done by split_header_words.
|
||||||
|
|
||||||
|
Takes a list of lists of (key, value) pairs and produces a single header
|
||||||
|
value. Attribute values are quoted if needed.
|
||||||
|
|
||||||
|
>>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
|
||||||
|
'text/plain; charset="iso-8859/1"'
|
||||||
|
>>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
|
||||||
|
'text/plain, charset="iso-8859/1"'
|
||||||
|
|
||||||
|
"""
|
||||||
|
headers = []
|
||||||
|
for pairs in lists:
|
||||||
|
attr = []
|
||||||
|
for k, v in pairs:
|
||||||
|
if v is not None:
|
||||||
|
if not re.search(r"^\w+$", v):
|
||||||
|
v = join_escape_re.sub(r"\\\1", v) # escape " and \
|
||||||
|
v = '"%s"' % v
|
||||||
|
if k is None: # Netscape cookies may have no name
|
||||||
|
k = v
|
||||||
|
else:
|
||||||
|
k = "%s=%s" % (k, v)
|
||||||
|
attr.append(k)
|
||||||
|
if attr: headers.append("; ".join(attr))
|
||||||
|
return ", ".join(headers)
|
||||||
|
|
||||||
|
def parse_ns_headers(ns_headers):
|
||||||
|
"""Ad-hoc parser for Netscape protocol cookie-attributes.
|
||||||
|
|
||||||
|
The old Netscape cookie format for Set-Cookie can for instance contain
|
||||||
|
an unquoted "," in the expires field, so we have to use this ad-hoc
|
||||||
|
parser instead of split_header_words.
|
||||||
|
|
||||||
|
XXX This may not make the best possible effort to parse all the crap
|
||||||
|
that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
|
||||||
|
parser is probably better, so could do worse than following that if
|
||||||
|
this ever gives any trouble.
|
||||||
|
|
||||||
|
Currently, this is also used for parsing RFC 2109 cookies.
|
||||||
|
|
||||||
|
"""
|
||||||
|
known_attrs = ("expires", "domain", "path", "secure",
|
||||||
|
# RFC 2109 attrs (may turn up in Netscape cookies, too)
|
||||||
|
"port", "max-age")
|
||||||
|
|
||||||
|
result = []
|
||||||
|
for ns_header in ns_headers:
|
||||||
|
pairs = []
|
||||||
|
version_set = False
|
||||||
|
params = re.split(r";\s*", ns_header)
|
||||||
|
for ii in range(len(params)):
|
||||||
|
param = params[ii]
|
||||||
|
param = param.rstrip()
|
||||||
|
if param == "": continue
|
||||||
|
if "=" not in param:
|
||||||
|
k, v = param, None
|
||||||
|
else:
|
||||||
|
k, v = re.split(r"\s*=\s*", param, 1)
|
||||||
|
k = k.lstrip()
|
||||||
|
if ii != 0:
|
||||||
|
lc = k.lower()
|
||||||
|
if lc in known_attrs:
|
||||||
|
k = lc
|
||||||
|
if k == "version":
|
||||||
|
# This is an RFC 2109 cookie.
|
||||||
|
version_set = True
|
||||||
|
if k == "expires":
|
||||||
|
# convert expires date to seconds since epoch
|
||||||
|
if v.startswith('"'): v = v[1:]
|
||||||
|
if v.endswith('"'): v = v[:-1]
|
||||||
|
v = http2time(v) # None if invalid
|
||||||
|
pairs.append((k, v))
|
||||||
|
|
||||||
|
if pairs:
|
||||||
|
if not version_set:
|
||||||
|
pairs.append(("version", "0"))
|
||||||
|
result.append(pairs)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _test():
|
||||||
|
import doctest, _headersutil
|
||||||
|
return doctest.testmod(_headersutil)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
_test()
|
607
src/calibre/utils/mechanize/_html.py
Normal file
607
src/calibre/utils/mechanize/_html.py
Normal file
@ -0,0 +1,607 @@
|
|||||||
|
"""HTML handling.
|
||||||
|
|
||||||
|
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it under
|
||||||
|
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
||||||
|
included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re, copy, htmlentitydefs
|
||||||
|
import sgmllib, HTMLParser, ClientForm
|
||||||
|
|
||||||
|
import _request
|
||||||
|
from _headersutil import split_header_words, is_html as _is_html
|
||||||
|
import _rfc3986
|
||||||
|
|
||||||
|
DEFAULT_ENCODING = "latin-1"
|
||||||
|
|
||||||
|
|
||||||
|
# the base classe is purely for backwards compatibility
|
||||||
|
class ParseError(ClientForm.ParseError): pass
|
||||||
|
|
||||||
|
|
||||||
|
class CachingGeneratorFunction(object):
|
||||||
|
"""Caching wrapper around a no-arguments iterable."""
|
||||||
|
|
||||||
|
def __init__(self, iterable):
|
||||||
|
self._cache = []
|
||||||
|
# wrap iterable to make it non-restartable (otherwise, repeated
|
||||||
|
# __call__ would give incorrect results)
|
||||||
|
self._iterator = iter(iterable)
|
||||||
|
|
||||||
|
def __call__(self):
|
||||||
|
cache = self._cache
|
||||||
|
for item in cache:
|
||||||
|
yield item
|
||||||
|
for item in self._iterator:
|
||||||
|
cache.append(item)
|
||||||
|
yield item
|
||||||
|
|
||||||
|
|
||||||
|
class EncodingFinder:
|
||||||
|
def __init__(self, default_encoding):
|
||||||
|
self._default_encoding = default_encoding
|
||||||
|
def encoding(self, response):
|
||||||
|
# HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
|
||||||
|
# headers may be in the response. HTTP-EQUIV headers come last,
|
||||||
|
# so try in order from first to last.
|
||||||
|
for ct in response.info().getheaders("content-type"):
|
||||||
|
for k, v in split_header_words([ct])[0]:
|
||||||
|
if k == "charset":
|
||||||
|
return v
|
||||||
|
return self._default_encoding
|
||||||
|
|
||||||
|
class ResponseTypeFinder:
|
||||||
|
def __init__(self, allow_xhtml):
|
||||||
|
self._allow_xhtml = allow_xhtml
|
||||||
|
def is_html(self, response, encoding):
|
||||||
|
ct_hdrs = response.info().getheaders("content-type")
|
||||||
|
url = response.geturl()
|
||||||
|
# XXX encoding
|
||||||
|
return _is_html(ct_hdrs, url, self._allow_xhtml)
|
||||||
|
|
||||||
|
|
||||||
|
# idea for this argument-processing trick is from Peter Otten
|
||||||
|
class Args:
|
||||||
|
def __init__(self, args_map):
|
||||||
|
self.dictionary = dict(args_map)
|
||||||
|
def __getattr__(self, key):
|
||||||
|
try:
|
||||||
|
return self.dictionary[key]
|
||||||
|
except KeyError:
|
||||||
|
return getattr(self.__class__, key)
|
||||||
|
|
||||||
|
def form_parser_args(
|
||||||
|
select_default=False,
|
||||||
|
form_parser_class=None,
|
||||||
|
request_class=None,
|
||||||
|
backwards_compat=False,
|
||||||
|
):
|
||||||
|
return Args(locals())
|
||||||
|
|
||||||
|
|
||||||
|
class Link:
|
||||||
|
def __init__(self, base_url, url, text, tag, attrs):
|
||||||
|
assert None not in [url, tag, attrs]
|
||||||
|
self.base_url = base_url
|
||||||
|
self.absolute_url = _rfc3986.urljoin(base_url, url)
|
||||||
|
self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
|
||||||
|
def __cmp__(self, other):
|
||||||
|
try:
|
||||||
|
for name in "url", "text", "tag", "attrs":
|
||||||
|
if getattr(self, name) != getattr(other, name):
|
||||||
|
return -1
|
||||||
|
except AttributeError:
|
||||||
|
return -1
|
||||||
|
return 0
|
||||||
|
def __repr__(self):
|
||||||
|
return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
|
||||||
|
self.base_url, self.url, self.text, self.tag, self.attrs)
|
||||||
|
|
||||||
|
|
||||||
|
class LinksFactory:
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
link_parser_class=None,
|
||||||
|
link_class=Link,
|
||||||
|
urltags=None,
|
||||||
|
):
|
||||||
|
import _pullparser
|
||||||
|
if link_parser_class is None:
|
||||||
|
link_parser_class = _pullparser.TolerantPullParser
|
||||||
|
self.link_parser_class = link_parser_class
|
||||||
|
self.link_class = link_class
|
||||||
|
if urltags is None:
|
||||||
|
urltags = {
|
||||||
|
"a": "href",
|
||||||
|
"area": "href",
|
||||||
|
"frame": "src",
|
||||||
|
"iframe": "src",
|
||||||
|
}
|
||||||
|
self.urltags = urltags
|
||||||
|
self._response = None
|
||||||
|
self._encoding = None
|
||||||
|
|
||||||
|
def set_response(self, response, base_url, encoding):
|
||||||
|
self._response = response
|
||||||
|
self._encoding = encoding
|
||||||
|
self._base_url = base_url
|
||||||
|
|
||||||
|
def links(self):
|
||||||
|
"""Return an iterator that provides links of the document."""
|
||||||
|
response = self._response
|
||||||
|
encoding = self._encoding
|
||||||
|
base_url = self._base_url
|
||||||
|
p = self.link_parser_class(response, encoding=encoding)
|
||||||
|
|
||||||
|
try:
|
||||||
|
for token in p.tags(*(self.urltags.keys()+["base"])):
|
||||||
|
if token.type == "endtag":
|
||||||
|
continue
|
||||||
|
if token.data == "base":
|
||||||
|
base_href = dict(token.attrs).get("href")
|
||||||
|
if base_href is not None:
|
||||||
|
base_url = base_href
|
||||||
|
continue
|
||||||
|
attrs = dict(token.attrs)
|
||||||
|
tag = token.data
|
||||||
|
name = attrs.get("name")
|
||||||
|
text = None
|
||||||
|
# XXX use attr_encoding for ref'd doc if that doc does not
|
||||||
|
# provide one by other means
|
||||||
|
#attr_encoding = attrs.get("charset")
|
||||||
|
url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
|
||||||
|
if not url:
|
||||||
|
# Probably an <A NAME="blah"> link or <AREA NOHREF...>.
|
||||||
|
# For our purposes a link is something with a URL, so
|
||||||
|
# ignore this.
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = _rfc3986.clean_url(url, encoding)
|
||||||
|
if tag == "a":
|
||||||
|
if token.type != "startendtag":
|
||||||
|
# hmm, this'd break if end tag is missing
|
||||||
|
text = p.get_compressed_text(("endtag", tag))
|
||||||
|
# but this doesn't work for eg.
|
||||||
|
# <a href="blah"><b>Andy</b></a>
|
||||||
|
#text = p.get_compressed_text()
|
||||||
|
|
||||||
|
yield Link(base_url, url, text, tag, token.attrs)
|
||||||
|
except sgmllib.SGMLParseError, exc:
|
||||||
|
raise ParseError(exc)
|
||||||
|
|
||||||
|
class FormsFactory:
|
||||||
|
|
||||||
|
"""Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
|
||||||
|
|
||||||
|
After calling .forms(), the .global_form attribute is a form object
|
||||||
|
containing all controls not a descendant of any FORM element.
|
||||||
|
|
||||||
|
For constructor argument docs, see ClientForm.ParseResponse
|
||||||
|
argument docs.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
select_default=False,
|
||||||
|
form_parser_class=None,
|
||||||
|
request_class=None,
|
||||||
|
backwards_compat=False,
|
||||||
|
):
|
||||||
|
import ClientForm
|
||||||
|
self.select_default = select_default
|
||||||
|
if form_parser_class is None:
|
||||||
|
form_parser_class = ClientForm.FormParser
|
||||||
|
self.form_parser_class = form_parser_class
|
||||||
|
if request_class is None:
|
||||||
|
request_class = _request.Request
|
||||||
|
self.request_class = request_class
|
||||||
|
self.backwards_compat = backwards_compat
|
||||||
|
self._response = None
|
||||||
|
self.encoding = None
|
||||||
|
self.global_form = None
|
||||||
|
|
||||||
|
def set_response(self, response, encoding):
|
||||||
|
self._response = response
|
||||||
|
self.encoding = encoding
|
||||||
|
self.global_form = None
|
||||||
|
|
||||||
|
def forms(self):
|
||||||
|
import ClientForm
|
||||||
|
encoding = self.encoding
|
||||||
|
try:
|
||||||
|
forms = ClientForm.ParseResponseEx(
|
||||||
|
self._response,
|
||||||
|
select_default=self.select_default,
|
||||||
|
form_parser_class=self.form_parser_class,
|
||||||
|
request_class=self.request_class,
|
||||||
|
encoding=encoding,
|
||||||
|
_urljoin=_rfc3986.urljoin,
|
||||||
|
_urlparse=_rfc3986.urlsplit,
|
||||||
|
_urlunparse=_rfc3986.urlunsplit,
|
||||||
|
)
|
||||||
|
except ClientForm.ParseError, exc:
|
||||||
|
raise ParseError(exc)
|
||||||
|
self.global_form = forms[0]
|
||||||
|
return forms[1:]
|
||||||
|
|
||||||
|
class TitleFactory:
|
||||||
|
def __init__(self):
|
||||||
|
self._response = self._encoding = None
|
||||||
|
|
||||||
|
def set_response(self, response, encoding):
|
||||||
|
self._response = response
|
||||||
|
self._encoding = encoding
|
||||||
|
|
||||||
|
def title(self):
|
||||||
|
import _pullparser
|
||||||
|
p = _pullparser.TolerantPullParser(
|
||||||
|
self._response, encoding=self._encoding)
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
p.get_tag("title")
|
||||||
|
except _pullparser.NoMoreTokensError:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return p.get_text()
|
||||||
|
except sgmllib.SGMLParseError, exc:
|
||||||
|
raise ParseError(exc)
|
||||||
|
|
||||||
|
|
||||||
|
def unescape(data, entities, encoding):
|
||||||
|
if data is None or "&" not in data:
|
||||||
|
return data
|
||||||
|
|
||||||
|
def replace_entities(match):
|
||||||
|
ent = match.group()
|
||||||
|
if ent[1] == "#":
|
||||||
|
return unescape_charref(ent[2:-1], encoding)
|
||||||
|
|
||||||
|
repl = entities.get(ent[1:-1])
|
||||||
|
if repl is not None:
|
||||||
|
repl = unichr(repl)
|
||||||
|
if type(repl) != type(""):
|
||||||
|
try:
|
||||||
|
repl = repl.encode(encoding)
|
||||||
|
except UnicodeError:
|
||||||
|
repl = ent
|
||||||
|
else:
|
||||||
|
repl = ent
|
||||||
|
return repl
|
||||||
|
|
||||||
|
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
|
||||||
|
|
||||||
|
def unescape_charref(data, encoding):
|
||||||
|
name, base = data, 10
|
||||||
|
if name.startswith("x"):
|
||||||
|
name, base= name[1:], 16
|
||||||
|
uc = unichr(int(name, base))
|
||||||
|
if encoding is None:
|
||||||
|
return uc
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
repl = uc.encode(encoding)
|
||||||
|
except UnicodeError:
|
||||||
|
repl = "&#%s;" % data
|
||||||
|
return repl
|
||||||
|
|
||||||
|
|
||||||
|
# bizarre import gymnastics for bundled BeautifulSoup
|
||||||
|
import _beautifulsoup
|
||||||
|
import ClientForm
|
||||||
|
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
|
||||||
|
_beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
|
||||||
|
)
|
||||||
|
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
|
||||||
|
import sgmllib
|
||||||
|
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
|
||||||
|
|
||||||
|
class MechanizeBs(_beautifulsoup.BeautifulSoup):
|
||||||
|
_entitydefs = htmlentitydefs.name2codepoint
|
||||||
|
# don't want the magic Microsoft-char workaround
|
||||||
|
PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
|
||||||
|
lambda(x):x.group(1) + ' />'),
|
||||||
|
(re.compile('<!\s+([^<>]*)>'),
|
||||||
|
lambda(x):'<!' + x.group(1) + '>')
|
||||||
|
]
|
||||||
|
|
||||||
|
def __init__(self, encoding, text=None, avoidParserProblems=True,
|
||||||
|
initialTextIsEverything=True):
|
||||||
|
self._encoding = encoding
|
||||||
|
_beautifulsoup.BeautifulSoup.__init__(
|
||||||
|
self, text, avoidParserProblems, initialTextIsEverything)
|
||||||
|
|
||||||
|
def handle_charref(self, ref):
|
||||||
|
t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
|
||||||
|
self.handle_data(t)
|
||||||
|
def handle_entityref(self, ref):
|
||||||
|
t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
|
||||||
|
self.handle_data(t)
|
||||||
|
def unescape_attrs(self, attrs):
|
||||||
|
escaped_attrs = []
|
||||||
|
for key, val in attrs:
|
||||||
|
val = unescape(val, self._entitydefs, self._encoding)
|
||||||
|
escaped_attrs.append((key, val))
|
||||||
|
return escaped_attrs
|
||||||
|
|
||||||
|
class RobustLinksFactory:
|
||||||
|
|
||||||
|
compress_re = re.compile(r"\s+")
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
link_parser_class=None,
|
||||||
|
link_class=Link,
|
||||||
|
urltags=None,
|
||||||
|
):
|
||||||
|
import _beautifulsoup
|
||||||
|
if link_parser_class is None:
|
||||||
|
link_parser_class = MechanizeBs
|
||||||
|
self.link_parser_class = link_parser_class
|
||||||
|
self.link_class = link_class
|
||||||
|
if urltags is None:
|
||||||
|
urltags = {
|
||||||
|
"a": "href",
|
||||||
|
"area": "href",
|
||||||
|
"frame": "src",
|
||||||
|
"iframe": "src",
|
||||||
|
}
|
||||||
|
self.urltags = urltags
|
||||||
|
self._bs = None
|
||||||
|
self._encoding = None
|
||||||
|
self._base_url = None
|
||||||
|
|
||||||
|
def set_soup(self, soup, base_url, encoding):
|
||||||
|
self._bs = soup
|
||||||
|
self._base_url = base_url
|
||||||
|
self._encoding = encoding
|
||||||
|
|
||||||
|
def links(self):
|
||||||
|
import _beautifulsoup
|
||||||
|
bs = self._bs
|
||||||
|
base_url = self._base_url
|
||||||
|
encoding = self._encoding
|
||||||
|
gen = bs.recursiveChildGenerator()
|
||||||
|
for ch in bs.recursiveChildGenerator():
|
||||||
|
if (isinstance(ch, _beautifulsoup.Tag) and
|
||||||
|
ch.name in self.urltags.keys()+["base"]):
|
||||||
|
link = ch
|
||||||
|
attrs = bs.unescape_attrs(link.attrs)
|
||||||
|
attrs_dict = dict(attrs)
|
||||||
|
if link.name == "base":
|
||||||
|
base_href = attrs_dict.get("href")
|
||||||
|
if base_href is not None:
|
||||||
|
base_url = base_href
|
||||||
|
continue
|
||||||
|
url_attr = self.urltags[link.name]
|
||||||
|
url = attrs_dict.get(url_attr)
|
||||||
|
if not url:
|
||||||
|
continue
|
||||||
|
url = _rfc3986.clean_url(url, encoding)
|
||||||
|
text = link.firstText(lambda t: True)
|
||||||
|
if text is _beautifulsoup.Null:
|
||||||
|
# follow _pullparser's weird behaviour rigidly
|
||||||
|
if link.name == "a":
|
||||||
|
text = ""
|
||||||
|
else:
|
||||||
|
text = None
|
||||||
|
else:
|
||||||
|
text = self.compress_re.sub(" ", text.strip())
|
||||||
|
yield Link(base_url, url, text, link.name, attrs)
|
||||||
|
|
||||||
|
|
||||||
|
class RobustFormsFactory(FormsFactory):
|
||||||
|
def __init__(self, *args, **kwds):
|
||||||
|
import ClientForm
|
||||||
|
args = form_parser_args(*args, **kwds)
|
||||||
|
if args.form_parser_class is None:
|
||||||
|
args.form_parser_class = RobustFormParser
|
||||||
|
FormsFactory.__init__(self, **args.dictionary)
|
||||||
|
|
||||||
|
def set_response(self, response, encoding):
|
||||||
|
self._response = response
|
||||||
|
self.encoding = encoding
|
||||||
|
|
||||||
|
|
||||||
|
class RobustTitleFactory:
|
||||||
|
def __init__(self):
|
||||||
|
self._bs = self._encoding = None
|
||||||
|
|
||||||
|
def set_soup(self, soup, encoding):
|
||||||
|
self._bs = soup
|
||||||
|
self._encoding = encoding
|
||||||
|
|
||||||
|
def title(self):
|
||||||
|
import _beautifulsoup
|
||||||
|
title = self._bs.first("title")
|
||||||
|
if title == _beautifulsoup.Null:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return title.firstText(lambda t: True)
|
||||||
|
|
||||||
|
|
||||||
|
class Factory:
|
||||||
|
"""Factory for forms, links, etc.
|
||||||
|
|
||||||
|
This interface may expand in future.
|
||||||
|
|
||||||
|
Public methods:
|
||||||
|
|
||||||
|
set_request_class(request_class)
|
||||||
|
set_response(response)
|
||||||
|
forms()
|
||||||
|
links()
|
||||||
|
|
||||||
|
Public attributes:
|
||||||
|
|
||||||
|
Note that accessing these attributes may raise ParseError.
|
||||||
|
|
||||||
|
encoding: string specifying the encoding of response if it contains a text
|
||||||
|
document (this value is left unspecified for documents that do not have
|
||||||
|
an encoding, e.g. an image file)
|
||||||
|
is_html: true if response contains an HTML document (XHTML may be
|
||||||
|
regarded as HTML too)
|
||||||
|
title: page title, or None if no title or not HTML
|
||||||
|
global_form: form object containing all controls that are not descendants
|
||||||
|
of any FORM element, or None if the forms_factory does not support
|
||||||
|
supplying a global form
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]
|
||||||
|
|
||||||
|
def __init__(self, forms_factory, links_factory, title_factory,
|
||||||
|
encoding_finder=EncodingFinder(DEFAULT_ENCODING),
|
||||||
|
response_type_finder=ResponseTypeFinder(allow_xhtml=False),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Pass keyword arguments only.
|
||||||
|
|
||||||
|
default_encoding: character encoding to use if encoding cannot be
|
||||||
|
determined (or guessed) from the response. You should turn on
|
||||||
|
HTTP-EQUIV handling if you want the best chance of getting this right
|
||||||
|
without resorting to this default. The default value of this
|
||||||
|
parameter (currently latin-1) may change in future.
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._forms_factory = forms_factory
|
||||||
|
self._links_factory = links_factory
|
||||||
|
self._title_factory = title_factory
|
||||||
|
self._encoding_finder = encoding_finder
|
||||||
|
self._response_type_finder = response_type_finder
|
||||||
|
|
||||||
|
self.set_response(None)
|
||||||
|
|
||||||
|
def set_request_class(self, request_class):
|
||||||
|
"""Set urllib2.Request class.
|
||||||
|
|
||||||
|
ClientForm.HTMLForm instances returned by .forms() will return
|
||||||
|
instances of this class when .click()ed.
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._forms_factory.request_class = request_class
|
||||||
|
|
||||||
|
def set_response(self, response):
|
||||||
|
"""Set response.
|
||||||
|
|
||||||
|
The response must either be None or implement the same interface as
|
||||||
|
objects returned by urllib2.urlopen().
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._response = response
|
||||||
|
self._forms_genf = self._links_genf = None
|
||||||
|
self._get_title = None
|
||||||
|
for name in self.LAZY_ATTRS:
|
||||||
|
try:
|
||||||
|
delattr(self, name)
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def __getattr__(self, name):
|
||||||
|
if name not in self.LAZY_ATTRS:
|
||||||
|
return getattr(self.__class__, name)
|
||||||
|
|
||||||
|
if name == "encoding":
|
||||||
|
self.encoding = self._encoding_finder.encoding(
|
||||||
|
copy.copy(self._response))
|
||||||
|
return self.encoding
|
||||||
|
elif name == "is_html":
|
||||||
|
self.is_html = self._response_type_finder.is_html(
|
||||||
|
copy.copy(self._response), self.encoding)
|
||||||
|
return self.is_html
|
||||||
|
elif name == "title":
|
||||||
|
if self.is_html:
|
||||||
|
self.title = self._title_factory.title()
|
||||||
|
else:
|
||||||
|
self.title = None
|
||||||
|
return self.title
|
||||||
|
elif name == "global_form":
|
||||||
|
self.forms()
|
||||||
|
return self.global_form
|
||||||
|
|
||||||
|
def forms(self):
|
||||||
|
"""Return iterable over ClientForm.HTMLForm-like objects.
|
||||||
|
|
||||||
|
Raises mechanize.ParseError on failure.
|
||||||
|
"""
|
||||||
|
# this implementation sets .global_form as a side-effect, for benefit
|
||||||
|
# of __getattr__ impl
|
||||||
|
if self._forms_genf is None:
|
||||||
|
try:
|
||||||
|
self._forms_genf = CachingGeneratorFunction(
|
||||||
|
self._forms_factory.forms())
|
||||||
|
except: # XXXX define exception!
|
||||||
|
self.set_response(self._response)
|
||||||
|
raise
|
||||||
|
self.global_form = getattr(
|
||||||
|
self._forms_factory, "global_form", None)
|
||||||
|
return self._forms_genf()
|
||||||
|
|
||||||
|
def links(self):
|
||||||
|
"""Return iterable over mechanize.Link-like objects.
|
||||||
|
|
||||||
|
Raises mechanize.ParseError on failure.
|
||||||
|
"""
|
||||||
|
if self._links_genf is None:
|
||||||
|
try:
|
||||||
|
self._links_genf = CachingGeneratorFunction(
|
||||||
|
self._links_factory.links())
|
||||||
|
except: # XXXX define exception!
|
||||||
|
self.set_response(self._response)
|
||||||
|
raise
|
||||||
|
return self._links_genf()
|
||||||
|
|
||||||
|
class DefaultFactory(Factory):
|
||||||
|
"""Based on sgmllib."""
|
||||||
|
def __init__(self, i_want_broken_xhtml_support=False):
|
||||||
|
Factory.__init__(
|
||||||
|
self,
|
||||||
|
forms_factory=FormsFactory(),
|
||||||
|
links_factory=LinksFactory(),
|
||||||
|
title_factory=TitleFactory(),
|
||||||
|
response_type_finder=ResponseTypeFinder(
|
||||||
|
allow_xhtml=i_want_broken_xhtml_support),
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_response(self, response):
|
||||||
|
Factory.set_response(self, response)
|
||||||
|
if response is not None:
|
||||||
|
self._forms_factory.set_response(
|
||||||
|
copy.copy(response), self.encoding)
|
||||||
|
self._links_factory.set_response(
|
||||||
|
copy.copy(response), response.geturl(), self.encoding)
|
||||||
|
self._title_factory.set_response(
|
||||||
|
copy.copy(response), self.encoding)
|
||||||
|
|
||||||
|
class RobustFactory(Factory):
|
||||||
|
"""Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
|
||||||
|
DefaultFactory.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self, i_want_broken_xhtml_support=False,
|
||||||
|
soup_class=None):
|
||||||
|
Factory.__init__(
|
||||||
|
self,
|
||||||
|
forms_factory=RobustFormsFactory(),
|
||||||
|
links_factory=RobustLinksFactory(),
|
||||||
|
title_factory=RobustTitleFactory(),
|
||||||
|
response_type_finder=ResponseTypeFinder(
|
||||||
|
allow_xhtml=i_want_broken_xhtml_support),
|
||||||
|
)
|
||||||
|
if soup_class is None:
|
||||||
|
soup_class = MechanizeBs
|
||||||
|
self._soup_class = soup_class
|
||||||
|
|
||||||
|
def set_response(self, response):
|
||||||
|
import _beautifulsoup
|
||||||
|
Factory.set_response(self, response)
|
||||||
|
if response is not None:
|
||||||
|
data = response.read()
|
||||||
|
soup = self._soup_class(self.encoding, data)
|
||||||
|
self._forms_factory.set_response(
|
||||||
|
copy.copy(response), self.encoding)
|
||||||
|
self._links_factory.set_soup(
|
||||||
|
soup, response.geturl(), self.encoding)
|
||||||
|
self._title_factory.set_soup(soup, self.encoding)
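

# Sketch (added for illustration, not part of the upstream module):
# RobustFactory is typically selected when constructing a Browser that must
# cope with badly-formed HTML, e.g.
#
#     import mechanize
#     br = mechanize.Browser(factory=mechanize.RobustFactory())
#     br.open("http://example.com/")   # hypothetical URL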
|
729
src/calibre/utils/mechanize/_http.py
Normal file
@ -0,0 +1,729 @@
"""HTTP related handlers.
|
||||||
|
|
||||||
|
Note that some other HTTP handlers live in more specific modules: _auth.py,
|
||||||
|
_gzip.py, etc.
|
||||||
|
|
||||||
|
|
||||||
|
Copyright 2002-2006 John J Lee <jjl@pobox.com>
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses (see the file
|
||||||
|
COPYING.txt included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import copy, time, tempfile, htmlentitydefs, re, logging, socket, \
|
||||||
|
urllib2, urllib, httplib, sgmllib
|
||||||
|
from urllib2 import URLError, HTTPError, BaseHandler
|
||||||
|
from cStringIO import StringIO
|
||||||
|
|
||||||
|
from _request import Request
|
||||||
|
from _util import isstringlike
|
||||||
|
from _response import closeable_response, response_seek_wrapper
|
||||||
|
from _html import unescape, unescape_charref
|
||||||
|
from _headersutil import is_html
|
||||||
|
from _clientcookie import CookieJar, request_host
|
||||||
|
import _rfc3986
|
||||||
|
|
||||||
|
debug = logging.getLogger("mechanize").debug
|
||||||
|
|
||||||
|
# monkeypatch urllib2.HTTPError to show URL
|
||||||
|
## def urllib2_str(self):
|
||||||
|
## return 'HTTP Error %s: %s (%s)' % (
|
||||||
|
## self.code, self.msg, self.geturl())
|
||||||
|
## urllib2.HTTPError.__str__ = urllib2_str
|
||||||
|
|
||||||
|
|
||||||
|
CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
|
||||||
|
DEFAULT_ENCODING = 'latin-1'
|
||||||
|
|
||||||
|
|
||||||
|
# This adds "refresh" to the list of redirectables and provides a redirection
|
||||||
|
# algorithm that doesn't go into a loop in the presence of cookies
|
||||||
|
# (Python 2.4 has this new algorithm, 2.3 doesn't).
|
||||||
|
class HTTPRedirectHandler(BaseHandler):
|
||||||
|
# maximum number of redirections to any single URL
|
||||||
|
# this is needed because of the state that cookies introduce
|
||||||
|
max_repeats = 4
|
||||||
|
# maximum total number of redirections (regardless of URL) before
|
||||||
|
# assuming we're in a loop
|
||||||
|
max_redirections = 10
|
||||||
|
|
||||||
|
# Implementation notes:
|
||||||
|
|
||||||
|
# To avoid the server sending us into an infinite loop, the request
|
||||||
|
# object needs to track what URLs we have already seen. Do this by
|
||||||
|
# adding a handler-specific attribute to the Request object. The value
|
||||||
|
# of the dict is used to count the number of times the same URL has
|
||||||
|
# been visited. This is needed because visiting the same URL twice
|
||||||
|
# does not necessarily imply a loop, thanks to state introduced by
|
||||||
|
# cookies.
|
||||||
|
|
||||||
|
# Always unhandled redirection codes:
|
||||||
|
# 300 Multiple Choices: should not handle this here.
|
||||||
|
# 304 Not Modified: no need to handle here: only of interest to caches
|
||||||
|
# that do conditional GETs
|
||||||
|
# 305 Use Proxy: probably not worth dealing with here
|
||||||
|
# 306 Unused: what was this for in the previous versions of protocol??
|
||||||
|
|
||||||
|
def redirect_request(self, newurl, req, fp, code, msg, headers):
|
||||||
|
"""Return a Request or None in response to a redirect.
|
||||||
|
|
||||||
|
This is called by the http_error_30x methods when a redirection
|
||||||
|
response is received. If a redirection should take place, return a
|
||||||
|
new Request to allow http_error_30x to perform the redirect;
|
||||||
|
otherwise, return None to indicate that an HTTPError should be
|
||||||
|
raised.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if code in (301, 302, 303, "refresh") or \
|
||||||
|
(code == 307 and not req.has_data()):
|
||||||
|
# Strictly (according to RFC 2616), 301 or 302 in response to
|
||||||
|
# a POST MUST NOT cause a redirection without confirmation
|
||||||
|
# from the user (of urllib2, in this case). In practice,
|
||||||
|
# essentially all clients do redirect in this case, so we do
|
||||||
|
# the same.
|
||||||
|
# XXX really refresh redirections should be visiting; tricky to
|
||||||
|
# fix, so this will wait until post-stable release
|
||||||
|
new = Request(newurl,
|
||||||
|
headers=req.headers,
|
||||||
|
origin_req_host=req.get_origin_req_host(),
|
||||||
|
unverifiable=True,
|
||||||
|
visit=False,
|
||||||
|
)
|
||||||
|
new._origin_req = getattr(req, "_origin_req", req)
|
||||||
|
return new
|
||||||
|
else:
|
||||||
|
raise HTTPError(req.get_full_url(), code, msg, headers, fp)
|
||||||
|
|
||||||
|
def http_error_302(self, req, fp, code, msg, headers):
|
||||||
|
# Some servers (incorrectly) return multiple Location headers
|
||||||
|
# (so probably same goes for URI). Use first header.
|
||||||
|
if headers.has_key('location'):
|
||||||
|
newurl = headers.getheaders('location')[0]
|
||||||
|
elif headers.has_key('uri'):
|
||||||
|
newurl = headers.getheaders('uri')[0]
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
newurl = _rfc3986.clean_url(newurl, "latin-1")
|
||||||
|
newurl = _rfc3986.urljoin(req.get_full_url(), newurl)
|
||||||
|
|
||||||
|
# XXX Probably want to forget about the state of the current
|
||||||
|
# request, although that might interact poorly with other
|
||||||
|
# handlers that also use handler-specific request attributes
|
||||||
|
new = self.redirect_request(newurl, req, fp, code, msg, headers)
|
||||||
|
if new is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
# loop detection
|
||||||
|
# .redirect_dict has a key url if url was previously visited.
|
||||||
|
if hasattr(req, 'redirect_dict'):
|
||||||
|
visited = new.redirect_dict = req.redirect_dict
|
||||||
|
if (visited.get(newurl, 0) >= self.max_repeats or
|
||||||
|
len(visited) >= self.max_redirections):
|
||||||
|
raise HTTPError(req.get_full_url(), code,
|
||||||
|
self.inf_msg + msg, headers, fp)
|
||||||
|
else:
|
||||||
|
visited = new.redirect_dict = req.redirect_dict = {}
|
||||||
|
visited[newurl] = visited.get(newurl, 0) + 1
|
||||||
|
|
||||||
|
# Don't close the fp until we are sure that we won't use it
|
||||||
|
# with HTTPError.
|
||||||
|
fp.read()
|
||||||
|
fp.close()
|
||||||
|
|
||||||
|
return self.parent.open(new)
|
||||||
|
|
||||||
|
http_error_301 = http_error_303 = http_error_307 = http_error_302
|
||||||
|
http_error_refresh = http_error_302
|
||||||
|
|
||||||
|
inf_msg = "The HTTP server returned a redirect error that would " \
|
||||||
|
"lead to an infinite loop.\n" \
|
||||||
|
"The last 30x error message was:\n"
|
||||||
|
|
||||||
|
|
||||||
|
# XXX would self.reset() work, instead of raising this exception?
|
||||||
|
class EndOfHeadError(Exception): pass
|
||||||
|
class AbstractHeadParser:
|
||||||
|
# only these elements are allowed in or before HEAD of document
|
||||||
|
head_elems = ("html", "head",
|
||||||
|
"title", "base",
|
||||||
|
"script", "style", "meta", "link", "object")
|
||||||
|
_entitydefs = htmlentitydefs.name2codepoint
|
||||||
|
_encoding = DEFAULT_ENCODING
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.http_equiv = []
|
||||||
|
|
||||||
|
def start_meta(self, attrs):
|
||||||
|
http_equiv = content = None
|
||||||
|
for key, value in attrs:
|
||||||
|
if key == "http-equiv":
|
||||||
|
http_equiv = self.unescape_attr_if_required(value)
|
||||||
|
elif key == "content":
|
||||||
|
content = self.unescape_attr_if_required(value)
|
||||||
|
if http_equiv is not None and content is not None:
|
||||||
|
self.http_equiv.append((http_equiv, content))
|
||||||
|
|
||||||
|
def end_head(self):
|
||||||
|
raise EndOfHeadError()
|
||||||
|
|
||||||
|
def handle_entityref(self, name):
|
||||||
|
#debug("%s", name)
|
||||||
|
self.handle_data(unescape(
|
||||||
|
'&%s;' % name, self._entitydefs, self._encoding))
|
||||||
|
|
||||||
|
def handle_charref(self, name):
|
||||||
|
#debug("%s", name)
|
||||||
|
self.handle_data(unescape_charref(name, self._encoding))
|
||||||
|
|
||||||
|
def unescape_attr(self, name):
|
||||||
|
#debug("%s", name)
|
||||||
|
return unescape(name, self._entitydefs, self._encoding)
|
||||||
|
|
||||||
|
def unescape_attrs(self, attrs):
|
||||||
|
#debug("%s", attrs)
|
||||||
|
escaped_attrs = {}
|
||||||
|
for key, val in attrs.items():
|
||||||
|
escaped_attrs[key] = self.unescape_attr(val)
|
||||||
|
return escaped_attrs
|
||||||
|
|
||||||
|
def unknown_entityref(self, ref):
|
||||||
|
self.handle_data("&%s;" % ref)
|
||||||
|
|
||||||
|
def unknown_charref(self, ref):
|
||||||
|
self.handle_data("&#%s;" % ref)
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
import HTMLParser
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
class XHTMLCompatibleHeadParser(AbstractHeadParser,
|
||||||
|
HTMLParser.HTMLParser):
|
||||||
|
def __init__(self):
|
||||||
|
HTMLParser.HTMLParser.__init__(self)
|
||||||
|
AbstractHeadParser.__init__(self)
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attrs):
|
||||||
|
if tag not in self.head_elems:
|
||||||
|
raise EndOfHeadError()
|
||||||
|
try:
|
||||||
|
method = getattr(self, 'start_' + tag)
|
||||||
|
except AttributeError:
|
||||||
|
try:
|
||||||
|
method = getattr(self, 'do_' + tag)
|
||||||
|
except AttributeError:
|
||||||
|
pass # unknown tag
|
||||||
|
else:
|
||||||
|
method(attrs)
|
||||||
|
else:
|
||||||
|
method(attrs)
|
||||||
|
|
||||||
|
def handle_endtag(self, tag):
|
||||||
|
if tag not in self.head_elems:
|
||||||
|
raise EndOfHeadError()
|
||||||
|
try:
|
||||||
|
method = getattr(self, 'end_' + tag)
|
||||||
|
except AttributeError:
|
||||||
|
pass # unknown tag
|
||||||
|
else:
|
||||||
|
method()
|
||||||
|
|
||||||
|
def unescape(self, name):
|
||||||
|
# Use the entitydefs passed into constructor, not
|
||||||
|
# HTMLParser.HTMLParser's entitydefs.
|
||||||
|
return self.unescape_attr(name)
|
||||||
|
|
||||||
|
def unescape_attr_if_required(self, name):
|
||||||
|
return name # HTMLParser.HTMLParser already did it
|
||||||
|
|
||||||
|
class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
|
||||||
|
|
||||||
|
def _not_called(self):
|
||||||
|
assert False
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
sgmllib.SGMLParser.__init__(self)
|
||||||
|
AbstractHeadParser.__init__(self)
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, method, attrs):
|
||||||
|
if tag not in self.head_elems:
|
||||||
|
raise EndOfHeadError()
|
||||||
|
if tag == "meta":
|
||||||
|
method(attrs)
|
||||||
|
|
||||||
|
def unknown_starttag(self, tag, attrs):
|
||||||
|
self.handle_starttag(tag, self._not_called, attrs)
|
||||||
|
|
||||||
|
def handle_endtag(self, tag, method):
|
||||||
|
if tag in self.head_elems:
|
||||||
|
method()
|
||||||
|
else:
|
||||||
|
raise EndOfHeadError()
|
||||||
|
|
||||||
|
def unescape_attr_if_required(self, name):
|
||||||
|
return self.unescape_attr(name)
|
||||||
|
|
||||||
|
def parse_head(fileobj, parser):
|
||||||
|
"""Return a list of key, value pairs."""
|
||||||
|
while 1:
|
||||||
|
data = fileobj.read(CHUNK)
|
||||||
|
try:
|
||||||
|
parser.feed(data)
|
||||||
|
except EndOfHeadError:
|
||||||
|
break
|
||||||
|
if len(data) != CHUNK:
|
||||||
|
# this should only happen if there is no HTML body, or if
|
||||||
|
# CHUNK is big
|
||||||
|
break
|
||||||
|
return parser.http_equiv
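
# Example (added for illustration; assumes the sgmllib-based HeadParser
# defined below):
#
#     from cStringIO import StringIO
#     html = '<html><head><meta http-equiv="refresh" content="5"></head>'
#     parse_head(StringIO(html), HeadParser())
#     # -> [('refresh', '5')]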
|
||||||
|
|
||||||
|
class HTTPEquivProcessor(BaseHandler):
|
||||||
|
"""Append META HTTP-EQUIV headers to regular HTTP headers."""
|
||||||
|
|
||||||
|
handler_order = 300 # before handlers that look at HTTP headers
|
||||||
|
|
||||||
|
def __init__(self, head_parser_class=HeadParser,
|
||||||
|
i_want_broken_xhtml_support=False,
|
||||||
|
):
|
||||||
|
self.head_parser_class = head_parser_class
|
||||||
|
self._allow_xhtml = i_want_broken_xhtml_support
|
||||||
|
|
||||||
|
def http_response(self, request, response):
|
||||||
|
if not hasattr(response, "seek"):
|
||||||
|
response = response_seek_wrapper(response)
|
||||||
|
http_message = response.info()
|
||||||
|
url = response.geturl()
|
||||||
|
ct_hdrs = http_message.getheaders("content-type")
|
||||||
|
if is_html(ct_hdrs, url, self._allow_xhtml):
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
html_headers = parse_head(response, self.head_parser_class())
|
||||||
|
finally:
|
||||||
|
response.seek(0)
|
||||||
|
except (HTMLParser.HTMLParseError,
|
||||||
|
sgmllib.SGMLParseError):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
for hdr, val in html_headers:
|
||||||
|
# add a header
|
||||||
|
http_message.dict[hdr.lower()] = val
|
||||||
|
text = hdr + ": " + val
|
||||||
|
for line in text.split("\n"):
|
||||||
|
http_message.headers.append(line + "\n")
|
||||||
|
return response
|
||||||
|
|
||||||
|
https_response = http_response
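
# Sketch (illustrative, not upstream code): the processor is installed like
# any other urllib2-style handler, e.g.
#
#     opener = urllib2.OpenerDirector()        # hypothetical wiring
#     opener.add_handler(HTTPEquivProcessor())
#
# Responses opened through such an opener then expose META HTTP-EQUIV values
# as ordinary HTTP headers via response.info().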
|
||||||
|
|
||||||
|
class HTTPCookieProcessor(BaseHandler):
|
||||||
|
"""Handle HTTP cookies.
|
||||||
|
|
||||||
|
Public attributes:
|
||||||
|
|
||||||
|
cookiejar: CookieJar instance
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self, cookiejar=None):
|
||||||
|
if cookiejar is None:
|
||||||
|
cookiejar = CookieJar()
|
||||||
|
self.cookiejar = cookiejar
|
||||||
|
|
||||||
|
def http_request(self, request):
|
||||||
|
self.cookiejar.add_cookie_header(request)
|
||||||
|
return request
|
||||||
|
|
||||||
|
def http_response(self, request, response):
|
||||||
|
self.cookiejar.extract_cookies(response, request)
|
||||||
|
return response
|
||||||
|
|
||||||
|
https_request = http_request
|
||||||
|
https_response = http_response
|
||||||
|
|
||||||
|
try:
|
||||||
|
import robotparser
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
class MechanizeRobotFileParser(robotparser.RobotFileParser):
|
||||||
|
|
||||||
|
def __init__(self, url='', opener=None):
|
||||||
|
import _opener
|
||||||
|
robotparser.RobotFileParser.__init__(self, url)
|
||||||
|
self._opener = opener
|
||||||
|
|
||||||
|
def set_opener(self, opener=None):
|
||||||
|
if opener is None:
|
||||||
|
opener = _opener.OpenerDirector()
|
||||||
|
self._opener = opener
|
||||||
|
|
||||||
|
def read(self):
|
||||||
|
"""Reads the robots.txt URL and feeds it to the parser."""
|
||||||
|
if self._opener is None:
|
||||||
|
self.set_opener()
|
||||||
|
req = Request(self.url, unverifiable=True, visit=False)
|
||||||
|
try:
|
||||||
|
f = self._opener.open(req)
|
||||||
|
except HTTPError, f:
|
||||||
|
pass
|
||||||
|
except (IOError, socket.error, OSError), exc:
|
||||||
|
robotparser._debug("ignoring error opening %r: %s" %
|
||||||
|
(self.url, exc))
|
||||||
|
return
|
||||||
|
lines = []
|
||||||
|
line = f.readline()
|
||||||
|
while line:
|
||||||
|
lines.append(line.strip())
|
||||||
|
line = f.readline()
|
||||||
|
status = f.code
|
||||||
|
if status == 401 or status == 403:
|
||||||
|
self.disallow_all = True
|
||||||
|
robotparser._debug("disallow all")
|
||||||
|
elif status >= 400:
|
||||||
|
self.allow_all = True
|
||||||
|
robotparser._debug("allow all")
|
||||||
|
elif status == 200 and lines:
|
||||||
|
robotparser._debug("parse lines")
|
||||||
|
self.parse(lines)
|
||||||
|
|
||||||
|
class RobotExclusionError(urllib2.HTTPError):
|
||||||
|
def __init__(self, request, *args):
|
||||||
|
apply(urllib2.HTTPError.__init__, (self,)+args)
|
||||||
|
self.request = request
|
||||||
|
|
||||||
|
class HTTPRobotRulesProcessor(BaseHandler):
|
||||||
|
# before redirections, after everything else
|
||||||
|
handler_order = 800
|
||||||
|
|
||||||
|
try:
|
||||||
|
from httplib import HTTPMessage
|
||||||
|
except:
|
||||||
|
from mimetools import Message
|
||||||
|
http_response_class = Message
|
||||||
|
else:
|
||||||
|
http_response_class = HTTPMessage
|
||||||
|
|
||||||
|
def __init__(self, rfp_class=MechanizeRobotFileParser):
|
||||||
|
self.rfp_class = rfp_class
|
||||||
|
self.rfp = None
|
||||||
|
self._host = None
|
||||||
|
|
||||||
|
def http_request(self, request):
|
||||||
|
scheme = request.get_type()
|
||||||
|
if scheme not in ["http", "https"]:
|
||||||
|
# robots exclusion only applies to HTTP
|
||||||
|
return request
|
||||||
|
|
||||||
|
if request.get_selector() == "/robots.txt":
|
||||||
|
# /robots.txt is always OK to fetch
|
||||||
|
return request
|
||||||
|
|
||||||
|
host = request.get_host()
|
||||||
|
|
||||||
|
# robots.txt requests don't need to be allowed by robots.txt :-)
|
||||||
|
origin_req = getattr(request, "_origin_req", None)
|
||||||
|
if (origin_req is not None and
|
||||||
|
origin_req.get_selector() == "/robots.txt" and
|
||||||
|
origin_req.get_host() == host
|
||||||
|
):
|
||||||
|
return request
|
||||||
|
|
||||||
|
if host != self._host:
|
||||||
|
self.rfp = self.rfp_class()
|
||||||
|
try:
|
||||||
|
self.rfp.set_opener(self.parent)
|
||||||
|
except AttributeError:
|
||||||
|
debug("%r instance does not support set_opener" %
|
||||||
|
self.rfp.__class__)
|
||||||
|
self.rfp.set_url(scheme+"://"+host+"/robots.txt")
|
||||||
|
self.rfp.read()
|
||||||
|
self._host = host
|
||||||
|
|
||||||
|
ua = request.get_header("User-agent", "")
|
||||||
|
if self.rfp.can_fetch(ua, request.get_full_url()):
|
||||||
|
return request
|
||||||
|
else:
|
||||||
|
# XXX This should really have raised URLError. Too late now...
|
||||||
|
msg = "request disallowed by robots.txt"
|
||||||
|
raise RobotExclusionError(
|
||||||
|
request,
|
||||||
|
request.get_full_url(),
|
||||||
|
403, msg,
|
||||||
|
self.http_response_class(StringIO()), StringIO(msg))
|
||||||
|
|
||||||
|
https_request = http_request
|
||||||
|
|
||||||
|
class HTTPRefererProcessor(BaseHandler):
|
||||||
|
"""Add Referer header to requests.
|
||||||
|
|
||||||
|
This only makes sense if you use each RefererProcessor for a single
|
||||||
|
chain of requests only (so, for example, if you use a single
|
||||||
|
HTTPRefererProcessor to fetch a series of URLs extracted from a single
|
||||||
|
page, this will break).
|
||||||
|
|
||||||
|
There's a proper implementation of this in mechanize.Browser.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self):
|
||||||
|
self.referer = None
|
||||||
|
|
||||||
|
def http_request(self, request):
|
||||||
|
if ((self.referer is not None) and
|
||||||
|
not request.has_header("Referer")):
|
||||||
|
request.add_unredirected_header("Referer", self.referer)
|
||||||
|
return request
|
||||||
|
|
||||||
|
def http_response(self, request, response):
|
||||||
|
self.referer = response.geturl()
|
||||||
|
return response
|
||||||
|
|
||||||
|
https_request = http_request
|
||||||
|
https_response = http_response
|
||||||
|
|
||||||
|
|
||||||
|
def clean_refresh_url(url):
|
||||||
|
# e.g. Firefox 1.5 does (something like) this
|
||||||
|
if ((url.startswith('"') and url.endswith('"')) or
|
||||||
|
(url.startswith("'") and url.endswith("'"))):
|
||||||
|
url = url[1:-1]
|
||||||
|
return _rfc3986.clean_url(url, "latin-1") # XXX encoding
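
# Example (illustrative): Firefox-style quoted refresh URLs are unquoted
# before cleaning, e.g.
#
#     clean_refresh_url('"http://example.com/"')   # -> 'http://example.com/'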
|
||||||
|
|
||||||
|
def parse_refresh_header(refresh):
|
||||||
|
"""
|
||||||
|
>>> parse_refresh_header("1; url=http://example.com/")
|
||||||
|
(1.0, 'http://example.com/')
|
||||||
|
>>> parse_refresh_header("1; url='http://example.com/'")
|
||||||
|
(1.0, 'http://example.com/')
|
||||||
|
>>> parse_refresh_header("1")
|
||||||
|
(1.0, None)
|
||||||
|
>>> parse_refresh_header("blah")
|
||||||
|
Traceback (most recent call last):
|
||||||
|
ValueError: invalid literal for float(): blah
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
ii = refresh.find(";")
|
||||||
|
if ii != -1:
|
||||||
|
pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
|
||||||
|
jj = newurl_spec.find("=")
|
||||||
|
key = None
|
||||||
|
if jj != -1:
|
||||||
|
key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
|
||||||
|
newurl = clean_refresh_url(newurl)
|
||||||
|
if key is None or key.strip().lower() != "url":
|
||||||
|
raise ValueError()
|
||||||
|
else:
|
||||||
|
pause, newurl = float(refresh), None
|
||||||
|
return pause, newurl
|
||||||
|
|
||||||
|
class HTTPRefreshProcessor(BaseHandler):
|
||||||
|
"""Perform HTTP Refresh redirections.
|
||||||
|
|
||||||
|
Note that if a non-200 HTTP code has occurred (for example, a 30x
|
||||||
|
redirect), this processor will do nothing.
|
||||||
|
|
||||||
|
By default, only zero-time Refresh headers are redirected. Use the
|
||||||
|
max_time attribute / constructor argument to allow Refresh with longer
|
||||||
|
pauses. Use the honor_time attribute / constructor argument to control
|
||||||
|
whether the requested pause is honoured (with a time.sleep()) or
|
||||||
|
skipped in favour of immediate redirection.
|
||||||
|
|
||||||
|
Public attributes:
|
||||||
|
|
||||||
|
max_time: see above
|
||||||
|
honor_time: see above
|
||||||
|
|
||||||
|
"""
|
||||||
|
handler_order = 1000
|
||||||
|
|
||||||
|
def __init__(self, max_time=0, honor_time=True):
|
||||||
|
self.max_time = max_time
|
||||||
|
self.honor_time = honor_time
|
||||||
|
|
||||||
|
def http_response(self, request, response):
|
||||||
|
code, msg, hdrs = response.code, response.msg, response.info()
|
||||||
|
|
||||||
|
if code == 200 and hdrs.has_key("refresh"):
|
||||||
|
refresh = hdrs.getheaders("refresh")[0]
|
||||||
|
try:
|
||||||
|
pause, newurl = parse_refresh_header(refresh)
|
||||||
|
except ValueError:
|
||||||
|
debug("bad Refresh header: %r" % refresh)
|
||||||
|
return response
|
||||||
|
if newurl is None:
|
||||||
|
newurl = response.geturl()
|
||||||
|
if (self.max_time is None) or (pause <= self.max_time):
|
||||||
|
if pause > 1E-3 and self.honor_time:
|
||||||
|
time.sleep(pause)
|
||||||
|
hdrs["location"] = newurl
|
||||||
|
# hardcoded http is NOT a bug
|
||||||
|
response = self.parent.error(
|
||||||
|
"http", request, response,
|
||||||
|
"refresh", msg, hdrs)
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
https_response = http_response
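
# Sketch (illustrative, not upstream code): to follow every Refresh header
# immediately, regardless of the requested pause, the processor might be
# constructed as
#
#     HTTPRefreshProcessor(max_time=None, honor_time=False)
#
# max_time=None disables the pause limit and honor_time=False skips the
# time.sleep() call.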
|
||||||
|
|
||||||
|
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses.

    The purpose of this handler is to allow other response processors a
    look-in by removing the call to parent.error() from
    AbstractHTTPHandler.

    For non-200 error codes, this just passes the job on to the
    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
    HTTPError if no other handler handles the error.

    """
|
||||||
|
handler_order = 1000 # after all other processors
|
||||||
|
|
||||||
|
def http_response(self, request, response):
|
||||||
|
code, msg, hdrs = response.code, response.msg, response.info()
|
||||||
|
|
||||||
|
if code != 200:
|
||||||
|
# hardcoded http is NOT a bug
|
||||||
|
response = self.parent.error(
|
||||||
|
"http", request, response, code, msg, hdrs)
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
https_response = http_response
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPDefaultErrorHandler(BaseHandler):
|
||||||
|
def http_error_default(self, req, fp, code, msg, hdrs):
|
||||||
|
# why these error methods took the code, msg, headers args in the first
|
||||||
|
# place rather than a response object, I don't know, but to avoid
|
||||||
|
# multiple wrapping, we're discarding them
|
||||||
|
|
||||||
|
if isinstance(fp, urllib2.HTTPError):
|
||||||
|
response = fp
|
||||||
|
else:
|
||||||
|
response = urllib2.HTTPError(
|
||||||
|
req.get_full_url(), code, msg, hdrs, fp)
|
||||||
|
assert code == response.code
|
||||||
|
assert msg == response.msg
|
||||||
|
assert hdrs == response.hdrs
|
||||||
|
raise response
|
||||||
|
|
||||||
|
|
||||||
|
class AbstractHTTPHandler(BaseHandler):
|
||||||
|
|
||||||
|
def __init__(self, debuglevel=0):
|
||||||
|
self._debuglevel = debuglevel
|
||||||
|
|
||||||
|
def set_http_debuglevel(self, level):
|
||||||
|
self._debuglevel = level
|
||||||
|
|
||||||
|
def do_request_(self, request):
|
||||||
|
host = request.get_host()
|
||||||
|
if not host:
|
||||||
|
raise URLError('no host given')
|
||||||
|
|
||||||
|
if request.has_data(): # POST
|
||||||
|
data = request.get_data()
|
||||||
|
if not request.has_header('Content-type'):
|
||||||
|
request.add_unredirected_header(
|
||||||
|
'Content-type',
|
||||||
|
'application/x-www-form-urlencoded')
|
||||||
|
|
||||||
|
scheme, sel = urllib.splittype(request.get_selector())
|
||||||
|
sel_host, sel_path = urllib.splithost(sel)
|
||||||
|
if not request.has_header('Host'):
|
||||||
|
request.add_unredirected_header('Host', sel_host or host)
|
||||||
|
for name, value in self.parent.addheaders:
|
||||||
|
name = name.capitalize()
|
||||||
|
if not request.has_header(name):
|
||||||
|
request.add_unredirected_header(name, value)
|
||||||
|
|
||||||
|
return request
|
||||||
|
|
||||||
|
def do_open(self, http_class, req):
|
||||||
|
"""Return an addinfourl object for the request, using http_class.
|
||||||
|
|
||||||
|
http_class must implement the HTTPConnection API from httplib.
|
||||||
|
The addinfourl return value is a file-like object. It also
|
||||||
|
has methods and attributes including:
|
||||||
|
- info(): return a mimetools.Message object for the headers
|
||||||
|
- geturl(): return the original request URL
|
||||||
|
- code: HTTP status code
|
||||||
|
"""
|
||||||
|
host = req.get_host()
|
||||||
|
if not host:
|
||||||
|
raise URLError('no host given')
|
||||||
|
|
||||||
|
h = http_class(host) # will parse host:port
|
||||||
|
h.set_debuglevel(self._debuglevel)
|
||||||
|
|
||||||
|
headers = dict(req.headers)
|
||||||
|
headers.update(req.unredirected_hdrs)
|
||||||
|
# We want to make an HTTP/1.1 request, but the addinfourl
|
||||||
|
# class isn't prepared to deal with a persistent connection.
|
||||||
|
# It will try to read all remaining data from the socket,
|
||||||
|
# which will block while the server waits for the next request.
|
||||||
|
# So make sure the connection gets closed after the (only)
|
||||||
|
# request.
|
||||||
|
headers["Connection"] = "close"
|
||||||
|
headers = dict(
|
||||||
|
[(name.title(), val) for name, val in headers.items()])
|
||||||
|
try:
|
||||||
|
h.request(req.get_method(), req.get_selector(), req.data, headers)
|
||||||
|
r = h.getresponse()
|
||||||
|
except socket.error, err: # XXX what error?
|
||||||
|
raise URLError(err)
|
||||||
|
|
||||||
|
# Pick apart the HTTPResponse object to get the addinfourl
|
||||||
|
# object initialized properly.
|
||||||
|
|
||||||
|
# Wrap the HTTPResponse object in socket's file object adapter
|
||||||
|
# for Windows. That adapter calls recv(), so delegate recv()
|
||||||
|
# to read(). This weird wrapping allows the returned object to
|
||||||
|
# have readline() and readlines() methods.
|
||||||
|
|
||||||
|
# XXX It might be better to extract the read buffering code
|
||||||
|
# out of socket._fileobject() and into a base class.
|
||||||
|
|
||||||
|
r.recv = r.read
|
||||||
|
fp = socket._fileobject(r)
|
||||||
|
|
||||||
|
resp = closeable_response(fp, r.msg, req.get_full_url(),
|
||||||
|
r.status, r.reason)
|
||||||
|
return resp
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPHandler(AbstractHTTPHandler):
|
||||||
|
def http_open(self, req):
|
||||||
|
return self.do_open(httplib.HTTPConnection, req)
|
||||||
|
|
||||||
|
http_request = AbstractHTTPHandler.do_request_
|
||||||
|
|
||||||
|
if hasattr(httplib, 'HTTPS'):
|
||||||
|
|
||||||
|
class HTTPSConnectionFactory:
|
||||||
|
def __init__(self, key_file, cert_file):
|
||||||
|
self._key_file = key_file
|
||||||
|
self._cert_file = cert_file
|
||||||
|
def __call__(self, hostport):
|
||||||
|
return httplib.HTTPSConnection(
|
||||||
|
hostport,
|
||||||
|
key_file=self._key_file, cert_file=self._cert_file)
|
||||||
|
|
||||||
|
class HTTPSHandler(AbstractHTTPHandler):
|
||||||
|
def __init__(self, client_cert_manager=None):
|
||||||
|
AbstractHTTPHandler.__init__(self)
|
||||||
|
self.client_cert_manager = client_cert_manager
|
||||||
|
|
||||||
|
def https_open(self, req):
|
||||||
|
if self.client_cert_manager is not None:
|
||||||
|
key_file, cert_file = self.client_cert_manager.find_key_cert(
|
||||||
|
req.get_full_url())
|
||||||
|
conn_factory = HTTPSConnectionFactory(key_file, cert_file)
|
||||||
|
else:
|
||||||
|
conn_factory = httplib.HTTPSConnection
|
||||||
|
return self.do_open(conn_factory, req)
|
||||||
|
|
||||||
|
https_request = AbstractHTTPHandler.do_request_
|
185
src/calibre/utils/mechanize/_lwpcookiejar.py
Normal file
@ -0,0 +1,185 @@
"""Load / save to libwww-perl (LWP) format files.
|
||||||
|
|
||||||
|
Actually, the format is slightly extended from that used by LWP's
|
||||||
|
(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information
|
||||||
|
not recorded by LWP.
|
||||||
|
|
||||||
|
It uses the version string "2.0", though really there isn't an LWP Cookies
|
||||||
|
2.0 format. This indicates that there is extra information in here
|
||||||
|
(domain_dot and port_spec) while still being compatible with libwww-perl,
|
||||||
|
I hope.
|
||||||
|
|
||||||
|
Copyright 2002-2006 John J Lee <jjl@pobox.com>
|
||||||
|
Copyright 1997-1999 Gisle Aas (original libwww-perl code)
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses (see the file
|
||||||
|
COPYING.txt included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time, re, logging
|
||||||
|
|
||||||
|
from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
|
||||||
|
MISSING_FILENAME_TEXT, LoadError
|
||||||
|
from _headersutil import join_header_words, split_header_words
|
||||||
|
from _util import iso2time, time2isoz
|
||||||
|
|
||||||
|
debug = logging.getLogger("mechanize").debug
|
||||||
|
|
||||||
|
|
||||||
|
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    """
|
||||||
|
h = [(cookie.name, cookie.value),
|
||||||
|
("path", cookie.path),
|
||||||
|
("domain", cookie.domain)]
|
||||||
|
if cookie.port is not None: h.append(("port", cookie.port))
|
||||||
|
if cookie.path_specified: h.append(("path_spec", None))
|
||||||
|
if cookie.port_specified: h.append(("port_spec", None))
|
||||||
|
if cookie.domain_initial_dot: h.append(("domain_dot", None))
|
||||||
|
if cookie.secure: h.append(("secure", None))
|
||||||
|
if cookie.expires: h.append(("expires",
|
||||||
|
time2isoz(float(cookie.expires))))
|
||||||
|
if cookie.discard: h.append(("discard", None))
|
||||||
|
if cookie.comment: h.append(("comment", cookie.comment))
|
||||||
|
if cookie.comment_url: h.append(("commenturl", cookie.comment_url))
|
||||||
|
if cookie.rfc2109: h.append(("rfc2109", None))
|
||||||
|
|
||||||
|
keys = cookie.nonstandard_attr_keys()
|
||||||
|
keys.sort()
|
||||||
|
for k in keys:
|
||||||
|
h.append((k, str(cookie.get_nonstandard_attr(k))))
|
||||||
|
|
||||||
|
h.append(("version", str(cookie.version)))
|
||||||
|
|
||||||
|
return join_header_words([h])
|
||||||
|
|
||||||
|
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """
|
||||||
|
|
||||||
|
magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
|
||||||
|
|
||||||
|
def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
|
||||||
|
"""Return cookies as a string of "\n"-separated "Set-Cookie3" headers.
|
||||||
|
|
||||||
|
ignore_discard and ignore_expires: see docstring for FileCookieJar.save
|
||||||
|
|
||||||
|
"""
|
||||||
|
now = time.time()
|
||||||
|
r = []
|
||||||
|
for cookie in self:
|
||||||
|
if not ignore_discard and cookie.discard:
|
||||||
|
debug(" Not saving %s: marked for discard", cookie.name)
|
||||||
|
continue
|
||||||
|
if not ignore_expires and cookie.is_expired(now):
|
||||||
|
debug(" Not saving %s: expired", cookie.name)
|
||||||
|
continue
|
||||||
|
r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
|
||||||
|
return "\n".join(r+[""])
|
||||||
|
|
||||||
|
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
|
||||||
|
if filename is None:
|
||||||
|
if self.filename is not None: filename = self.filename
|
||||||
|
else: raise ValueError(MISSING_FILENAME_TEXT)
|
||||||
|
|
||||||
|
f = open(filename, "w")
|
||||||
|
try:
|
||||||
|
debug("Saving LWP cookies file")
|
||||||
|
# There really isn't an LWP Cookies 2.0 format, but this indicates
|
||||||
|
# that there is extra information in here (domain_dot and
|
||||||
|
# port_spec) while still being compatible with libwww-perl, I hope.
|
||||||
|
f.write("#LWP-Cookies-2.0\n")
|
||||||
|
f.write(self.as_lwp_str(ignore_discard, ignore_expires))
|
||||||
|
finally:
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
def _really_load(self, f, filename, ignore_discard, ignore_expires):
|
||||||
|
magic = f.readline()
|
||||||
|
if not re.search(self.magic_re, magic):
|
||||||
|
msg = "%s does not seem to contain cookies" % filename
|
||||||
|
raise LoadError(msg)
|
||||||
|
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
header = "Set-Cookie3:"
|
||||||
|
boolean_attrs = ("port_spec", "path_spec", "domain_dot",
|
||||||
|
"secure", "discard", "rfc2109")
|
||||||
|
value_attrs = ("version",
|
||||||
|
"port", "path", "domain",
|
||||||
|
"expires",
|
||||||
|
"comment", "commenturl")
|
||||||
|
|
||||||
|
try:
|
||||||
|
while 1:
|
||||||
|
line = f.readline()
|
||||||
|
if line == "": break
|
||||||
|
if not line.startswith(header):
|
||||||
|
continue
|
||||||
|
line = line[len(header):].strip()
|
||||||
|
|
||||||
|
for data in split_header_words([line]):
|
||||||
|
name, value = data[0]
|
||||||
|
standard = {}
|
||||||
|
rest = {}
|
||||||
|
for k in boolean_attrs:
|
||||||
|
standard[k] = False
|
||||||
|
for k, v in data[1:]:
|
||||||
|
if k is not None:
|
||||||
|
lc = k.lower()
|
||||||
|
else:
|
||||||
|
lc = None
|
||||||
|
# don't lose case distinction for unknown fields
|
||||||
|
if (lc in value_attrs) or (lc in boolean_attrs):
|
||||||
|
k = lc
|
||||||
|
if k in boolean_attrs:
|
||||||
|
if v is None: v = True
|
||||||
|
standard[k] = v
|
||||||
|
elif k in value_attrs:
|
||||||
|
standard[k] = v
|
||||||
|
else:
|
||||||
|
rest[k] = v
|
||||||
|
|
||||||
|
h = standard.get
|
||||||
|
expires = h("expires")
|
||||||
|
discard = h("discard")
|
||||||
|
if expires is not None:
|
||||||
|
expires = iso2time(expires)
|
||||||
|
if expires is None:
|
||||||
|
discard = True
|
||||||
|
domain = h("domain")
|
||||||
|
domain_specified = domain.startswith(".")
|
||||||
|
c = Cookie(h("version"), name, value,
|
||||||
|
h("port"), h("port_spec"),
|
||||||
|
domain, domain_specified, h("domain_dot"),
|
||||||
|
h("path"), h("path_spec"),
|
||||||
|
h("secure"),
|
||||||
|
expires,
|
||||||
|
discard,
|
||||||
|
h("comment"),
|
||||||
|
h("commenturl"),
|
||||||
|
rest,
|
||||||
|
h("rfc2109"),
|
||||||
|
)
|
||||||
|
if not ignore_discard and c.discard:
|
||||||
|
continue
|
||||||
|
if not ignore_expires and c.is_expired(now):
|
||||||
|
continue
|
||||||
|
self.set_cookie(c)
|
||||||
|
except:
|
||||||
|
reraise_unmasked_exceptions((IOError,))
|
||||||
|
raise LoadError("invalid Set-Cookie3 format file %s" % filename)
|
||||||
|
|
656
src/calibre/utils/mechanize/_mechanize.py
Normal file
@ -0,0 +1,656 @@
"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize.
|
||||||
|
|
||||||
|
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
||||||
|
Copyright 2003 Andy Lester (original Perl code)
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
||||||
|
included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import urllib2, sys, copy, re
|
||||||
|
|
||||||
|
from _useragent import UserAgentBase
|
||||||
|
from _html import DefaultFactory
|
||||||
|
import _response
|
||||||
|
import _request
|
||||||
|
import _rfc3986
|
||||||
|
|
||||||
|
__version__ = (0, 1, 7, "b", None) # 0.1.7b
|
||||||
|
|
||||||
|
class BrowserStateError(Exception): pass
|
||||||
|
class LinkNotFoundError(Exception): pass
|
||||||
|
class FormNotFoundError(Exception): pass
|
||||||
|
|
||||||
|
|
||||||
|
class History:
|
||||||
|
"""
|
||||||
|
|
||||||
|
Though this will become public, the implied interface is not yet stable.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self):
|
||||||
|
self._history = [] # LIFO
|
||||||
|
def add(self, request, response):
|
||||||
|
self._history.append((request, response))
|
||||||
|
def back(self, n, _response):
|
||||||
|
response = _response # XXX move Browser._response into this class?
|
||||||
|
while n > 0 or response is None:
|
||||||
|
try:
|
||||||
|
request, response = self._history.pop()
|
||||||
|
except IndexError:
|
||||||
|
raise BrowserStateError("already at start of history")
|
||||||
|
n -= 1
|
||||||
|
return request, response
|
||||||
|
def clear(self):
|
||||||
|
del self._history[:]
|
||||||
|
def close(self):
|
||||||
|
for request, response in self._history:
|
||||||
|
if response is not None:
|
||||||
|
response.close()
|
||||||
|
del self._history[:]
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPRefererProcessor(urllib2.BaseHandler):
|
||||||
|
def http_request(self, request):
|
||||||
|
# See RFC 2616 14.36. The only times we know the source of the
|
||||||
|
# request URI has a URI associated with it are redirect, and
|
||||||
|
# Browser.click() / Browser.submit() / Browser.follow_link().
|
||||||
|
# Otherwise, it's the user's job to add any Referer header before
|
||||||
|
# .open()ing.
|
||||||
|
if hasattr(request, "redirect_dict"):
|
||||||
|
request = self.parent._add_referer_header(
|
||||||
|
request, origin_request=False)
|
||||||
|
return request
|
||||||
|
|
||||||
|
https_request = http_request
|
||||||
|
|
||||||
|
|
||||||
|
class Browser(UserAgentBase):
|
||||||
|
"""Browser-like class with support for history, forms and links.
|
||||||
|
|
||||||
|
    BrowserStateError is raised whenever the browser is in the wrong state to
    complete the requested operation -- e.g., when .back() is called when the
    browser history is empty, or when .follow_link() is called when the
    current response does not contain HTML data.
|
||||||
|
|
||||||
|
Public attributes:
|
||||||
|
|
||||||
|
request: current request (mechanize.Request or urllib2.Request)
|
||||||
|
form: currently selected form (see .select_form())
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
handler_classes = copy.copy(UserAgentBase.handler_classes)
|
||||||
|
handler_classes["_referer"] = HTTPRefererProcessor
|
||||||
|
default_features = copy.copy(UserAgentBase.default_features)
|
||||||
|
default_features.append("_referer")
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
factory=None,
|
||||||
|
history=None,
|
||||||
|
request_class=None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Only named arguments should be passed to this constructor.
|
||||||
|
|
||||||
|
factory: object implementing the mechanize.Factory interface.
|
||||||
|
history: object implementing the mechanize.History interface. Note
|
||||||
|
this interface is still experimental and may change in future.
|
||||||
|
        request_class: Request class to use.  Defaults to mechanize.Request
            for Pythons older than 2.4, urllib2.Request otherwise.
|
||||||
|
|
||||||
|
The Factory and History objects passed in are 'owned' by the Browser,
|
||||||
|
so they should not be shared across Browsers. In particular,
|
||||||
|
factory.set_response() should not be called except by the owning
|
||||||
|
Browser itself.
|
||||||
|
|
||||||
|
Note that the supplied factory's request_class is overridden by this
|
||||||
|
constructor, to ensure only one Request class is used.
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._handle_referer = True
|
||||||
|
|
||||||
|
if history is None:
|
||||||
|
history = History()
|
||||||
|
self._history = history
|
||||||
|
|
||||||
|
if request_class is None:
|
||||||
|
if not hasattr(urllib2.Request, "add_unredirected_header"):
|
||||||
|
request_class = _request.Request
|
||||||
|
else:
|
||||||
|
request_class = urllib2.Request # Python >= 2.4
|
||||||
|
|
||||||
|
if factory is None:
|
||||||
|
factory = DefaultFactory()
|
||||||
|
factory.set_request_class(request_class)
|
||||||
|
self._factory = factory
|
||||||
|
self.request_class = request_class
|
||||||
|
|
||||||
|
self.request = None
|
||||||
|
self._set_response(None, False)
|
||||||
|
|
||||||
|
# do this last to avoid __getattr__ problems
|
||||||
|
UserAgentBase.__init__(self)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
UserAgentBase.close(self)
|
||||||
|
if self._response is not None:
|
||||||
|
self._response.close()
|
||||||
|
if self._history is not None:
|
||||||
|
self._history.close()
|
||||||
|
self._history = None
|
||||||
|
|
||||||
|
# make use after .close easy to spot
|
||||||
|
self.form = None
|
||||||
|
self.request = self._response = None
|
||||||
|
self.request = self.response = self.set_response = None
|
||||||
|
self.geturl = self.reload = self.back = None
|
||||||
|
self.clear_history = self.set_cookie = self.links = self.forms = None
|
||||||
|
self.viewing_html = self.encoding = self.title = None
|
||||||
|
self.select_form = self.click = self.submit = self.click_link = None
|
||||||
|
self.follow_link = self.find_link = None
|
||||||
|
|
||||||
|
def set_handle_referer(self, handle):
|
||||||
|
"""Set whether to add Referer header to each request.
|
||||||
|
|
||||||
|
This base class does not implement this feature (so don't turn this on
|
||||||
|
if you're using this base class directly), but the subclass
|
||||||
|
mechanize.Browser does.
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._set_handler("_referer", handle)
|
||||||
|
self._handle_referer = bool(handle)
|
||||||
|
|
||||||
|
def _add_referer_header(self, request, origin_request=True):
|
||||||
|
if self.request is None:
|
||||||
|
return request
|
||||||
|
scheme = request.get_type()
|
||||||
|
original_scheme = self.request.get_type()
|
||||||
|
if scheme not in ["http", "https"]:
|
||||||
|
return request
|
||||||
|
if not origin_request and not self.request.has_header("Referer"):
|
||||||
|
return request
|
||||||
|
|
||||||
|
if (self._handle_referer and
|
||||||
|
original_scheme in ["http", "https"] and
|
||||||
|
not (original_scheme == "https" and scheme != "https")):
|
||||||
|
# strip URL fragment (RFC 2616 14.36)
|
||||||
|
parts = _rfc3986.urlsplit(self.request.get_full_url())
|
||||||
|
parts = parts[:-1]+(None,)
|
||||||
|
referer = _rfc3986.urlunsplit(parts)
|
||||||
|
request.add_unredirected_header("Referer", referer)
|
||||||
|
return request
|
||||||
|
|
||||||
|
def open_novisit(self, url, data=None):
|
||||||
|
"""Open a URL without visiting it.
|
||||||
|
|
||||||
|
        The browser state (including .request, .response(), history, forms and
        links) is left unchanged by calling this function.
|
||||||
|
|
||||||
|
The interface is the same as for .open().
|
||||||
|
|
||||||
|
This is useful for things like fetching images.
|
||||||
|
|
||||||
|
See also .retrieve().
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self._mech_open(url, data, visit=False)
|
||||||
|
|
||||||
|
def open(self, url, data=None):
|
||||||
|
return self._mech_open(url, data)
|
||||||
|
|
||||||
|
def _mech_open(self, url, data=None, update_history=True, visit=None):
|
||||||
|
try:
|
||||||
|
url.get_full_url
|
||||||
|
except AttributeError:
|
||||||
|
# string URL -- convert to absolute URL if required
|
||||||
|
scheme, authority = _rfc3986.urlsplit(url)[:2]
|
||||||
|
if scheme is None:
|
||||||
|
# relative URL
|
||||||
|
if self._response is None:
|
||||||
|
raise BrowserStateError(
|
||||||
|
"can't fetch relative reference: "
|
||||||
|
"not viewing any document")
|
||||||
|
url = _rfc3986.urljoin(self._response.geturl(), url)
|
||||||
|
|
||||||
|
request = self._request(url, data, visit)
|
||||||
|
visit = request.visit
|
||||||
|
if visit is None:
|
||||||
|
visit = True
|
||||||
|
|
||||||
|
if visit:
|
||||||
|
self._visit_request(request, update_history)
|
||||||
|
|
||||||
|
success = True
|
||||||
|
try:
|
||||||
|
response = UserAgentBase.open(self, request, data)
|
||||||
|
except urllib2.HTTPError, error:
|
||||||
|
success = False
|
||||||
|
if error.fp is None: # not a response
|
||||||
|
raise
|
||||||
|
response = error
|
||||||
|
## except (IOError, socket.error, OSError), error:
|
||||||
|
## # Yes, urllib2 really does raise all these :-((
|
||||||
|
## # See test_urllib2.py for examples of socket.gaierror and OSError,
|
||||||
|
## # plus note that FTPHandler raises IOError.
|
||||||
|
## # XXX I don't seem to have an example of exactly socket.error being
|
||||||
|
## # raised, only socket.gaierror...
|
||||||
|
## # I don't want to start fixing these here, though, since this is a
|
||||||
|
## # subclass of OpenerDirector, and it would break old code. Even in
|
||||||
|
## # Python core, a fix would need some backwards-compat. hack to be
|
||||||
|
## # acceptable.
|
||||||
|
## raise
|
||||||
|
|
||||||
|
if visit:
|
||||||
|
self._set_response(response, False)
|
||||||
|
response = copy.copy(self._response)
|
||||||
|
elif response is not None:
|
||||||
|
response = _response.upgrade_response(response)
|
||||||
|
|
||||||
|
if not success:
|
||||||
|
raise response
|
||||||
|
return response
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
text = []
|
||||||
|
text.append("<%s " % self.__class__.__name__)
|
||||||
|
if self._response:
|
||||||
|
text.append("visiting %s" % self._response.geturl())
|
||||||
|
else:
|
||||||
|
text.append("(not visiting a URL)")
|
||||||
|
if self.form:
|
||||||
|
text.append("\n selected form:\n %s\n" % str(self.form))
|
||||||
|
text.append(">")
|
||||||
|
return "".join(text)
|
||||||
|
|
||||||
|
def response(self):
|
||||||
|
"""Return a copy of the current response.
|
||||||
|
|
||||||
|
The returned object has the same interface as the object returned by
|
||||||
|
.open() (or urllib2.urlopen()).
|
||||||
|
|
||||||
|
"""
|
||||||
|
return copy.copy(self._response)
|
||||||
|
|
||||||
|
def set_response(self, response):
|
||||||
|
"""Replace current response with (a copy of) response.
|
||||||
|
|
||||||
|
response may be None.
|
||||||
|
|
||||||
|
This is intended mostly for HTML-preprocessing.
|
||||||
|
"""
|
||||||
|
self._set_response(response, True)
|
||||||
|
|
||||||
|
def _set_response(self, response, close_current):
|
||||||
|
# sanity check, necessary but far from sufficient
|
||||||
|
if not (response is None or
|
||||||
|
(hasattr(response, "info") and hasattr(response, "geturl") and
|
||||||
|
hasattr(response, "read")
|
||||||
|
)
|
||||||
|
):
|
||||||
|
raise ValueError("not a response object")
|
||||||
|
|
||||||
|
self.form = None
|
||||||
|
if response is not None:
|
||||||
|
response = _response.upgrade_response(response)
|
||||||
|
if close_current and self._response is not None:
|
||||||
|
self._response.close()
|
||||||
|
self._response = response
|
||||||
|
self._factory.set_response(response)
|
||||||
|
|
||||||
|
def visit_response(self, response, request=None):
|
||||||
|
"""Visit the response, as if it had been .open()ed.
|
||||||
|
|
||||||
|
Unlike .set_response(), this updates history rather than replacing the
|
||||||
|
current response.
|
||||||
|
"""
|
||||||
|
if request is None:
|
||||||
|
request = _request.Request(response.geturl())
|
||||||
|
self._visit_request(request, True)
|
||||||
|
self._set_response(response, False)
|
||||||
|
|
||||||
|
def _visit_request(self, request, update_history):
|
||||||
|
if self._response is not None:
|
||||||
|
self._response.close()
|
||||||
|
if self.request is not None and update_history:
|
||||||
|
self._history.add(self.request, self._response)
|
||||||
|
self._response = None
|
||||||
|
# we want self.request to be assigned even if UserAgentBase.open
|
||||||
|
# fails
|
||||||
|
self.request = request
|
||||||
|
|
||||||
|
def geturl(self):
|
||||||
|
"""Get URL of current document."""
|
||||||
|
if self._response is None:
|
||||||
|
raise BrowserStateError("not viewing any document")
|
||||||
|
return self._response.geturl()
|
||||||
|
|
||||||
|
def reload(self):
|
||||||
|
"""Reload current document, and return response object."""
|
||||||
|
if self.request is None:
|
||||||
|
raise BrowserStateError("no URL has yet been .open()ed")
|
||||||
|
if self._response is not None:
|
||||||
|
self._response.close()
|
||||||
|
return self._mech_open(self.request, update_history=False)
|
||||||
|
|
||||||
|
def back(self, n=1):
|
||||||
|
"""Go back n steps in history, and return response object.
|
||||||
|
|
||||||
|
n: go back this number of steps (default 1 step)
|
||||||
|
|
||||||
|
"""
|
||||||
|
if self._response is not None:
|
||||||
|
self._response.close()
|
||||||
|
self.request, response = self._history.back(n, self._response)
|
||||||
|
self.set_response(response)
|
||||||
|
if not response.read_complete:
|
||||||
|
return self.reload()
|
||||||
|
return copy.copy(response)
|
||||||
|
|
||||||
|
def clear_history(self):
|
||||||
|
self._history.clear()
|
||||||
|
|
||||||
|
def set_cookie(self, cookie_string):
|
||||||
|
"""Request to set a cookie.
|
||||||
|
|
||||||
|
Note that it is NOT necessary to call this method under ordinary
|
||||||
|
circumstances: cookie handling is normally entirely automatic. The
|
||||||
|
intended use case is rather to simulate the setting of a cookie by
|
||||||
|
client script in a web page (e.g. JavaScript). In that case, use of
|
||||||
|
this method is necessary because mechanize currently does not support
|
||||||
|
JavaScript, VBScript, etc.
|
||||||
|
|
||||||
|
        The cookie is added in the same way as if it had arrived with the
        current response, as a result of the current request.  This means
        that, for example, if it is not appropriate to set the cookie based
        on the current request, no cookie will be set.
|
||||||
|
|
||||||
|
The cookie will be returned automatically with subsequent responses
|
||||||
|
made by the Browser instance whenever that's appropriate.
|
||||||
|
|
||||||
|
cookie_string should be a valid value of the Set-Cookie header.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
browser.set_cookie(
|
||||||
|
"sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")
|
||||||
|
|
||||||
|
Currently, this method does not allow for adding RFC 2986 cookies.
|
||||||
|
This limitation will be lifted if anybody requests it.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if self._response is None:
|
||||||
|
raise BrowserStateError("not viewing any document")
|
||||||
|
if self.request.get_type() not in ["http", "https"]:
|
||||||
|
raise BrowserStateError("can't set cookie for non-HTTP/HTTPS "
|
||||||
|
"transactions")
|
||||||
|
cookiejar = self._ua_handlers["_cookies"].cookiejar
|
||||||
|
response = self.response() # copy
|
||||||
|
headers = response.info()
|
||||||
|
headers["Set-cookie"] = cookie_string
|
||||||
|
cookiejar.extract_cookies(response, self.request)
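
A minimal usage sketch (illustrative only; the URLs and cookie value are placeholders, and the package is assumed to be importable as mechanize). set_cookie() is only needed to emulate a cookie that client-side script would otherwise have set:

    br = mechanize.Browser()
    br.open("http://example.com/login")           # placeholder URL
    br.set_cookie("sid=abcdef; path=/")           # added as if it came with this response
    response = br.open("http://example.com/app")  # cookie is now returned automatically
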
|
||||||
|
|
||||||
|
def links(self, **kwds):
|
||||||
|
"""Return iterable over links (mechanize.Link objects)."""
|
||||||
|
if not self.viewing_html():
|
||||||
|
raise BrowserStateError("not viewing HTML")
|
||||||
|
links = self._factory.links()
|
||||||
|
if kwds:
|
||||||
|
return self._filter_links(links, **kwds)
|
||||||
|
else:
|
||||||
|
return links
|
||||||
|
|
||||||
|
def forms(self):
|
||||||
|
"""Return iterable over forms.
|
||||||
|
|
||||||
|
The returned form objects implement the ClientForm.HTMLForm interface.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not self.viewing_html():
|
||||||
|
raise BrowserStateError("not viewing HTML")
|
||||||
|
return self._factory.forms()
|
||||||
|
|
||||||
|
def global_form(self):
|
||||||
|
"""Return the global form object, or None if the factory implementation
|
||||||
|
did not supply one.
|
||||||
|
|
||||||
|
The "global" form object contains all controls that are not descendants of
|
||||||
|
any FORM element.
|
||||||
|
|
||||||
|
The returned form object implements the ClientForm.HTMLForm interface.
|
||||||
|
|
||||||
|
This is a separate method since the global form is not regarded as part
|
||||||
|
of the sequence of forms in the document -- mostly for
|
||||||
|
backwards-compatibility.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not self.viewing_html():
|
||||||
|
raise BrowserStateError("not viewing HTML")
|
||||||
|
return self._factory.global_form
|
||||||
|
|
||||||
|
def viewing_html(self):
|
||||||
|
"""Return whether the current response contains HTML data."""
|
||||||
|
if self._response is None:
|
||||||
|
raise BrowserStateError("not viewing any document")
|
||||||
|
return self._factory.is_html
|
||||||
|
|
||||||
|
def encoding(self):
|
||||||
|
""""""
|
||||||
|
if self._response is None:
|
||||||
|
raise BrowserStateError("not viewing any document")
|
||||||
|
return self._factory.encoding
|
||||||
|
|
||||||
|
def title(self):
|
||||||
|
"""Return title, or None if there is no title element in the document.
|
||||||
|
|
||||||
|
Tags are stripped or textified as described in docs for
|
||||||
|
PullParser.get_text() method of pullparser module.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not self.viewing_html():
|
||||||
|
raise BrowserStateError("not viewing HTML")
|
||||||
|
return self._factory.title
|
||||||
|
|
||||||
|
def select_form(self, name=None, predicate=None, nr=None):
|
||||||
|
"""Select an HTML form for input.
|
||||||
|
|
||||||
|
This is a bit like giving a form the "input focus" in a browser.
|
||||||
|
|
||||||
|
If a form is selected, the Browser object supports the HTMLForm
|
||||||
|
interface, so you can call methods like .set_value(), .set(), and
|
||||||
|
.click().
|
||||||
|
|
||||||
|
Another way to select a form is to assign to the .form attribute. The
|
||||||
|
form assigned should be one of the objects returned by the .forms()
|
||||||
|
method.
|
||||||
|
|
||||||
|
At least one of the name, predicate and nr arguments must be supplied.
|
||||||
|
If no matching form is found, mechanize.FormNotFoundError is raised.
|
||||||
|
|
||||||
|
If name is specified, then the form must have the indicated name.
|
||||||
|
|
||||||
|
If predicate is specified, then the form must match that function. The
|
||||||
|
predicate function is passed the HTMLForm as its single argument, and
|
||||||
|
should return a boolean value indicating whether the form matched.
|
||||||
|
|
||||||
|
nr, if supplied, is the sequence number of the form (where 0 is the
|
||||||
|
first). Note that form 0 is the first form matching all the other
arguments (if supplied); it is not necessarily the first form in the
document.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not self.viewing_html():
|
||||||
|
raise BrowserStateError("not viewing HTML")
|
||||||
|
if (name is None) and (predicate is None) and (nr is None):
|
||||||
|
raise ValueError(
|
||||||
|
"at least one argument must be supplied to specify form")
|
||||||
|
|
||||||
|
orig_nr = nr
|
||||||
|
for form in self.forms():
|
||||||
|
if name is not None and name != form.name:
|
||||||
|
continue
|
||||||
|
if predicate is not None and not predicate(form):
|
||||||
|
continue
|
||||||
|
if nr:
|
||||||
|
nr -= 1
|
||||||
|
continue
|
||||||
|
self.form = form
|
||||||
|
break # success
|
||||||
|
else:
|
||||||
|
# failure
|
||||||
|
description = []
|
||||||
|
if name is not None: description.append("name '%s'" % name)
|
||||||
|
if predicate is not None:
|
||||||
|
description.append("predicate %s" % predicate)
|
||||||
|
if orig_nr is not None: description.append("nr %d" % orig_nr)
|
||||||
|
description = ", ".join(description)
|
||||||
|
raise FormNotFoundError("no form matching "+description)
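
A short sketch of the intended calling pattern (form and control names are placeholders); item access on the Browser is forwarded to the selected HTMLForm via __getattr__:

    br.select_form(name="login")   # or predicate=lambda f: ..., or nr=0
    br["username"] = "joe"         # forwarded to the selected HTMLForm
    br["password"] = "secret"
    response = br.submit()         # equivalent to br.open(br.click())
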
|
||||||
|
|
||||||
|
def click(self, *args, **kwds):
|
||||||
|
"""See ClientForm.HTMLForm.click for documentation."""
|
||||||
|
if not self.viewing_html():
|
||||||
|
raise BrowserStateError("not viewing HTML")
|
||||||
|
request = self.form.click(*args, **kwds)
|
||||||
|
return self._add_referer_header(request)
|
||||||
|
|
||||||
|
def submit(self, *args, **kwds):
|
||||||
|
"""Submit current form.
|
||||||
|
|
||||||
|
Arguments are as for ClientForm.HTMLForm.click().
|
||||||
|
|
||||||
|
Return value is same as for Browser.open().
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self.open(self.click(*args, **kwds))
|
||||||
|
|
||||||
|
def click_link(self, link=None, **kwds):
|
||||||
|
"""Find a link and return a Request object for it.
|
||||||
|
|
||||||
|
Arguments are as for .find_link(), except that a link may be supplied
|
||||||
|
as the first argument.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not self.viewing_html():
|
||||||
|
raise BrowserStateError("not viewing HTML")
|
||||||
|
if not link:
|
||||||
|
link = self.find_link(**kwds)
|
||||||
|
else:
|
||||||
|
if kwds:
|
||||||
|
raise ValueError(
|
||||||
|
"either pass a Link, or keyword arguments, not both")
|
||||||
|
request = self.request_class(link.absolute_url)
|
||||||
|
return self._add_referer_header(request)
|
||||||
|
|
||||||
|
def follow_link(self, link=None, **kwds):
|
||||||
|
"""Find a link and .open() it.
|
||||||
|
|
||||||
|
Arguments are as for .click_link().
|
||||||
|
|
||||||
|
Return value is same as for Browser.open().
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self.open(self.click_link(link, **kwds))
|
||||||
|
|
||||||
|
def find_link(self, **kwds):
|
||||||
|
"""Find a link in current page.
|
||||||
|
|
||||||
|
Links are returned as mechanize.Link objects.
|
||||||
|
|
||||||
|
# Return third link that .search()-matches the regexp "python"
|
||||||
|
# (by ".search()-matches", I mean that the regular expression method
|
||||||
|
# .search() is used, rather than .match()).
|
||||||
|
find_link(text_regex=re.compile("python"), nr=2)
|
||||||
|
|
||||||
|
# Return first http link in the current page that points to somewhere
|
||||||
|
# on python.org whose link text (after tags have been removed) is
|
||||||
|
# exactly "monty python".
|
||||||
|
find_link(text="monty python",
|
||||||
|
url_regex=re.compile("http.*python.org"))
|
||||||
|
|
||||||
|
# Return first link with exactly three HTML attributes.
|
||||||
|
find_link(predicate=lambda link: len(link.attrs) == 3)
|
||||||
|
|
||||||
|
Links include anchors (<a>), image maps (<area>), and frames (<frame>,
|
||||||
|
<iframe>).
|
||||||
|
|
||||||
|
All arguments must be passed by keyword, not position. Zero or more
|
||||||
|
arguments may be supplied. In order to find a link, all arguments
|
||||||
|
supplied must match.
|
||||||
|
|
||||||
|
If a matching link is not found, mechanize.LinkNotFoundError is raised.
|
||||||
|
|
||||||
|
text: link text between link tags: eg. <a href="blah">this bit</a> (as
|
||||||
|
returned by pullparser.get_compressed_text(), ie. without tags but
|
||||||
|
with opening tags "textified" as per the pullparser docs) must compare
|
||||||
|
equal to this argument, if supplied
|
||||||
|
text_regex: link text between tags (as defined above) must match the
|
||||||
|
regular expression object or regular expression string passed as this
|
||||||
|
argument, if supplied
|
||||||
|
name, name_regex: as for text and text_regex, but matched against the
|
||||||
|
name HTML attribute of the link tag
|
||||||
|
url, url_regex: as for text and text_regex, but matched against the
|
||||||
|
URL of the link tag (note this matches against Link.url, which is a
|
||||||
|
relative or absolute URL according to how it was written in the HTML)
|
||||||
|
tag: element name of opening tag, eg. "a"
|
||||||
|
predicate: a function taking a Link object as its single argument,
|
||||||
|
returning a boolean result, indicating whether the link matches
|
||||||
|
nr: matches the nth link that matches all other criteria (default 0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return self._filter_links(self._factory.links(), **kwds).next()
|
||||||
|
except StopIteration:
|
||||||
|
raise LinkNotFoundError()
|
||||||
|
|
||||||
|
def __getattr__(self, name):
|
||||||
|
# pass through ClientForm / DOMForm methods and attributes
|
||||||
|
form = self.__dict__.get("form")
|
||||||
|
if form is None:
|
||||||
|
raise AttributeError(
|
||||||
|
"%s instance has no attribute %s (perhaps you forgot to "
|
||||||
|
".select_form()?)" % (self.__class__, name))
|
||||||
|
return getattr(form, name)
|
||||||
|
|
||||||
|
def _filter_links(self, links,
|
||||||
|
text=None, text_regex=None,
|
||||||
|
name=None, name_regex=None,
|
||||||
|
url=None, url_regex=None,
|
||||||
|
tag=None,
|
||||||
|
predicate=None,
|
||||||
|
nr=0
|
||||||
|
):
|
||||||
|
if not self.viewing_html():
|
||||||
|
raise BrowserStateError("not viewing HTML")
|
||||||
|
|
||||||
|
found_links = []
|
||||||
|
orig_nr = nr
|
||||||
|
|
||||||
|
for link in links:
|
||||||
|
if url is not None and url != link.url:
|
||||||
|
continue
|
||||||
|
if url_regex is not None and not re.search(url_regex, link.url):
|
||||||
|
continue
|
||||||
|
if (text is not None and
|
||||||
|
(link.text is None or text != link.text)):
|
||||||
|
continue
|
||||||
|
if (text_regex is not None and
|
||||||
|
(link.text is None or not re.search(text_regex, link.text))):
|
||||||
|
continue
|
||||||
|
if name is not None and name != dict(link.attrs).get("name"):
|
||||||
|
continue
|
||||||
|
if name_regex is not None:
|
||||||
|
link_name = dict(link.attrs).get("name")
|
||||||
|
if link_name is None or not re.search(name_regex, link_name):
|
||||||
|
continue
|
||||||
|
if tag is not None and tag != link.tag:
|
||||||
|
continue
|
||||||
|
if predicate is not None and not predicate(link):
|
||||||
|
continue
|
||||||
|
if nr:
|
||||||
|
nr -= 1
|
||||||
|
continue
|
||||||
|
yield link
|
||||||
|
nr = orig_nr
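
A small sketch tying the link methods together (URL and text patterns are placeholders; find_link/click_link raise LinkNotFoundError when nothing matches):

    br.open("http://www.example.com/")
    for link in br.links(url_regex="python"):
        print link.absolute_url, link.text
    request = br.click_link(text="monty python")        # build a Request without opening it
    response = br.follow_link(text_regex=r"download")   # find, open and return the response
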
|
159
src/calibre/utils/mechanize/_mozillacookiejar.py
Normal file
@ -0,0 +1,159 @@
|
|||||||
|
"""Mozilla / Netscape cookie loading / saving.
|
||||||
|
|
||||||
|
Copyright 2002-2006 John J Lee <jjl@pobox.com>
|
||||||
|
Copyright 1997-1999 Gisle Aas (original libwww-perl code)
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses (see the file
|
||||||
|
COPYING.txt included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re, time, logging
|
||||||
|
|
||||||
|
from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
|
||||||
|
MISSING_FILENAME_TEXT, LoadError
|
||||||
|
debug = logging.getLogger("ClientCookie").debug
|
||||||
|
|
||||||
|
|
||||||
|
class MozillaCookieJar(FileCookieJar):
|
||||||
|
"""
|
||||||
|
|
||||||
|
WARNING: you may want to backup your browser's cookies file if you use
|
||||||
|
this class to save cookies. I *think* it works, but there have been
|
||||||
|
bugs in the past!
|
||||||
|
|
||||||
|
This class differs from CookieJar only in the format it uses to save and
|
||||||
|
load cookies to and from a file. This class uses the Mozilla/Netscape
|
||||||
|
`cookies.txt' format. lynx uses this file format, too.
|
||||||
|
|
||||||
|
Don't expect cookies saved while the browser is running to be noticed by
|
||||||
|
the browser (in fact, Mozilla on unix will overwrite your saved cookies if
|
||||||
|
you change them on disk while it's running; on Windows, you probably can't
|
||||||
|
save at all while the browser is running).
|
||||||
|
|
||||||
|
Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
|
||||||
|
Netscape cookies on saving.
|
||||||
|
|
||||||
|
In particular, the cookie version and port number information is lost,
|
||||||
|
together with information about whether or not Path, Port and Discard were
|
||||||
|
specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
|
||||||
|
domain as set in the HTTP header started with a dot (yes, I'm aware some
|
||||||
|
domains in Netscape files start with a dot and some don't -- trust me, you
|
||||||
|
really don't want to know any more about this).
|
||||||
|
|
||||||
|
Note that though Mozilla and Netscape use the same format, they use
|
||||||
|
slightly different headers. The class saves cookies using the Netscape
|
||||||
|
header by default (Mozilla can cope with that).
|
||||||
|
|
||||||
|
"""
|
||||||
|
magic_re = "#( Netscape)? HTTP Cookie File"
|
||||||
|
header = """\
|
||||||
|
# Netscape HTTP Cookie File
|
||||||
|
# http://www.netscape.com/newsref/std/cookie_spec.html
|
||||||
|
# This is a generated file! Do not edit.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _really_load(self, f, filename, ignore_discard, ignore_expires):
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
magic = f.readline()
|
||||||
|
if not re.search(self.magic_re, magic):
|
||||||
|
f.close()
|
||||||
|
raise LoadError(
|
||||||
|
"%s does not look like a Netscape format cookies file" %
|
||||||
|
filename)
|
||||||
|
|
||||||
|
try:
|
||||||
|
while 1:
|
||||||
|
line = f.readline()
|
||||||
|
if line == "": break
|
||||||
|
|
||||||
|
# last field may be absent, so keep any trailing tab
|
||||||
|
if line.endswith("\n"): line = line[:-1]
|
||||||
|
|
||||||
|
# skip comments and blank lines XXX what is $ for?
|
||||||
|
if (line.strip().startswith("#") or
|
||||||
|
line.strip().startswith("$") or
|
||||||
|
line.strip() == ""):
|
||||||
|
continue
|
||||||
|
|
||||||
|
domain, domain_specified, path, secure, expires, name, value = \
|
||||||
|
line.split("\t")
|
||||||
|
secure = (secure == "TRUE")
|
||||||
|
domain_specified = (domain_specified == "TRUE")
|
||||||
|
if name == "":
|
||||||
|
name = value
|
||||||
|
value = None
|
||||||
|
|
||||||
|
initial_dot = domain.startswith(".")
|
||||||
|
assert domain_specified == initial_dot
|
||||||
|
|
||||||
|
discard = False
|
||||||
|
if expires == "":
|
||||||
|
expires = None
|
||||||
|
discard = True
|
||||||
|
|
||||||
|
# assume path_specified is false
|
||||||
|
c = Cookie(0, name, value,
|
||||||
|
None, False,
|
||||||
|
domain, domain_specified, initial_dot,
|
||||||
|
path, False,
|
||||||
|
secure,
|
||||||
|
expires,
|
||||||
|
discard,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
{})
|
||||||
|
if not ignore_discard and c.discard:
|
||||||
|
continue
|
||||||
|
if not ignore_expires and c.is_expired(now):
|
||||||
|
continue
|
||||||
|
self.set_cookie(c)
|
||||||
|
|
||||||
|
except:
|
||||||
|
reraise_unmasked_exceptions((IOError,))
|
||||||
|
raise LoadError("invalid Netscape format file %s: %s" %
|
||||||
|
(filename, line))
|
||||||
|
|
||||||
|
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
|
||||||
|
if filename is None:
|
||||||
|
if self.filename is not None: filename = self.filename
|
||||||
|
else: raise ValueError(MISSING_FILENAME_TEXT)
|
||||||
|
|
||||||
|
f = open(filename, "w")
|
||||||
|
try:
|
||||||
|
debug("Saving Netscape cookies.txt file")
|
||||||
|
f.write(self.header)
|
||||||
|
now = time.time()
|
||||||
|
for cookie in self:
|
||||||
|
if not ignore_discard and cookie.discard:
|
||||||
|
debug(" Not saving %s: marked for discard", cookie.name)
|
||||||
|
continue
|
||||||
|
if not ignore_expires and cookie.is_expired(now):
|
||||||
|
debug(" Not saving %s: expired", cookie.name)
|
||||||
|
continue
|
||||||
|
if cookie.secure: secure = "TRUE"
|
||||||
|
else: secure = "FALSE"
|
||||||
|
if cookie.domain.startswith("."): initial_dot = "TRUE"
|
||||||
|
else: initial_dot = "FALSE"
|
||||||
|
if cookie.expires is not None:
|
||||||
|
expires = str(cookie.expires)
|
||||||
|
else:
|
||||||
|
expires = ""
|
||||||
|
if cookie.value is None:
|
||||||
|
# cookies.txt regards 'Set-Cookie: foo' as a cookie
|
||||||
|
# with no name, whereas cookielib regards it as a
|
||||||
|
# cookie with no value.
|
||||||
|
name = ""
|
||||||
|
value = cookie.name
|
||||||
|
else:
|
||||||
|
name = cookie.name
|
||||||
|
value = cookie.value
|
||||||
|
f.write(
|
||||||
|
"\t".join([cookie.domain, initial_dot, cookie.path,
|
||||||
|
secure, expires, name, value])+
|
||||||
|
"\n")
|
||||||
|
finally:
|
||||||
|
f.close()
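
A sketch of round-tripping a Netscape-format cookies.txt file through this class (the filename is a placeholder):

    cj = MozillaCookieJar("cookies.txt")
    cj.load(ignore_discard=True, ignore_expires=True)   # raises LoadError on a non-Netscape file
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.open("http://example.com/")
    cj.save(ignore_discard=True)                        # RFC 2965 details are lost on save
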
|
387
src/calibre/utils/mechanize/_msiecookiejar.py
Normal file
@ -0,0 +1,387 @@
|
|||||||
|
"""Microsoft Internet Explorer cookie loading on Windows.
|
||||||
|
|
||||||
|
Copyright 2002-2003 Johnny Lee <typo_pl@hotmail.com> (MSIE Perl code)
|
||||||
|
Copyright 2002-2006 John J Lee <jjl@pobox.com> (The Python port)
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses (see the file
|
||||||
|
COPYING.txt included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
# XXX names and comments are not great here
|
||||||
|
|
||||||
|
import os, re, time, struct, logging
|
||||||
|
if os.name == "nt":
|
||||||
|
import _winreg
|
||||||
|
|
||||||
|
from _clientcookie import FileCookieJar, CookieJar, Cookie, \
|
||||||
|
MISSING_FILENAME_TEXT, LoadError
|
||||||
|
|
||||||
|
debug = logging.getLogger("mechanize").debug
|
||||||
|
|
||||||
|
|
||||||
|
def regload(path, leaf):
|
||||||
|
key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0,
|
||||||
|
_winreg.KEY_ALL_ACCESS)
|
||||||
|
try:
|
||||||
|
value = _winreg.QueryValueEx(key, leaf)[0]
|
||||||
|
except WindowsError:
|
||||||
|
value = None
|
||||||
|
return value
|
||||||
|
|
||||||
|
WIN32_EPOCH = 0x019db1ded53e8000L # 1970 Jan 01 00:00:00 in Win32 FILETIME
|
||||||
|
|
||||||
|
def epoch_time_offset_from_win32_filetime(filetime):
|
||||||
|
"""Convert from win32 filetime to seconds-since-epoch value.
|
||||||
|
|
||||||
|
MSIE stores create and expire times as Win32 FILETIME, which is 64
|
||||||
|
bits of 100 nanosecond intervals since Jan 01 1601.
|
||||||
|
|
||||||
|
mechanize expects time in 32-bit value expressed in seconds since the
|
||||||
|
epoch (Jan 01 1970).
|
||||||
|
|
||||||
|
"""
|
||||||
|
if filetime < WIN32_EPOCH:
|
||||||
|
raise ValueError("filetime (%d) is before epoch (%d)" %
|
||||||
|
(filetime, WIN32_EPOCH))
|
||||||
|
|
||||||
|
return divmod((filetime - WIN32_EPOCH), 10000000L)[0]
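
A worked example of the conversion, using the constants defined above: FILETIME counts 100 ns ticks since 1601-01-01, so subtracting WIN32_EPOCH rebases the count to 1970-01-01 and integer-dividing by 10**7 converts ticks to seconds.

    one_minute_after_epoch = WIN32_EPOCH + 60 * 10000000L
    assert epoch_time_offset_from_win32_filetime(one_minute_after_epoch) == 60
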
|
||||||
|
|
||||||
|
def binary_to_char(c): return "%02X" % ord(c)
|
||||||
|
def binary_to_str(d): return "".join(map(binary_to_char, list(d)))
|
||||||
|
|
||||||
|
class MSIEBase:
|
||||||
|
magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*")
|
||||||
|
padding = "\x0d\xf0\xad\x0b"
|
||||||
|
|
||||||
|
msie_domain_re = re.compile(r"^([^/]+)(/.*)$")
|
||||||
|
cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?"
|
||||||
|
"(.+\@[\x21-\xFF]+\.txt)")
|
||||||
|
|
||||||
|
# path under HKEY_CURRENT_USER from which to get location of index.dat
|
||||||
|
reg_path = r"software\microsoft\windows" \
|
||||||
|
r"\currentversion\explorer\shell folders"
|
||||||
|
reg_key = "Cookies"
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self._delayload_domains = {}
|
||||||
|
|
||||||
|
def _delayload_domain(self, domain):
|
||||||
|
# if necessary, lazily load cookies for this domain
|
||||||
|
delayload_info = self._delayload_domains.get(domain)
|
||||||
|
if delayload_info is not None:
|
||||||
|
cookie_file, ignore_discard, ignore_expires = delayload_info
|
||||||
|
try:
|
||||||
|
self.load_cookie_data(cookie_file,
|
||||||
|
ignore_discard, ignore_expires)
|
||||||
|
except (LoadError, IOError):
|
||||||
|
debug("error reading cookie file, skipping: %s", cookie_file)
|
||||||
|
else:
|
||||||
|
del self._delayload_domains[domain]
|
||||||
|
|
||||||
|
def _load_cookies_from_file(self, filename):
|
||||||
|
debug("Loading MSIE cookies file: %s", filename)
|
||||||
|
cookies = []
|
||||||
|
|
||||||
|
cookies_fh = open(filename)
|
||||||
|
|
||||||
|
try:
|
||||||
|
while 1:
|
||||||
|
key = cookies_fh.readline()
|
||||||
|
if key == "": break
|
||||||
|
|
||||||
|
rl = cookies_fh.readline
|
||||||
|
def getlong(rl=rl): return long(rl().rstrip())
|
||||||
|
def getstr(rl=rl): return rl().rstrip()
|
||||||
|
|
||||||
|
key = key.rstrip()
|
||||||
|
value = getstr()
|
||||||
|
domain_path = getstr()
|
||||||
|
flags = getlong() # 0x2000 bit is for secure I think
|
||||||
|
lo_expire = getlong()
|
||||||
|
hi_expire = getlong()
|
||||||
|
lo_create = getlong()
|
||||||
|
hi_create = getlong()
|
||||||
|
sep = getstr()
|
||||||
|
|
||||||
|
if "" in (key, value, domain_path, flags, hi_expire, lo_expire,
|
||||||
|
hi_create, lo_create, sep) or (sep != "*"):
|
||||||
|
break
|
||||||
|
|
||||||
|
m = self.msie_domain_re.search(domain_path)
|
||||||
|
if m:
|
||||||
|
domain = m.group(1)
|
||||||
|
path = m.group(2)
|
||||||
|
|
||||||
|
cookies.append({"KEY": key, "VALUE": value, "DOMAIN": domain,
|
||||||
|
"PATH": path, "FLAGS": flags, "HIXP": hi_expire,
|
||||||
|
"LOXP": lo_expire, "HICREATE": hi_create,
|
||||||
|
"LOCREATE": lo_create})
|
||||||
|
finally:
|
||||||
|
cookies_fh.close()
|
||||||
|
|
||||||
|
return cookies
|
||||||
|
|
||||||
|
def load_cookie_data(self, filename,
|
||||||
|
ignore_discard=False, ignore_expires=False):
|
||||||
|
"""Load cookies from file containing actual cookie data.
|
||||||
|
|
||||||
|
Old cookies are kept unless overwritten by newly loaded ones.
|
||||||
|
|
||||||
|
You should not call this method if the delayload attribute is set.
|
||||||
|
|
||||||
|
I think each of these files contains all cookies for one user, domain,
|
||||||
|
and path.
|
||||||
|
|
||||||
|
filename: file containing cookies -- usually found in a file like
|
||||||
|
C:\WINNT\Profiles\joe\Cookies\joe@blah[1].txt
|
||||||
|
|
||||||
|
"""
|
||||||
|
now = int(time.time())
|
||||||
|
|
||||||
|
cookie_data = self._load_cookies_from_file(filename)
|
||||||
|
|
||||||
|
for cookie in cookie_data:
|
||||||
|
flags = cookie["FLAGS"]
|
||||||
|
secure = ((flags & 0x2000) != 0)
|
||||||
|
filetime = (cookie["HIXP"] << 32) + cookie["LOXP"]
|
||||||
|
expires = epoch_time_offset_from_win32_filetime(filetime)
|
||||||
|
if expires < now:
|
||||||
|
discard = True
|
||||||
|
else:
|
||||||
|
discard = False
|
||||||
|
domain = cookie["DOMAIN"]
|
||||||
|
initial_dot = domain.startswith(".")
|
||||||
|
if initial_dot:
|
||||||
|
domain_specified = True
|
||||||
|
else:
|
||||||
|
# MSIE 5 does not record whether the domain cookie-attribute
|
||||||
|
# was specified.
|
||||||
|
# Assuming it wasn't is conservative, because with strict
|
||||||
|
# domain matching this will match less frequently; with regular
|
||||||
|
# Netscape tail-matching, this will match at exactly the same
|
||||||
|
# times that domain_specified = True would. It also means we
|
||||||
|
# don't have to prepend a dot to achieve consistency with our
|
||||||
|
# own & Mozilla's domain-munging scheme.
|
||||||
|
domain_specified = False
|
||||||
|
|
||||||
|
# assume path_specified is false
|
||||||
|
# XXX is there other stuff in here? -- eg. comment, commentURL?
|
||||||
|
c = Cookie(0,
|
||||||
|
cookie["KEY"], cookie["VALUE"],
|
||||||
|
None, False,
|
||||||
|
domain, domain_specified, initial_dot,
|
||||||
|
cookie["PATH"], False,
|
||||||
|
secure,
|
||||||
|
expires,
|
||||||
|
discard,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
{"flags": flags})
|
||||||
|
if not ignore_discard and c.discard:
|
||||||
|
continue
|
||||||
|
if not ignore_expires and c.is_expired(now):
|
||||||
|
continue
|
||||||
|
CookieJar.set_cookie(self, c)
|
||||||
|
|
||||||
|
def load_from_registry(self, ignore_discard=False, ignore_expires=False,
|
||||||
|
username=None):
|
||||||
|
"""
|
||||||
|
username: only required on win9x
|
||||||
|
|
||||||
|
"""
|
||||||
|
cookies_dir = regload(self.reg_path, self.reg_key)
|
||||||
|
filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT"))
|
||||||
|
self.load(filename, ignore_discard, ignore_expires, username)
|
||||||
|
|
||||||
|
def _really_load(self, index, filename, ignore_discard, ignore_expires,
|
||||||
|
username):
|
||||||
|
now = int(time.time())
|
||||||
|
|
||||||
|
if username is None:
|
||||||
|
username = os.environ['USERNAME'].lower()
|
||||||
|
|
||||||
|
cookie_dir = os.path.dirname(filename)
|
||||||
|
|
||||||
|
data = index.read(256)
|
||||||
|
if len(data) != 256:
|
||||||
|
raise LoadError("%s file is too short" % filename)
|
||||||
|
|
||||||
|
# Cookies' index.dat file starts with 32 bytes of signature
|
||||||
|
# followed by an offset to the first record, stored as a little-
|
||||||
|
# endian DWORD.
|
||||||
|
sig, size, data = data[:32], data[32:36], data[36:]
|
||||||
|
size = struct.unpack("<L", size)[0]
|
||||||
|
|
||||||
|
# check that sig is valid
|
||||||
|
if not self.magic_re.match(sig) or size != 0x4000:
|
||||||
|
raise LoadError("%s ['%s' %s] does not seem to contain cookies" %
|
||||||
|
(str(filename), sig, size))
|
||||||
|
|
||||||
|
# skip to start of first record
|
||||||
|
index.seek(size, 0)
|
||||||
|
|
||||||
|
sector = 128 # size of sector in bytes
|
||||||
|
|
||||||
|
while 1:
|
||||||
|
data = ""
|
||||||
|
|
||||||
|
# Cookies are usually in two contiguous sectors, so read in two
|
||||||
|
# sectors and adjust if not a Cookie.
|
||||||
|
to_read = 2 * sector
|
||||||
|
d = index.read(to_read)
|
||||||
|
if len(d) != to_read:
|
||||||
|
break
|
||||||
|
data = data + d
|
||||||
|
|
||||||
|
# Each record starts with a 4-byte signature and a count
|
||||||
|
# (little-endian DWORD) of sectors for the record.
|
||||||
|
sig, size, data = data[:4], data[4:8], data[8:]
|
||||||
|
size = struct.unpack("<L", size)[0]
|
||||||
|
|
||||||
|
to_read = (size - 2) * sector
|
||||||
|
|
||||||
|
## from urllib import quote
|
||||||
|
## print "data", quote(data)
|
||||||
|
## print "sig", quote(sig)
|
||||||
|
## print "size in sectors", size
|
||||||
|
## print "size in bytes", size*sector
|
||||||
|
## print "size in units of 16 bytes", (size*sector) / 16
|
||||||
|
## print "size to read in bytes", to_read
|
||||||
|
## print
|
||||||
|
|
||||||
|
if sig != "URL ":
|
||||||
|
# assert on the expression itself; wrapping it and the message in a
# single tuple would make the assertion unconditionally true
assert sig in ("HASH", "LEAK",
               self.padding, "\x00\x00\x00\x00"), \
    "unrecognized MSIE index.dat record: %s" % binary_to_str(sig)
|
||||||
|
if sig == "\x00\x00\x00\x00":
|
||||||
|
# assume we've got all the cookies, and stop
|
||||||
|
break
|
||||||
|
if sig == self.padding:
|
||||||
|
continue
|
||||||
|
# skip the rest of this record
|
||||||
|
assert to_read >= 0
|
||||||
|
if size != 2:
|
||||||
|
assert to_read != 0
|
||||||
|
index.seek(to_read, 1)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# read in rest of record if necessary
|
||||||
|
if size > 2:
|
||||||
|
more_data = index.read(to_read)
|
||||||
|
if len(more_data) != to_read: break
|
||||||
|
data = data + more_data
|
||||||
|
|
||||||
|
cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username +
|
||||||
|
"(%s\@[\x21-\xFF]+\.txt)" % username)
|
||||||
|
m = re.search(cookie_re, data, re.I)
|
||||||
|
if m:
|
||||||
|
cookie_file = os.path.join(cookie_dir, m.group(2))
|
||||||
|
if not self.delayload:
|
||||||
|
try:
|
||||||
|
self.load_cookie_data(cookie_file,
|
||||||
|
ignore_discard, ignore_expires)
|
||||||
|
except (LoadError, IOError):
|
||||||
|
debug("error reading cookie file, skipping: %s",
|
||||||
|
cookie_file)
|
||||||
|
else:
|
||||||
|
domain = m.group(1)
|
||||||
|
i = domain.find("/")
|
||||||
|
if i != -1:
|
||||||
|
domain = domain[:i]
|
||||||
|
|
||||||
|
self._delayload_domains[domain] = (
|
||||||
|
cookie_file, ignore_discard, ignore_expires)
|
||||||
|
|
||||||
|
|
||||||
|
class MSIECookieJar(MSIEBase, FileCookieJar):
|
||||||
|
"""FileCookieJar that reads from the Windows MSIE cookies database.
|
||||||
|
|
||||||
|
MSIECookieJar can read the cookie files of Microsoft Internet Explorer
|
||||||
|
(MSIE) for Windows version 5 on Windows NT and version 6 on Windows XP and
|
||||||
|
Windows 98. Other configurations may also work, but are untested. Saving
|
||||||
|
cookies in MSIE format is NOT supported. If you save cookies, they'll be
|
||||||
|
in the usual Set-Cookie3 format, which you can read back in using an
|
||||||
|
instance of the plain old CookieJar class. Don't save using the same
|
||||||
|
filename that you loaded cookies from, because you may succeed in
|
||||||
|
clobbering your MSIE cookies index file!
|
||||||
|
|
||||||
|
You should be able to have LWP share Internet Explorer's cookies like
|
||||||
|
this (note you need to supply a username to load_from_registry if you're on
|
||||||
|
Windows 9x or Windows ME):
|
||||||
|
|
||||||
|
cj = MSIECookieJar(delayload=1)
|
||||||
|
# find cookies index file in registry and load cookies from it
|
||||||
|
cj.load_from_registry()
|
||||||
|
opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
|
||||||
|
response = opener.open("http://example.com/")
|
||||||
|
|
||||||
|
Iterating over a delayloaded MSIECookieJar instance will not cause any
|
||||||
|
cookies to be read from disk. To force reading of all cookies from disk,
|
||||||
|
call read_all_cookies. Note that the following methods iterate over self:
|
||||||
|
clear_temporary_cookies, clear_expired_cookies, __len__, __repr__, __str__
|
||||||
|
and as_string.
|
||||||
|
|
||||||
|
Additional methods:
|
||||||
|
|
||||||
|
load_from_registry(ignore_discard=False, ignore_expires=False,
|
||||||
|
username=None)
|
||||||
|
load_cookie_data(filename, ignore_discard=False, ignore_expires=False)
|
||||||
|
read_all_cookies()
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self, filename=None, delayload=False, policy=None):
|
||||||
|
MSIEBase.__init__(self)
|
||||||
|
FileCookieJar.__init__(self, filename, delayload, policy)
|
||||||
|
|
||||||
|
def set_cookie(self, cookie):
|
||||||
|
if self.delayload:
|
||||||
|
self._delayload_domain(cookie.domain)
|
||||||
|
CookieJar.set_cookie(self, cookie)
|
||||||
|
|
||||||
|
def _cookies_for_request(self, request):
|
||||||
|
"""Return a list of cookies to be returned to server."""
|
||||||
|
domains = self._cookies.copy()
|
||||||
|
domains.update(self._delayload_domains)
|
||||||
|
domains = domains.keys()
|
||||||
|
|
||||||
|
cookies = []
|
||||||
|
for domain in domains:
|
||||||
|
cookies.extend(self._cookies_for_domain(domain, request))
|
||||||
|
return cookies
|
||||||
|
|
||||||
|
def _cookies_for_domain(self, domain, request):
|
||||||
|
if not self._policy.domain_return_ok(domain, request):
|
||||||
|
return []
|
||||||
|
debug("Checking %s for cookies to return", domain)
|
||||||
|
if self.delayload:
|
||||||
|
self._delayload_domain(domain)
|
||||||
|
return CookieJar._cookies_for_domain(self, domain, request)
|
||||||
|
|
||||||
|
def read_all_cookies(self):
|
||||||
|
"""Eagerly read in all cookies."""
|
||||||
|
if self.delayload:
|
||||||
|
for domain in self._delayload_domains.keys():
|
||||||
|
self._delayload_domain(domain)
|
||||||
|
|
||||||
|
def load(self, filename, ignore_discard=False, ignore_expires=False,
|
||||||
|
username=None):
|
||||||
|
"""Load cookies from an MSIE 'index.dat' cookies index file.
|
||||||
|
|
||||||
|
filename: full path to cookie index file
|
||||||
|
username: only required on win9x
|
||||||
|
|
||||||
|
"""
|
||||||
|
if filename is None:
|
||||||
|
if self.filename is not None: filename = self.filename
|
||||||
|
else: raise ValueError(MISSING_FILENAME_TEXT)
|
||||||
|
|
||||||
|
index = open(filename, "rb")
|
||||||
|
|
||||||
|
try:
|
||||||
|
self._really_load(index, filename, ignore_discard, ignore_expires,
|
||||||
|
username)
|
||||||
|
finally:
|
||||||
|
index.close()
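
A sketch mirroring the example in the class docstring, but attaching the jar to a Browser instead of a bare opener (Windows only, since the cookies directory is looked up in the registry):

    cj = MSIECookieJar(delayload=True)
    cj.load_from_registry()        # pass username= on Win9x/ME
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.open("http://example.com/")
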
|
421
src/calibre/utils/mechanize/_opener.py
Normal file
@ -0,0 +1,421 @@
|
|||||||
|
"""Integration with Python standard library module urllib2: OpenerDirector
|
||||||
|
class.
|
||||||
|
|
||||||
|
Copyright 2004-2006 John J Lee <jjl@pobox.com>
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses (see the file
|
||||||
|
COPYING.txt included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, urllib2, bisect, urllib, httplib, types, tempfile
|
||||||
|
try:
|
||||||
|
import threading as _threading
|
||||||
|
except ImportError:
|
||||||
|
import dummy_threading as _threading
|
||||||
|
try:
|
||||||
|
set
|
||||||
|
except NameError:
|
||||||
|
import sets
|
||||||
|
set = sets.Set
|
||||||
|
|
||||||
|
import _http
|
||||||
|
import _upgrade
|
||||||
|
import _rfc3986
|
||||||
|
import _response
|
||||||
|
from _util import isstringlike
|
||||||
|
from _request import Request
|
||||||
|
|
||||||
|
|
||||||
|
class ContentTooShortError(urllib2.URLError):
|
||||||
|
def __init__(self, reason, result):
|
||||||
|
urllib2.URLError.__init__(self, reason)
|
||||||
|
self.result = result
|
||||||
|
|
||||||
|
|
||||||
|
class OpenerDirector(urllib2.OpenerDirector):
|
||||||
|
def __init__(self):
|
||||||
|
urllib2.OpenerDirector.__init__(self)
|
||||||
|
# really none of these are (sanely) public -- the lack of initial
|
||||||
|
# underscore on some is just due to following urllib2
|
||||||
|
self.process_response = {}
|
||||||
|
self.process_request = {}
|
||||||
|
self._any_request = {}
|
||||||
|
self._any_response = {}
|
||||||
|
self._handler_index_valid = True
|
||||||
|
self._tempfiles = []
|
||||||
|
|
||||||
|
def add_handler(self, handler):
|
||||||
|
if handler in self.handlers:
|
||||||
|
return
|
||||||
|
# XXX why does self.handlers need to be sorted?
|
||||||
|
bisect.insort(self.handlers, handler)
|
||||||
|
handler.add_parent(self)
|
||||||
|
self._handler_index_valid = False
|
||||||
|
|
||||||
|
def _maybe_reindex_handlers(self):
|
||||||
|
if self._handler_index_valid:
|
||||||
|
return
|
||||||
|
|
||||||
|
handle_error = {}
|
||||||
|
handle_open = {}
|
||||||
|
process_request = {}
|
||||||
|
process_response = {}
|
||||||
|
any_request = set()
|
||||||
|
any_response = set()
|
||||||
|
unwanted = []
|
||||||
|
|
||||||
|
for handler in self.handlers:
|
||||||
|
added = False
|
||||||
|
for meth in dir(handler):
|
||||||
|
if meth in ["redirect_request", "do_open", "proxy_open"]:
|
||||||
|
# oops, coincidental match
|
||||||
|
continue
|
||||||
|
|
||||||
|
if meth == "any_request":
|
||||||
|
any_request.add(handler)
|
||||||
|
added = True
|
||||||
|
continue
|
||||||
|
elif meth == "any_response":
|
||||||
|
any_response.add(handler)
|
||||||
|
added = True
|
||||||
|
continue
|
||||||
|
|
||||||
|
ii = meth.find("_")
|
||||||
|
scheme = meth[:ii]
|
||||||
|
condition = meth[ii+1:]
|
||||||
|
|
||||||
|
if condition.startswith("error"):
|
||||||
|
jj = meth[ii+1:].find("_") + ii + 1
|
||||||
|
kind = meth[jj+1:]
|
||||||
|
try:
|
||||||
|
kind = int(kind)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
lookup = handle_error.setdefault(scheme, {})
|
||||||
|
elif condition == "open":
|
||||||
|
kind = scheme
|
||||||
|
lookup = handle_open
|
||||||
|
elif condition == "request":
|
||||||
|
kind = scheme
|
||||||
|
lookup = process_request
|
||||||
|
elif condition == "response":
|
||||||
|
kind = scheme
|
||||||
|
lookup = process_response
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
|
lookup.setdefault(kind, set()).add(handler)
|
||||||
|
added = True
|
||||||
|
|
||||||
|
if not added:
|
||||||
|
unwanted.append(handler)
|
||||||
|
|
||||||
|
for handler in unwanted:
|
||||||
|
self.handlers.remove(handler)
|
||||||
|
|
||||||
|
# sort indexed methods
|
||||||
|
# XXX could be cleaned up
|
||||||
|
for lookup in [process_request, process_response]:
|
||||||
|
for scheme, handlers in lookup.iteritems():
|
||||||
|
lookup[scheme] = handlers
|
||||||
|
for scheme, lookup in handle_error.iteritems():
|
||||||
|
for code, handlers in lookup.iteritems():
|
||||||
|
handlers = list(handlers)
|
||||||
|
handlers.sort()
|
||||||
|
lookup[code] = handlers
|
||||||
|
for scheme, handlers in handle_open.iteritems():
|
||||||
|
handlers = list(handlers)
|
||||||
|
handlers.sort()
|
||||||
|
handle_open[scheme] = handlers
|
||||||
|
|
||||||
|
# cache the indexes
|
||||||
|
self.handle_error = handle_error
|
||||||
|
self.handle_open = handle_open
|
||||||
|
self.process_request = process_request
|
||||||
|
self.process_response = process_response
|
||||||
|
self._any_request = any_request
|
||||||
|
self._any_response = any_response
|
||||||
|
|
||||||
|
def _request(self, url_or_req, data, visit):
|
||||||
|
if isstringlike(url_or_req):
|
||||||
|
req = Request(url_or_req, data, visit=visit)
|
||||||
|
else:
|
||||||
|
# already a urllib2.Request or mechanize.Request instance
|
||||||
|
req = url_or_req
|
||||||
|
if data is not None:
|
||||||
|
req.add_data(data)
|
||||||
|
# XXX yuck, give request a .visit attribute if it doesn't have one
|
||||||
|
try:
|
||||||
|
req.visit
|
||||||
|
except AttributeError:
|
||||||
|
req.visit = None
|
||||||
|
if visit is not None:
|
||||||
|
req.visit = visit
|
||||||
|
return req
|
||||||
|
|
||||||
|
def open(self, fullurl, data=None):
|
||||||
|
req = self._request(fullurl, data, None)
|
||||||
|
req_scheme = req.get_type()
|
||||||
|
|
||||||
|
self._maybe_reindex_handlers()
|
||||||
|
|
||||||
|
# pre-process request
|
||||||
|
# XXX should we allow a Processor to change the URL scheme
|
||||||
|
# of the request?
|
||||||
|
request_processors = set(self.process_request.get(req_scheme, []))
|
||||||
|
request_processors.update(self._any_request)
|
||||||
|
request_processors = list(request_processors)
|
||||||
|
request_processors.sort()
|
||||||
|
for processor in request_processors:
|
||||||
|
for meth_name in ["any_request", req_scheme+"_request"]:
|
||||||
|
meth = getattr(processor, meth_name, None)
|
||||||
|
if meth:
|
||||||
|
req = meth(req)
|
||||||
|
|
||||||
|
# In Python >= 2.4, .open() supports processors already, so we must
|
||||||
|
# call ._open() instead.
|
||||||
|
urlopen = getattr(urllib2.OpenerDirector, "_open",
|
||||||
|
urllib2.OpenerDirector.open)
|
||||||
|
response = urlopen(self, req, data)
|
||||||
|
|
||||||
|
# post-process response
|
||||||
|
response_processors = set(self.process_response.get(req_scheme, []))
|
||||||
|
response_processors.update(self._any_response)
|
||||||
|
response_processors = list(response_processors)
|
||||||
|
response_processors.sort()
|
||||||
|
for processor in response_processors:
|
||||||
|
for meth_name in ["any_response", req_scheme+"_response"]:
|
||||||
|
meth = getattr(processor, meth_name, None)
|
||||||
|
if meth:
|
||||||
|
response = meth(req, response)
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
def error(self, proto, *args):
|
||||||
|
if proto in ['http', 'https']:
|
||||||
|
# XXX http[s] protocols are special-cased
|
||||||
|
dict = self.handle_error['http'] # https is not different than http
|
||||||
|
proto = args[2] # YUCK!
|
||||||
|
meth_name = 'http_error_%s' % proto
|
||||||
|
http_err = 1
|
||||||
|
orig_args = args
|
||||||
|
else:
|
||||||
|
dict = self.handle_error
|
||||||
|
meth_name = proto + '_error'
|
||||||
|
http_err = 0
|
||||||
|
args = (dict, proto, meth_name) + args
|
||||||
|
result = apply(self._call_chain, args)
|
||||||
|
if result:
|
||||||
|
return result
|
||||||
|
|
||||||
|
if http_err:
|
||||||
|
args = (dict, 'default', 'http_error_default') + orig_args
|
||||||
|
return apply(self._call_chain, args)
|
||||||
|
|
||||||
|
BLOCK_SIZE = 1024*8
|
||||||
|
def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
|
||||||
|
"""Returns (filename, headers).
|
||||||
|
|
||||||
|
For remote objects, the default filename will refer to a temporary
|
||||||
|
file. Temporary files are removed when the OpenerDirector.close()
|
||||||
|
method is called.
|
||||||
|
|
||||||
|
For file: URLs, at present the returned filename is None. This may
|
||||||
|
change in future.
|
||||||
|
|
||||||
|
If the actual number of bytes read is less than indicated by the
|
||||||
|
Content-Length header, raises ContentTooShortError (a URLError
|
||||||
|
subclass). The exception's .result attribute contains the (filename,
|
||||||
|
headers) that would have been returned.
|
||||||
|
|
||||||
|
"""
|
||||||
|
req = self._request(fullurl, data, False)
|
||||||
|
scheme = req.get_type()
|
||||||
|
fp = self.open(req)
|
||||||
|
headers = fp.info()
|
||||||
|
if filename is None and scheme == 'file':
|
||||||
|
# XXX req.get_selector() seems broken here, return None,
|
||||||
|
# pending sanity :-/
|
||||||
|
return None, headers
|
||||||
|
#return urllib.url2pathname(req.get_selector()), headers
|
||||||
|
if filename:
|
||||||
|
tfp = open(filename, 'wb')
|
||||||
|
else:
|
||||||
|
path = _rfc3986.urlsplit(fullurl)[2]
|
||||||
|
suffix = os.path.splitext(path)[1]
|
||||||
|
fd, filename = tempfile.mkstemp(suffix)
|
||||||
|
self._tempfiles.append(filename)
|
||||||
|
tfp = os.fdopen(fd, 'wb')
|
||||||
|
|
||||||
|
result = filename, headers
|
||||||
|
bs = self.BLOCK_SIZE
|
||||||
|
size = -1
|
||||||
|
read = 0
|
||||||
|
blocknum = 0
|
||||||
|
if reporthook:
|
||||||
|
if "content-length" in headers:
|
||||||
|
size = int(headers["Content-Length"])
|
||||||
|
reporthook(blocknum, bs, size)
|
||||||
|
while 1:
|
||||||
|
block = fp.read(bs)
|
||||||
|
if block == "":
|
||||||
|
break
|
||||||
|
read += len(block)
|
||||||
|
tfp.write(block)
|
||||||
|
blocknum += 1
|
||||||
|
if reporthook:
|
||||||
|
reporthook(blocknum, bs, size)
|
||||||
|
fp.close()
|
||||||
|
tfp.close()
|
||||||
|
del fp
|
||||||
|
del tfp
|
||||||
|
|
||||||
|
# raise exception if actual size does not match content-length header
|
||||||
|
if size >= 0 and read < size:
|
||||||
|
raise ContentTooShortError(
|
||||||
|
"retrieval incomplete: "
|
||||||
|
"got only %i out of %i bytes" % (read, size),
|
||||||
|
result
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
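
A usage sketch with a trivial progress hook (the URL is a placeholder); the same keyword arguments work through the module-level urlretrieve() defined further down:

    def show_progress(block_num, block_size, total_size):
        # total_size is -1 when the server sent no Content-Length header
        print "read block", block_num

    opener = build_opener()
    filename, headers = opener.retrieve("http://example.com/some.bin",
                                        reporthook=show_progress)
    print "saved to", filename
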
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
urllib2.OpenerDirector.close(self)
|
||||||
|
|
||||||
|
# make it very obvious this object is no longer supposed to be used
|
||||||
|
self.open = self.error = self.retrieve = self.add_handler = None
|
||||||
|
|
||||||
|
if self._tempfiles:
|
||||||
|
for filename in self._tempfiles:
|
||||||
|
try:
|
||||||
|
os.unlink(filename)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
del self._tempfiles[:]
|
||||||
|
|
||||||
|
|
||||||
|
def wrapped_open(urlopen, process_response_object, fullurl, data=None):
|
||||||
|
success = True
|
||||||
|
try:
|
||||||
|
response = urlopen(fullurl, data)
|
||||||
|
except urllib2.HTTPError, error:
|
||||||
|
success = False
|
||||||
|
if error.fp is None: # not a response
|
||||||
|
raise
|
||||||
|
response = error
|
||||||
|
|
||||||
|
if response is not None:
|
||||||
|
response = process_response_object(response)
|
||||||
|
|
||||||
|
if not success:
|
||||||
|
raise response
|
||||||
|
return response
|
||||||
|
|
||||||
|
class ResponseProcessingOpener(OpenerDirector):
|
||||||
|
|
||||||
|
def open(self, fullurl, data=None):
|
||||||
|
def bound_open(fullurl, data=None):
|
||||||
|
return OpenerDirector.open(self, fullurl, data)
|
||||||
|
return wrapped_open(
|
||||||
|
bound_open, self.process_response_object, fullurl, data)
|
||||||
|
|
||||||
|
def process_response_object(self, response):
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
class SeekableResponseOpener(ResponseProcessingOpener):
|
||||||
|
def process_response_object(self, response):
|
||||||
|
return _response.seek_wrapped_response(response)
|
||||||
|
|
||||||
|
|
||||||
|
class OpenerFactory:
|
||||||
|
"""This class's interface is quite likely to change."""
|
||||||
|
|
||||||
|
default_classes = [
|
||||||
|
# handlers
|
||||||
|
urllib2.ProxyHandler,
|
||||||
|
urllib2.UnknownHandler,
|
||||||
|
_http.HTTPHandler, # derived from new AbstractHTTPHandler
|
||||||
|
_http.HTTPDefaultErrorHandler,
|
||||||
|
_http.HTTPRedirectHandler, # bugfixed
|
||||||
|
urllib2.FTPHandler,
|
||||||
|
urllib2.FileHandler,
|
||||||
|
# processors
|
||||||
|
_upgrade.HTTPRequestUpgradeProcessor,
|
||||||
|
_http.HTTPCookieProcessor,
|
||||||
|
_http.HTTPErrorProcessor,
|
||||||
|
]
|
||||||
|
if hasattr(httplib, 'HTTPS'):
|
||||||
|
default_classes.append(_http.HTTPSHandler)
|
||||||
|
handlers = []
|
||||||
|
replacement_handlers = []
|
||||||
|
|
||||||
|
def __init__(self, klass=OpenerDirector):
|
||||||
|
self.klass = klass
|
||||||
|
|
||||||
|
def build_opener(self, *handlers):
|
||||||
|
"""Create an opener object from a list of handlers and processors.
|
||||||
|
|
||||||
|
The opener will use several default handlers and processors, including
|
||||||
|
support for HTTP and FTP.
|
||||||
|
|
||||||
|
If any of the handlers passed as arguments are subclasses of the
|
||||||
|
default handlers, the default handlers will not be used.
|
||||||
|
|
||||||
|
"""
|
||||||
|
opener = self.klass()
|
||||||
|
default_classes = list(self.default_classes)
|
||||||
|
skip = []
|
||||||
|
for klass in default_classes:
|
||||||
|
for check in handlers:
|
||||||
|
if type(check) == types.ClassType:
|
||||||
|
if issubclass(check, klass):
|
||||||
|
skip.append(klass)
|
||||||
|
elif type(check) == types.InstanceType:
|
||||||
|
if isinstance(check, klass):
|
||||||
|
skip.append(klass)
|
||||||
|
for klass in skip:
|
||||||
|
default_classes.remove(klass)
|
||||||
|
|
||||||
|
for klass in default_classes:
|
||||||
|
opener.add_handler(klass())
|
||||||
|
for h in handlers:
|
||||||
|
if type(h) == types.ClassType:
|
||||||
|
h = h()
|
||||||
|
opener.add_handler(h)
|
||||||
|
|
||||||
|
return opener
|
||||||
|
|
||||||
|
|
||||||
|
build_opener = OpenerFactory().build_opener
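
A sketch of composing an opener from the public names the package exports (attaching a cookie jar is just one typical use):

    cj = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    response = opener.open("http://example.com/")
    print response.geturl()
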
|
||||||
|
|
||||||
|
_opener = None
|
||||||
|
urlopen_lock = _threading.Lock()
|
||||||
|
def urlopen(url, data=None):
|
||||||
|
global _opener
|
||||||
|
if _opener is None:
|
||||||
|
urlopen_lock.acquire()
|
||||||
|
try:
|
||||||
|
if _opener is None:
|
||||||
|
_opener = build_opener()
|
||||||
|
finally:
|
||||||
|
urlopen_lock.release()
|
||||||
|
return _opener.open(url, data)
|
||||||
|
|
||||||
|
def urlretrieve(url, filename=None, reporthook=None, data=None):
|
||||||
|
global _opener
|
||||||
|
if _opener is None:
|
||||||
|
urlopen_lock.acquire()
|
||||||
|
try:
|
||||||
|
if _opener is None:
|
||||||
|
_opener = build_opener()
|
||||||
|
finally:
|
||||||
|
urlopen_lock.release()
|
||||||
|
return _opener.retrieve(url, filename, reporthook, data)
|
||||||
|
|
||||||
|
def install_opener(opener):
|
||||||
|
global _opener
|
||||||
|
_opener = opener
|
334
src/calibre/utils/mechanize/_pullparser.py
Normal file
@ -0,0 +1,334 @@
|
|||||||
|
"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
|
||||||
|
This program extracts all links from a document. It will print one
|
||||||
|
line for each link, containing the URL and the textual description
|
||||||
|
between the <A>...</A> tags:
|
||||||
|
|
||||||
|
import pullparser, sys
|
||||||
|
f = file(sys.argv[1])
|
||||||
|
p = pullparser.PullParser(f)
|
||||||
|
for token in p.tags("a"):
|
||||||
|
if token.type == "endtag": continue
|
||||||
|
url = dict(token.attrs).get("href", "-")
|
||||||
|
text = p.get_compressed_text(endat=("endtag", "a"))
|
||||||
|
print "%s\t%s" % (url, text)
|
||||||
|
|
||||||
|
This program extracts the <TITLE> from the document:
|
||||||
|
|
||||||
|
import pullparser, sys
|
||||||
|
f = file(sys.argv[1])
|
||||||
|
p = pullparser.PullParser(f)
|
||||||
|
if p.get_tag("title"):
|
||||||
|
title = p.get_compressed_text()
|
||||||
|
print "Title: %s" % title
|
||||||
|
|
||||||
|
|
||||||
|
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
||||||
|
Copyright 1998-2001 Gisle Aas (original libwww-perl code)
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re, htmlentitydefs
|
||||||
|
import sgmllib, HTMLParser
|
||||||
|
|
||||||
|
from _html import unescape, unescape_charref
|
||||||
|
|
||||||
|
|
||||||
|
class NoMoreTokensError(Exception): pass
|
||||||
|
|
||||||
|
class Token:
|
||||||
|
"""Represents an HTML tag, declaration, processing instruction etc.
|
||||||
|
|
||||||
|
Behaves as both a tuple-like object (ie. iterable) and has attributes
|
||||||
|
.type, .data and .attrs.
|
||||||
|
|
||||||
|
>>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
|
||||||
|
>>> t == ("starttag", "a", [("href", "http://www.python.org/")])
|
||||||
|
True
|
||||||
|
>>> (t.type, t.data) == ("starttag", "a")
|
||||||
|
True
|
||||||
|
>>> t.attrs == [("href", "http://www.python.org/")]
|
||||||
|
True
|
||||||
|
|
||||||
|
Public attributes
|
||||||
|
|
||||||
|
type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
|
||||||
|
"data", "comment", "decl", "pi", after the corresponding methods of
|
||||||
|
HTMLParser.HTMLParser
|
||||||
|
data: For a tag, the tag name; otherwise, the relevant data carried by the
|
||||||
|
tag, as a string
|
||||||
|
attrs: list of (name, value) pairs representing HTML attributes
|
||||||
|
(or None if token does not represent an opening tag)
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self, type, data, attrs=None):
|
||||||
|
self.type = type
|
||||||
|
self.data = data
|
||||||
|
self.attrs = attrs
|
||||||
|
def __iter__(self):
|
||||||
|
return iter((self.type, self.data, self.attrs))
|
||||||
|
def __eq__(self, other):
|
||||||
|
type, data, attrs = other
|
||||||
|
if (self.type == type and
|
||||||
|
self.data == data and
|
||||||
|
self.attrs == attrs):
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
def __ne__(self, other): return not self.__eq__(other)
|
||||||
|
def __repr__(self):
|
||||||
|
args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
|
||||||
|
return self.__class__.__name__+"(%s)" % args
|
||||||
|
|
||||||
|
def iter_until_exception(fn, exception, *args, **kwds):
|
||||||
|
while 1:
|
||||||
|
try:
|
||||||
|
yield fn(*args, **kwds)
|
||||||
|
except exception:
|
||||||
|
raise StopIteration
|
||||||
|
|
||||||
|
|
||||||
|
class _AbstractParser:
|
||||||
|
chunk = 1024
|
||||||
|
compress_re = re.compile(r"\s+")
|
||||||
|
def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
|
||||||
|
encoding="ascii", entitydefs=None):
|
||||||
|
"""
|
||||||
|
fh: file-like object (only a .read() method is required) from which to
|
||||||
|
read HTML to be parsed
|
||||||
|
textify: mapping used by .get_text() and .get_compressed_text() methods
|
||||||
|
to represent opening tags as text
|
||||||
|
encoding: encoding used to encode numeric character references by
|
||||||
|
.get_text() and .get_compressed_text() ("ascii" by default)
|
||||||
|
|
||||||
|
entitydefs: mapping like {"amp": "&", ...} containing HTML entity
|
||||||
|
definitions (a sensible default is used). This is used to unescape
|
||||||
|
entities in .get_text() (and .get_compressed_text()) and attribute
|
||||||
|
values. If the encoding can not represent the character, the entity
|
||||||
|
reference is left unescaped. Note that entity references (both
|
||||||
|
numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
|
||||||
|
unescaped in attribute values and the return value of .get_text(), but
|
||||||
|
not in data outside of tags. Instead, entity references outside of
|
||||||
|
tags are represented as tokens. This is a bit odd, it's true :-/
|
||||||
|
|
||||||
|
If the element name of an opening tag matches a key in the textify
|
||||||
|
mapping then that tag is converted to text. The corresponding value is
|
||||||
|
used to specify which tag attribute to obtain the text from. textify
|
||||||
|
maps from element names to either:
|
||||||
|
|
||||||
|
- an HTML attribute name, in which case the HTML attribute value is
|
||||||
|
used as its text value along with the element name in square
|
||||||
|
brackets (eg."alt text goes here[IMG]", or, if the alt attribute
|
||||||
|
were missing, just "[IMG]")
|
||||||
|
- a callable object (eg. a function) which takes a Token and returns
|
||||||
|
the string to be used as its text value
|
||||||
|
|
||||||
|
If textify has no key for an element name, nothing is substituted for
|
||||||
|
the opening tag.
|
||||||
|
|
||||||
|
Public attributes:
|
||||||
|
|
||||||
|
encoding and textify: see above
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._fh = fh
|
||||||
|
self._tokenstack = [] # FIFO
|
||||||
|
self.textify = textify
|
||||||
|
self.encoding = encoding
|
||||||
|
if entitydefs is None:
|
||||||
|
entitydefs = htmlentitydefs.name2codepoint
|
||||||
|
self._entitydefs = entitydefs
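
A sketch of the textify behaviour described above, using the PullParser subclass defined later in this module (the HTML snippet is made up):

    from StringIO import StringIO

    html = '<p>A <img src="x.png" alt="tiny"> example</p>'
    p = PullParser(StringIO(html), textify={"img": "alt"})
    p.get_tag("p")
    print p.get_compressed_text(endat=("endtag", "p"))   # prints: A tiny[IMG] example
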
|
||||||
|
|
||||||
|
def __iter__(self): return self
|
||||||
|
|
||||||
|
def tags(self, *names):
|
||||||
|
return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
|
||||||
|
|
||||||
|
def tokens(self, *tokentypes):
|
||||||
|
return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)
|
||||||
|
|
||||||
|
def next(self):
|
||||||
|
try:
|
||||||
|
return self.get_token()
|
||||||
|
except NoMoreTokensError:
|
||||||
|
raise StopIteration()
|
||||||
|
|
||||||
|
def get_token(self, *tokentypes):
|
||||||
|
"""Pop the next Token object from the stack of parsed tokens.
|
||||||
|
|
||||||
|
If arguments are given, they are taken to be token types in which the
|
||||||
|
caller is interested: tokens representing other elements will be
|
||||||
|
skipped. Element names must be given in lower case.
|
||||||
|
|
||||||
|
Raises NoMoreTokensError.
|
||||||
|
|
||||||
|
"""
|
||||||
|
while 1:
|
||||||
|
while self._tokenstack:
|
||||||
|
token = self._tokenstack.pop(0)
|
||||||
|
if tokentypes:
|
||||||
|
if token.type in tokentypes:
|
||||||
|
return token
|
||||||
|
else:
|
||||||
|
return token
|
||||||
|
data = self._fh.read(self.chunk)
|
||||||
|
if not data:
|
||||||
|
raise NoMoreTokensError()
|
||||||
|
self.feed(data)
|
||||||
|
|
||||||
|
def unget_token(self, token):
|
||||||
|
"""Push a Token back onto the stack."""
|
||||||
|
self._tokenstack.insert(0, token)
|
||||||
|
|
||||||
|
def get_tag(self, *names):
|
||||||
|
"""Return the next Token that represents an opening or closing tag.
|
||||||
|
|
||||||
|
If arguments are given, they are taken to be element names in which the
|
||||||
|
caller is interested: tags representing other elements will be skipped.
|
||||||
|
Element names must be given in lower case.
|
||||||
|
|
||||||
|
Raises NoMoreTokensError.
|
||||||
|
|
||||||
|
"""
|
||||||
|
while 1:
|
||||||
|
tok = self.get_token()
|
||||||
|
if tok.type not in ["starttag", "endtag", "startendtag"]:
|
||||||
|
continue
|
||||||
|
if names:
|
||||||
|
if tok.data in names:
|
||||||
|
return tok
|
||||||
|
else:
|
||||||
|
return tok
|
||||||
|
|
||||||
|
def get_text(self, endat=None):
|
||||||
|
"""Get some text.
|
||||||
|
|
||||||
|
endat: stop reading text at this tag (the tag is included in the
|
||||||
|
returned text); endtag is a tuple (type, name) where type is
|
||||||
|
"starttag", "endtag" or "startendtag", and name is the element name of
|
||||||
|
the tag (element names must be given in lower case)
|
||||||
|
|
||||||
|
If endat is not given, .get_text() will stop at the next opening or
|
||||||
|
closing tag, or when there are no more tokens (no exception is raised).
|
||||||
|
Note that .get_text() includes the text representation (if any) of the
|
||||||
|
opening tag, but pushes the opening tag back onto the stack. As a
|
||||||
|
result, if you want to call .get_text() again, you need to call
|
||||||
|
.get_tag() first (unless you want an empty string returned when you
|
||||||
|
next call .get_text()).
|
||||||
|
|
||||||
|
Entity references are translated using the value of the entitydefs
|
||||||
|
constructor argument (a mapping from names to characters like that
|
||||||
|
provided by the standard module htmlentitydefs). Named entity
|
||||||
|
references that are not in this mapping are left unchanged.
|
||||||
|
|
||||||
|
The textify attribute is used to translate opening tags into text: see
|
||||||
|
the class docstring.
|
||||||
|
|
||||||
|
"""
|
||||||
|
text = []
|
||||||
|
tok = None
|
||||||
|
while 1:
|
||||||
|
try:
|
||||||
|
tok = self.get_token()
|
||||||
|
except NoMoreTokensError:
|
||||||
|
# unget last token (not the one we just failed to get)
|
||||||
|
if tok: self.unget_token(tok)
|
||||||
|
break
|
||||||
|
if tok.type == "data":
|
||||||
|
text.append(tok.data)
|
||||||
|
elif tok.type == "entityref":
|
||||||
|
t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
|
||||||
|
text.append(t)
|
||||||
|
elif tok.type == "charref":
|
||||||
|
t = unescape_charref(tok.data, self.encoding)
|
||||||
|
text.append(t)
|
||||||
|
elif tok.type in ["starttag", "endtag", "startendtag"]:
|
||||||
|
tag_name = tok.data
|
||||||
|
if tok.type in ["starttag", "startendtag"]:
|
||||||
|
alt = self.textify.get(tag_name)
|
||||||
|
if alt is not None:
|
||||||
|
if callable(alt):
|
||||||
|
text.append(alt(tok))
|
||||||
|
elif tok.attrs is not None:
|
||||||
|
for k, v in tok.attrs:
|
||||||
|
if k == alt:
|
||||||
|
text.append(v)
|
||||||
|
text.append("[%s]" % tag_name.upper())
|
||||||
|
if endat is None or endat == (tok.type, tag_name):
|
||||||
|
self.unget_token(tok)
|
||||||
|
break
|
||||||
|
return "".join(text)
|
||||||
|
|
||||||
|
def get_compressed_text(self, *args, **kwds):
|
||||||
|
"""
|
||||||
|
As .get_text(), but collapses each group of contiguous whitespace to a
|
||||||
|
single space character, and removes all initial and trailing
|
||||||
|
whitespace.
|
||||||
|
|
||||||
|
"""
|
||||||
|
text = self.get_text(*args, **kwds)
|
||||||
|
text = text.strip()
|
||||||
|
return self.compress_re.sub(" ", text)
|
||||||
|
|
||||||
|
def handle_startendtag(self, tag, attrs):
|
||||||
|
self._tokenstack.append(Token("startendtag", tag, attrs))
|
||||||
|
def handle_starttag(self, tag, attrs):
|
||||||
|
self._tokenstack.append(Token("starttag", tag, attrs))
|
||||||
|
def handle_endtag(self, tag):
|
||||||
|
self._tokenstack.append(Token("endtag", tag))
|
||||||
|
def handle_charref(self, name):
|
||||||
|
self._tokenstack.append(Token("charref", name))
|
||||||
|
def handle_entityref(self, name):
|
||||||
|
self._tokenstack.append(Token("entityref", name))
|
||||||
|
def handle_data(self, data):
|
||||||
|
self._tokenstack.append(Token("data", data))
|
||||||
|
def handle_comment(self, data):
|
||||||
|
self._tokenstack.append(Token("comment", data))
|
||||||
|
def handle_decl(self, decl):
|
||||||
|
self._tokenstack.append(Token("decl", decl))
|
||||||
|
def unknown_decl(self, data):
|
||||||
|
# XXX should this call self.error instead?
|
||||||
|
#self.error("unknown declaration: " + `data`)
|
||||||
|
self._tokenstack.append(Token("decl", data))
|
||||||
|
def handle_pi(self, data):
|
||||||
|
self._tokenstack.append(Token("pi", data))
|
||||||
|
|
||||||
|
def unescape_attr(self, name):
|
||||||
|
return unescape(name, self._entitydefs, self.encoding)
|
||||||
|
def unescape_attrs(self, attrs):
|
||||||
|
escaped_attrs = []
|
||||||
|
for key, val in attrs:
|
||||||
|
escaped_attrs.append((key, self.unescape_attr(val)))
|
||||||
|
return escaped_attrs
|
||||||
|
|
||||||
|
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
|
||||||
|
def __init__(self, *args, **kwds):
|
||||||
|
HTMLParser.HTMLParser.__init__(self)
|
||||||
|
_AbstractParser.__init__(self, *args, **kwds)
|
||||||
|
def unescape(self, name):
|
||||||
|
# Use the entitydefs passed into constructor, not
|
||||||
|
# HTMLParser.HTMLParser's entitydefs.
|
||||||
|
return self.unescape_attr(name)
|
||||||
|
|
||||||
|
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
|
||||||
|
def __init__(self, *args, **kwds):
|
||||||
|
sgmllib.SGMLParser.__init__(self)
|
||||||
|
_AbstractParser.__init__(self, *args, **kwds)
|
||||||
|
def unknown_starttag(self, tag, attrs):
|
||||||
|
attrs = self.unescape_attrs(attrs)
|
||||||
|
self._tokenstack.append(Token("starttag", tag, attrs))
|
||||||
|
def unknown_endtag(self, tag):
|
||||||
|
self._tokenstack.append(Token("endtag", tag))
|
||||||
|
|
||||||
|
|
||||||
|
def _test():
|
||||||
|
import doctest, _pullparser
|
||||||
|
return doctest.testmod(_pullparser)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
_test()
|
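For orientation, a minimal sketch of driving the pull parser defined in this file; the HTML snippet, the StringIO wrapper and the element name are illustrative only, not part of the patch:

from StringIO import StringIO

html = StringIO("<html><body><h1>Chapter One</h1>"
                "<p>Some more text.</p></body></html>")
p = TolerantPullParser(html)                           # any object with .read() works
p.get_tag("h1")                                        # skip ahead to the first <h1>
print p.get_compressed_text(endat=("endtag", "h1"))    # prints "Chapter One"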
86
src/calibre/utils/mechanize/_request.py
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
"""Integration with Python standard library module urllib2: Request class.
|
||||||
|
|
||||||
|
Copyright 2004-2006 John J Lee <jjl@pobox.com>
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses (see the file
|
||||||
|
COPYING.txt included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import urllib2, urllib, logging
|
||||||
|
|
||||||
|
from _clientcookie import request_host
|
||||||
|
import _rfc3986
|
||||||
|
|
||||||
|
warn = logging.getLogger("mechanize").warning
|
||||||
|
# don't complain about missing logging handler
|
||||||
|
logging.getLogger("mechanize").setLevel(logging.ERROR)
|
||||||
|
|
||||||
|
|
||||||
|
class Request(urllib2.Request):
|
||||||
|
def __init__(self, url, data=None, headers={},
|
||||||
|
origin_req_host=None, unverifiable=False, visit=None):
|
||||||
|
# In mechanize 0.2, the interpretation of a unicode url argument will
|
||||||
|
# change: A unicode url argument will be interpreted as an IRI, and a
|
||||||
|
# bytestring as a URI. For now, we accept unicode or bytestring. We
|
||||||
|
# don't insist that the value is always a URI (specifically, must only
|
||||||
|
# contain characters which are legal), because that might break working
|
||||||
|
# code (who knows what bytes some servers want to see, especially with
|
||||||
|
# browser plugins for internationalised URIs).
|
||||||
|
if not _rfc3986.is_clean_uri(url):
|
||||||
|
warn("url argument is not a URI "
|
||||||
|
"(contains illegal characters) %r" % url)
|
||||||
|
urllib2.Request.__init__(self, url, data, headers)
|
||||||
|
self.selector = None
|
||||||
|
self.unredirected_hdrs = {}
|
||||||
|
self.visit = visit
|
||||||
|
|
||||||
|
# All the terminology below comes from RFC 2965.
|
||||||
|
self.unverifiable = unverifiable
|
||||||
|
# Set request-host of origin transaction.
|
||||||
|
# The origin request-host is needed in order to decide whether
|
||||||
|
# unverifiable sub-requests (automatic redirects, images embedded
|
||||||
|
# in HTML, etc.) are to third-party hosts. If they are, the
|
||||||
|
# resulting transactions might need to be conducted with cookies
|
||||||
|
# turned off.
|
||||||
|
if origin_req_host is None:
|
||||||
|
origin_req_host = request_host(self)
|
||||||
|
self.origin_req_host = origin_req_host
|
||||||
|
|
||||||
|
def get_selector(self):
|
||||||
|
return urllib.splittag(self.__r_host)[0]
|
||||||
|
|
||||||
|
def get_origin_req_host(self):
|
||||||
|
return self.origin_req_host
|
||||||
|
|
||||||
|
def is_unverifiable(self):
|
||||||
|
return self.unverifiable
|
||||||
|
|
||||||
|
def add_unredirected_header(self, key, val):
|
||||||
|
"""Add a header that will not be added to a redirected request."""
|
||||||
|
self.unredirected_hdrs[key.capitalize()] = val
|
||||||
|
|
||||||
|
def has_header(self, header_name):
|
||||||
|
"""True iff request has named header (regular or unredirected)."""
|
||||||
|
return (header_name in self.headers or
|
||||||
|
header_name in self.unredirected_hdrs)
|
||||||
|
|
||||||
|
def get_header(self, header_name, default=None):
|
||||||
|
return self.headers.get(
|
||||||
|
header_name,
|
||||||
|
self.unredirected_hdrs.get(header_name, default))
|
||||||
|
|
||||||
|
def header_items(self):
|
||||||
|
hdrs = self.unredirected_hdrs.copy()
|
||||||
|
hdrs.update(self.headers)
|
||||||
|
return hdrs.items()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return "<Request for %s>" % self.get_full_url()
|
||||||
|
|
||||||
|
def get_method(self):
|
||||||
|
if self.has_data():
|
||||||
|
return "POST"
|
||||||
|
else:
|
||||||
|
return "GET"
|
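A short sketch of the Request subclass defined above; the URL, body and header values are placeholders chosen for illustration:

req = Request("http://example.com/search", data="q=calibre")
req.add_unredirected_header("Referer", "http://example.com/")
print req.get_method()             # "POST", because body data was supplied
print req.has_header("Referer")    # True: checks normal and unredirected headers
print req                          # <Request for http://example.com/search>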
515
src/calibre/utils/mechanize/_response.py
Normal file
@ -0,0 +1,515 @@
|
|||||||
|
"""Response classes.
|
||||||
|
|
||||||
|
The seek_wrapper code is not used if you're using UserAgent with
|
||||||
|
.set_seekable_responses(False), or if you're using the urllib2-level interface
|
||||||
|
without SeekableProcessor or HTTPEquivProcessor. Class closeable_response is
|
||||||
|
instantiated by some handlers (AbstractHTTPHandler), but the closeable_response
|
||||||
|
interface is only depended upon by Browser-level code. Function
|
||||||
|
upgrade_response is only used if you're using Browser or
|
||||||
|
ResponseUpgradeProcessor.
|
||||||
|
|
||||||
|
|
||||||
|
Copyright 2006 John J. Lee <jjl@pobox.com>
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it
|
||||||
|
under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
||||||
|
included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import copy, mimetools
|
||||||
|
from cStringIO import StringIO
|
||||||
|
import urllib2
|
||||||
|
|
||||||
|
# XXX Andrew Dalke kindly sent me a similar class in response to my request on
|
||||||
|
# comp.lang.python, which I then proceeded to lose. I wrote this class
|
||||||
|
# instead, but I think he's released his code publicly since, could pinch the
|
||||||
|
# tests from it, at least...
|
||||||
|
|
||||||
|
# For testing seek_wrapper invariant (note that
|
||||||
|
# test_urllib2.HandlerTest.test_seekable is expected to fail when this
|
||||||
|
# invariant checking is turned on). The invariant checking is done by module
|
||||||
|
# ipdc, which is available here:
|
||||||
|
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
|
||||||
|
## from ipdbc import ContractBase
|
||||||
|
## class seek_wrapper(ContractBase):
|
||||||
|
class seek_wrapper:
|
||||||
|
"""Adds a seek method to a file object.
|
||||||
|
|
||||||
|
This is only designed for seeking on readonly file-like objects.
|
||||||
|
|
||||||
|
Wrapped file-like object must have a read method. The readline method is
|
||||||
|
only supported if that method is present on the wrapped object. The
|
||||||
|
readlines method is always supported. xreadlines and iteration are
|
||||||
|
supported only for Python 2.2 and above.
|
||||||
|
|
||||||
|
Public attributes:
|
||||||
|
|
||||||
|
wrapped: the wrapped file object
|
||||||
|
is_closed: true iff .close() has been called
|
||||||
|
|
||||||
|
WARNING: All other attributes of the wrapped object (ie. those that are not
|
||||||
|
one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
|
||||||
|
are passed through unaltered, which may or may not make sense for your
|
||||||
|
particular file object.
|
||||||
|
|
||||||
|
"""
|
||||||
|
# General strategy is to check that cache is full enough, then delegate to
|
||||||
|
# the cache (self.__cache, which is a cStringIO.StringIO instance). A seek
|
||||||
|
# position (self.__pos) is maintained independently of the cache, in order
|
||||||
|
# that a single cache may be shared between multiple seek_wrapper objects.
|
||||||
|
# Copying using module copy shares the cache in this way.
|
||||||
|
|
||||||
|
def __init__(self, wrapped):
|
||||||
|
self.wrapped = wrapped
|
||||||
|
self.__read_complete_state = [False]
|
||||||
|
self.__is_closed_state = [False]
|
||||||
|
self.__have_readline = hasattr(self.wrapped, "readline")
|
||||||
|
self.__cache = StringIO()
|
||||||
|
self.__pos = 0 # seek position
|
||||||
|
|
||||||
|
def invariant(self):
|
||||||
|
# The end of the cache is always at the same place as the end of the
|
||||||
|
# wrapped file.
|
||||||
|
return self.wrapped.tell() == len(self.__cache.getvalue())
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.wrapped.close()
|
||||||
|
self.is_closed = True
|
||||||
|
|
||||||
|
def __getattr__(self, name):
|
||||||
|
if name == "is_closed":
|
||||||
|
return self.__is_closed_state[0]
|
||||||
|
elif name == "read_complete":
|
||||||
|
return self.__read_complete_state[0]
|
||||||
|
|
||||||
|
wrapped = self.__dict__.get("wrapped")
|
||||||
|
if wrapped:
|
||||||
|
return getattr(wrapped, name)
|
||||||
|
|
||||||
|
return getattr(self.__class__, name)
|
||||||
|
|
||||||
|
def __setattr__(self, name, value):
|
||||||
|
if name == "is_closed":
|
||||||
|
self.__is_closed_state[0] = bool(value)
|
||||||
|
elif name == "read_complete":
|
||||||
|
if not self.is_closed:
|
||||||
|
self.__read_complete_state[0] = bool(value)
|
||||||
|
else:
|
||||||
|
self.__dict__[name] = value
|
||||||
|
|
||||||
|
def seek(self, offset, whence=0):
|
||||||
|
assert whence in [0,1,2]
|
||||||
|
|
||||||
|
# how much data, if any, do we need to read?
|
||||||
|
if whence == 2: # 2: relative to end of *wrapped* file
|
||||||
|
if offset < 0: raise ValueError("negative seek offset")
|
||||||
|
# since we don't know yet where the end of that file is, we must
|
||||||
|
# read everything
|
||||||
|
to_read = None
|
||||||
|
else:
|
||||||
|
if whence == 0: # 0: absolute
|
||||||
|
if offset < 0: raise ValueError("negative seek offset")
|
||||||
|
dest = offset
|
||||||
|
else: # 1: relative to current position
|
||||||
|
pos = self.__pos
|
||||||
|
if pos < offset:
|
||||||
|
raise ValueError("seek to before start of file")
|
||||||
|
dest = pos + offset
|
||||||
|
end = len(self.__cache.getvalue())
|
||||||
|
to_read = dest - end
|
||||||
|
if to_read < 0:
|
||||||
|
to_read = 0
|
||||||
|
|
||||||
|
if to_read != 0:
|
||||||
|
self.__cache.seek(0, 2)
|
||||||
|
if to_read is None:
|
||||||
|
assert whence == 2
|
||||||
|
self.__cache.write(self.wrapped.read())
|
||||||
|
self.read_complete = True
|
||||||
|
self.__pos = self.__cache.tell() - offset
|
||||||
|
else:
|
||||||
|
data = self.wrapped.read(to_read)
|
||||||
|
if not data:
|
||||||
|
self.read_complete = True
|
||||||
|
else:
|
||||||
|
self.__cache.write(data)
|
||||||
|
# Don't raise an exception even if we've seek()ed past the end
|
||||||
|
# of .wrapped, since fseek() doesn't complain in that case.
|
||||||
|
# Also like fseek(), pretend we have seek()ed past the end,
|
||||||
|
# i.e. not:
|
||||||
|
#self.__pos = self.__cache.tell()
|
||||||
|
# but rather:
|
||||||
|
self.__pos = dest
|
||||||
|
else:
|
||||||
|
self.__pos = dest
|
||||||
|
|
||||||
|
def tell(self):
|
||||||
|
return self.__pos
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
cpy = self.__class__(self.wrapped)
|
||||||
|
cpy.__cache = self.__cache
|
||||||
|
cpy.__read_complete_state = self.__read_complete_state
|
||||||
|
cpy.__is_closed_state = self.__is_closed_state
|
||||||
|
return cpy
|
||||||
|
|
||||||
|
def get_data(self):
|
||||||
|
pos = self.__pos
|
||||||
|
try:
|
||||||
|
self.seek(0)
|
||||||
|
return self.read(-1)
|
||||||
|
finally:
|
||||||
|
self.__pos = pos
|
||||||
|
|
||||||
|
def read(self, size=-1):
|
||||||
|
pos = self.__pos
|
||||||
|
end = len(self.__cache.getvalue())
|
||||||
|
available = end - pos
|
||||||
|
|
||||||
|
# enough data already cached?
|
||||||
|
if size <= available and size != -1:
|
||||||
|
self.__cache.seek(pos)
|
||||||
|
self.__pos = pos+size
|
||||||
|
return self.__cache.read(size)
|
||||||
|
|
||||||
|
# no, so read sufficient data from wrapped file and cache it
|
||||||
|
self.__cache.seek(0, 2)
|
||||||
|
if size == -1:
|
||||||
|
self.__cache.write(self.wrapped.read())
|
||||||
|
self.read_complete = True
|
||||||
|
else:
|
||||||
|
to_read = size - available
|
||||||
|
assert to_read > 0
|
||||||
|
data = self.wrapped.read(to_read)
|
||||||
|
if not data:
|
||||||
|
self.read_complete = True
|
||||||
|
else:
|
||||||
|
self.__cache.write(data)
|
||||||
|
self.__cache.seek(pos)
|
||||||
|
|
||||||
|
data = self.__cache.read(size)
|
||||||
|
self.__pos = self.__cache.tell()
|
||||||
|
assert self.__pos == pos + len(data)
|
||||||
|
return data
|
||||||
|
|
||||||
|
def readline(self, size=-1):
|
||||||
|
if not self.__have_readline:
|
||||||
|
raise NotImplementedError("no readline method on wrapped object")
|
||||||
|
|
||||||
|
# line we're about to read might not be complete in the cache, so
|
||||||
|
# read another line first
|
||||||
|
pos = self.__pos
|
||||||
|
self.__cache.seek(0, 2)
|
||||||
|
data = self.wrapped.readline()
|
||||||
|
if not data:
|
||||||
|
self.read_complete = True
|
||||||
|
else:
|
||||||
|
self.__cache.write(data)
|
||||||
|
self.__cache.seek(pos)
|
||||||
|
|
||||||
|
data = self.__cache.readline()
|
||||||
|
if size != -1:
|
||||||
|
r = data[:size]
|
||||||
|
self.__pos = pos+size
|
||||||
|
else:
|
||||||
|
r = data
|
||||||
|
self.__pos = pos+len(data)
|
||||||
|
return r
|
||||||
|
|
||||||
|
def readlines(self, sizehint=-1):
|
||||||
|
pos = self.__pos
|
||||||
|
self.__cache.seek(0, 2)
|
||||||
|
self.__cache.write(self.wrapped.read())
|
||||||
|
self.read_complete = True
|
||||||
|
self.__cache.seek(pos)
|
||||||
|
data = self.__cache.readlines(sizehint)
|
||||||
|
self.__pos = self.__cache.tell()
|
||||||
|
return data
|
||||||
|
|
||||||
|
def __iter__(self): return self
|
||||||
|
def next(self):
|
||||||
|
line = self.readline()
|
||||||
|
if line == "": raise StopIteration
|
||||||
|
return line
|
||||||
|
|
||||||
|
xreadlines = __iter__
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return ("<%s at %s whose wrapped object = %r>" %
|
||||||
|
(self.__class__.__name__, hex(abs(id(self))), self.wrapped))
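# A toy illustration of the caching behaviour described in the class docstring
# above (StringIO is already imported at the top of this module); the sample
# data is arbitrary:
#
#   >>> f = seek_wrapper(StringIO("abcdef"))
#   >>> f.read(3)        # read from the wrapped object and cached
#   'abc'
#   >>> f.seek(0)        # rewinding now works, served from the cache
#   >>> f.read()
#   'abcdef'
#   >>> f.read_complete  # the wrapped object has been read to its end
#   True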
|
||||||
|
|
||||||
|
|
||||||
|
class response_seek_wrapper(seek_wrapper):
|
||||||
|
|
||||||
|
"""
|
||||||
|
Supports copying response objects and setting response body data.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, wrapped):
|
||||||
|
seek_wrapper.__init__(self, wrapped)
|
||||||
|
self._headers = self.wrapped.info()
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
cpy = seek_wrapper.__copy__(self)
|
||||||
|
# copy headers from delegate
|
||||||
|
cpy._headers = copy.copy(self.info())
|
||||||
|
return cpy
|
||||||
|
|
||||||
|
# Note that .info() and .geturl() (the only two urllib2 response methods
|
||||||
|
# that are not implemented by seek_wrapper) must be here explicitly rather
|
||||||
|
# than by seek_wrapper's __getattr__ delegation) so that the nasty
|
||||||
|
# dynamically-created HTTPError classes in get_seek_wrapper_class() get the
|
||||||
|
# wrapped object's implementation, and not HTTPError's.
|
||||||
|
|
||||||
|
def info(self):
|
||||||
|
return self._headers
|
||||||
|
|
||||||
|
def geturl(self):
|
||||||
|
return self.wrapped.geturl()
|
||||||
|
|
||||||
|
def set_data(self, data):
|
||||||
|
self.seek(0)
|
||||||
|
self.read()
|
||||||
|
self.close()
|
||||||
|
cache = self._seek_wrapper__cache = StringIO()
|
||||||
|
cache.write(data)
|
||||||
|
self.seek(0)
|
||||||
|
|
||||||
|
|
||||||
|
class eoffile:
|
||||||
|
# file-like object that always claims to be at end-of-file...
|
||||||
|
def read(self, size=-1): return ""
|
||||||
|
def readline(self, size=-1): return ""
|
||||||
|
def __iter__(self): return self
|
||||||
|
def next(self): return ""
|
||||||
|
def close(self): pass
|
||||||
|
|
||||||
|
class eofresponse(eoffile):
|
||||||
|
def __init__(self, url, headers, code, msg):
|
||||||
|
self._url = url
|
||||||
|
self._headers = headers
|
||||||
|
self.code = code
|
||||||
|
self.msg = msg
|
||||||
|
def geturl(self): return self._url
|
||||||
|
def info(self): return self._headers
|
||||||
|
|
||||||
|
|
||||||
|
class closeable_response:
|
||||||
|
"""Avoids unnecessarily clobbering urllib.addinfourl methods on .close().
|
||||||
|
|
||||||
|
Only supports responses returned by mechanize.HTTPHandler.
|
||||||
|
|
||||||
|
After .close(), the following methods are supported:
|
||||||
|
|
||||||
|
.read()
|
||||||
|
.readline()
|
||||||
|
.info()
|
||||||
|
.geturl()
|
||||||
|
.__iter__()
|
||||||
|
.next()
|
||||||
|
.close()
|
||||||
|
|
||||||
|
and the following attributes are supported:
|
||||||
|
|
||||||
|
.code
|
||||||
|
.msg
|
||||||
|
|
||||||
|
Also supports pickling (but the stdlib currently does something to prevent
|
||||||
|
it: http://python.org/sf/1144636).
|
||||||
|
|
||||||
|
"""
|
||||||
|
# presence of this attr indicates the response is usable after .close()
|
||||||
|
closeable_response = None
|
||||||
|
|
||||||
|
def __init__(self, fp, headers, url, code, msg):
|
||||||
|
self._set_fp(fp)
|
||||||
|
self._headers = headers
|
||||||
|
self._url = url
|
||||||
|
self.code = code
|
||||||
|
self.msg = msg
|
||||||
|
|
||||||
|
def _set_fp(self, fp):
|
||||||
|
self.fp = fp
|
||||||
|
self.read = self.fp.read
|
||||||
|
self.readline = self.fp.readline
|
||||||
|
if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
|
||||||
|
if hasattr(self.fp, "fileno"):
|
||||||
|
self.fileno = self.fp.fileno
|
||||||
|
else:
|
||||||
|
self.fileno = lambda: None
|
||||||
|
self.__iter__ = self.fp.__iter__
|
||||||
|
self.next = self.fp.next
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<%s at %s whose fp = %r>' % (
|
||||||
|
self.__class__.__name__, hex(abs(id(self))), self.fp)
|
||||||
|
|
||||||
|
def info(self):
|
||||||
|
return self._headers
|
||||||
|
|
||||||
|
def geturl(self):
|
||||||
|
return self._url
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.fp._close = True
|
||||||
|
wrapped = self.fp
|
||||||
|
wrapped.close()
|
||||||
|
new_wrapped = eofresponse(
|
||||||
|
self._url, self._headers, self.code, self.msg)
|
||||||
|
self._set_fp(new_wrapped)
|
||||||
|
|
||||||
|
def __getstate__(self):
|
||||||
|
# There are three obvious options here:
|
||||||
|
# 1. truncate
|
||||||
|
# 2. read to end
|
||||||
|
# 3. close socket, pickle state including read position, then open
|
||||||
|
# again on unpickle and use Range header
|
||||||
|
# XXXX um, 4. refuse to pickle unless .close()d. This is better,
|
||||||
|
# actually ("errors should never pass silently"). Pickling doesn't
|
||||||
|
# work anyway ATM, because of http://python.org/sf/1144636 so fix
|
||||||
|
# this later
|
||||||
|
|
||||||
|
# 2 breaks pickle protocol, because one expects the original object
|
||||||
|
# to be left unscathed by pickling. 3 is too complicated and
|
||||||
|
# surprising (and too much work ;-) to happen in a sane __getstate__.
|
||||||
|
# So we do 1.
|
||||||
|
|
||||||
|
state = self.__dict__.copy()
|
||||||
|
new_wrapped = eofresponse(
|
||||||
|
self._url, self._headers, self.code, self.msg)
|
||||||
|
state["wrapped"] = new_wrapped
|
||||||
|
return state
|
||||||
|
|
||||||
|
def test_response(data='test data', headers=[],
|
||||||
|
url="http://example.com/", code=200, msg="OK"):
|
||||||
|
return make_response(data, headers, url, code, msg)
|
||||||
|
|
||||||
|
def test_html_response(data='test data', headers=[],
|
||||||
|
url="http://example.com/", code=200, msg="OK"):
|
||||||
|
headers += [("Content-type", "text/html")]
|
||||||
|
return make_response(data, headers, url, code, msg)
|
||||||
|
|
||||||
|
def make_response(data, headers, url, code, msg):
|
||||||
|
"""Convenient factory for objects implementing response interface.
|
||||||
|
|
||||||
|
data: string containing response body data
|
||||||
|
headers: sequence of (name, value) pairs
|
||||||
|
url: URL of response
|
||||||
|
code: integer response code (e.g. 200)
|
||||||
|
msg: string response code message (e.g. "OK")
|
||||||
|
|
||||||
|
"""
|
||||||
|
mime_headers = make_headers(headers)
|
||||||
|
r = closeable_response(StringIO(data), mime_headers, url, code, msg)
|
||||||
|
return response_seek_wrapper(r)
|
||||||
|
|
||||||
|
|
||||||
|
def make_headers(headers):
|
||||||
|
"""
|
||||||
|
headers: sequence of (name, value) pairs
|
||||||
|
"""
|
||||||
|
hdr_text = []
|
||||||
|
for name_value in headers:
|
||||||
|
hdr_text.append("%s: %s" % name_value)
|
||||||
|
return mimetools.Message(StringIO("\n".join(hdr_text)))
|
||||||
|
|
||||||
|
|
||||||
|
# Rest of this module is especially horrible, but needed, at least until we fork
|
||||||
|
# urllib2. Even then, may want to preserve urllib2 compatibility.
|
||||||
|
|
||||||
|
def get_seek_wrapper_class(response):
|
||||||
|
# in order to wrap response objects that are also exceptions, we must
|
||||||
|
# dynamically subclass the exception :-(((
|
||||||
|
if (isinstance(response, urllib2.HTTPError) and
|
||||||
|
not hasattr(response, "seek")):
|
||||||
|
if response.__class__.__module__ == "__builtin__":
|
||||||
|
exc_class_name = response.__class__.__name__
|
||||||
|
else:
|
||||||
|
exc_class_name = "%s.%s" % (
|
||||||
|
response.__class__.__module__, response.__class__.__name__)
|
||||||
|
|
||||||
|
class httperror_seek_wrapper(response_seek_wrapper, response.__class__):
|
||||||
|
# this only derives from HTTPError in order to be a subclass --
|
||||||
|
# the HTTPError behaviour comes from delegation
|
||||||
|
|
||||||
|
_exc_class_name = exc_class_name
|
||||||
|
|
||||||
|
def __init__(self, wrapped):
|
||||||
|
response_seek_wrapper.__init__(self, wrapped)
|
||||||
|
# be compatible with undocumented HTTPError attributes :-(
|
||||||
|
self.hdrs = wrapped.info()
|
||||||
|
self.filename = wrapped.geturl()
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return (
|
||||||
|
"<%s (%s instance) at %s "
|
||||||
|
"whose wrapped object = %r>" % (
|
||||||
|
self.__class__.__name__, self._exc_class_name,
|
||||||
|
hex(abs(id(self))), self.wrapped)
|
||||||
|
)
|
||||||
|
wrapper_class = httperror_seek_wrapper
|
||||||
|
else:
|
||||||
|
wrapper_class = response_seek_wrapper
|
||||||
|
return wrapper_class
|
||||||
|
|
||||||
|
def seek_wrapped_response(response):
|
||||||
|
"""Return a copy of response that supports seekable response interface.
|
||||||
|
|
||||||
|
Accepts responses from both mechanize and urllib2 handlers.
|
||||||
|
|
||||||
|
Copes with both ordinary response instances and HTTPError instances (which
|
||||||
|
can't be simply wrapped due to the requirement of preserving the exception
|
||||||
|
base class).
|
||||||
|
"""
|
||||||
|
if not hasattr(response, "seek"):
|
||||||
|
wrapper_class = get_seek_wrapper_class(response)
|
||||||
|
response = wrapper_class(response)
|
||||||
|
assert hasattr(response, "get_data")
|
||||||
|
return response
|
||||||
|
|
||||||
|
def upgrade_response(response):
|
||||||
|
"""Return a copy of response that supports Browser response interface.
|
||||||
|
|
||||||
|
Browser response interface is that of "seekable responses"
|
||||||
|
(response_seek_wrapper), plus the requirement that responses must be
|
||||||
|
useable after .close() (closeable_response).
|
||||||
|
|
||||||
|
Accepts responses from both mechanize and urllib2 handlers.
|
||||||
|
|
||||||
|
Copes with both ordinary response instances and HTTPError instances (which
|
||||||
|
can't be simply wrapped due to the requirement of preserving the exception
|
||||||
|
base class).
|
||||||
|
"""
|
||||||
|
wrapper_class = get_seek_wrapper_class(response)
|
||||||
|
if hasattr(response, "closeable_response"):
|
||||||
|
if not hasattr(response, "seek"):
|
||||||
|
response = wrapper_class(response)
|
||||||
|
assert hasattr(response, "get_data")
|
||||||
|
return copy.copy(response)
|
||||||
|
|
||||||
|
# a urllib2 handler constructed the response, i.e. the response is an
|
||||||
|
# urllib.addinfourl or a urllib2.HTTPError, instead of a
|
||||||
|
# _Util.closeable_response as returned by e.g. mechanize.HTTPHandler
|
||||||
|
try:
|
||||||
|
code = response.code
|
||||||
|
except AttributeError:
|
||||||
|
code = None
|
||||||
|
try:
|
||||||
|
msg = response.msg
|
||||||
|
except AttributeError:
|
||||||
|
msg = None
|
||||||
|
|
||||||
|
# may have already-.read() data from .seek() cache
|
||||||
|
data = None
|
||||||
|
get_data = getattr(response, "get_data", None)
|
||||||
|
if get_data:
|
||||||
|
data = get_data()
|
||||||
|
|
||||||
|
response = closeable_response(
|
||||||
|
response.fp, response.info(), response.geturl(), code, msg)
|
||||||
|
response = wrapper_class(response)
|
||||||
|
if data:
|
||||||
|
response.set_data(data)
|
||||||
|
return response
|
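As a rough sketch, make_response() above can be used to exercise the seekable response interface without any network traffic; the body, header and URL values here are made up:

r = make_response("hello world", [("Content-type", "text/plain")],
                  "http://example.com/", 200, "OK")
print r.read(5)                    # hello
r.seek(0)                          # the wrapper caches whatever has been read
print r.read()                     # hello world
print r.geturl()                   # http://example.com/
print r.info()["Content-type"]     # text/plain
print r.get_data()                 # whole body, without moving the read position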
240
src/calibre/utils/mechanize/_rfc3986.py
Normal file
@ -0,0 +1,240 @@
|
|||||||
|
"""RFC 3986 URI parsing and relative reference resolution / absolutization.
|
||||||
|
|
||||||
|
(aka splitting and joining)
|
||||||
|
|
||||||
|
Copyright 2006 John J. Lee <jjl@pobox.com>
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it under
|
||||||
|
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
||||||
|
included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
|
||||||
|
|
||||||
|
import sys, re, posixpath, urllib
|
||||||
|
|
||||||
|
## def chr_range(a, b):
|
||||||
|
## return "".join(map(chr, range(ord(a), ord(b)+1)))
|
||||||
|
|
||||||
|
## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||||
|
## "abcdefghijklmnopqrstuvwxyz"
|
||||||
|
## "0123456789"
|
||||||
|
## "-_.~")
|
||||||
|
## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
|
||||||
|
## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
|
||||||
|
# this re matches any character that's not in URI_CHARS
|
||||||
|
BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
|
||||||
|
|
||||||
|
|
||||||
|
def clean_url(url, encoding):
|
||||||
|
# percent-encode illegal URI characters
|
||||||
|
# Trying to come up with test cases for this gave me a headache, revisit
|
||||||
|
# when do switch to unicode.
|
||||||
|
# Somebody else's comments (lost the attribution):
|
||||||
|
## - IE will return you the url in the encoding you send it
|
||||||
|
## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
|
||||||
|
## characters in your link. It will send you utf-8 however if there are...
|
||||||
|
if type(url) == type(""):
|
||||||
|
url = url.decode(encoding, "replace")
|
||||||
|
url = url.strip()
|
||||||
|
# for second param to urllib.quote(), we want URI_CHARS, minus the
|
||||||
|
# 'always_safe' characters that urllib.quote() never percent-encodes
|
||||||
|
return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
|
||||||
|
|
||||||
|
def is_clean_uri(uri):
|
||||||
|
"""
|
||||||
|
>>> is_clean_uri("ABC!")
|
||||||
|
True
|
||||||
|
>>> is_clean_uri(u"ABC!")
|
||||||
|
True
|
||||||
|
>>> is_clean_uri("ABC|")
|
||||||
|
False
|
||||||
|
>>> is_clean_uri(u"ABC|")
|
||||||
|
False
|
||||||
|
>>> is_clean_uri("http://example.com/0")
|
||||||
|
True
|
||||||
|
>>> is_clean_uri(u"http://example.com/0")
|
||||||
|
True
|
||||||
|
"""
|
||||||
|
# note module re treats bytestrings as though they were decoded as latin-1
|
||||||
|
# so this function accepts both unicode and bytestrings
|
||||||
|
return not bool(BAD_URI_CHARS_RE.search(uri))
|
||||||
|
|
||||||
|
|
||||||
|
SPLIT_MATCH = re.compile(
|
||||||
|
r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match
|
||||||
|
def urlsplit(absolute_uri):
|
||||||
|
"""Return scheme, authority, path, query, fragment."""
|
||||||
|
match = SPLIT_MATCH(absolute_uri)
|
||||||
|
if match:
|
||||||
|
g = match.groups()
|
||||||
|
return g[1], g[3], g[4], g[6], g[8]
|
||||||
|
|
||||||
|
def urlunsplit(parts):
|
||||||
|
scheme, authority, path, query, fragment = parts
|
||||||
|
r = []
|
||||||
|
append = r.append
|
||||||
|
if scheme is not None:
|
||||||
|
append(scheme)
|
||||||
|
append(":")
|
||||||
|
if authority is not None:
|
||||||
|
append("//")
|
||||||
|
append(authority)
|
||||||
|
append(path)
|
||||||
|
if query is not None:
|
||||||
|
append("?")
|
||||||
|
append(query)
|
||||||
|
if fragment is not None:
|
||||||
|
append("#")
|
||||||
|
append(fragment)
|
||||||
|
return "".join(r)
|
||||||
|
|
||||||
|
def urljoin(base_uri, uri_reference):
|
||||||
|
return urlunsplit(urljoin_parts(urlsplit(base_uri),
|
||||||
|
urlsplit(uri_reference)))
|
||||||
|
|
||||||
|
# oops, this doesn't do the same thing as the literal translation
|
||||||
|
# from the RFC below
|
||||||
|
## def urljoin_parts(base_parts, reference_parts):
|
||||||
|
## scheme, authority, path, query, fragment = base_parts
|
||||||
|
## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
|
||||||
|
|
||||||
|
## # compute target URI path
|
||||||
|
## if rpath == "":
|
||||||
|
## tpath = path
|
||||||
|
## else:
|
||||||
|
## tpath = rpath
|
||||||
|
## if not tpath.startswith("/"):
|
||||||
|
## tpath = merge(authority, path, tpath)
|
||||||
|
## tpath = posixpath.normpath(tpath)
|
||||||
|
|
||||||
|
## if rscheme is not None:
|
||||||
|
## return (rscheme, rauthority, tpath, rquery, rfragment)
|
||||||
|
## elif rauthority is not None:
|
||||||
|
## return (scheme, rauthority, tpath, rquery, rfragment)
|
||||||
|
## elif rpath == "":
|
||||||
|
## if rquery is not None:
|
||||||
|
## tquery = rquery
|
||||||
|
## else:
|
||||||
|
## tquery = query
|
||||||
|
## return (scheme, authority, tpath, tquery, rfragment)
|
||||||
|
## else:
|
||||||
|
## return (scheme, authority, tpath, rquery, rfragment)
|
||||||
|
|
||||||
|
def urljoin_parts(base_parts, reference_parts):
|
||||||
|
scheme, authority, path, query, fragment = base_parts
|
||||||
|
rscheme, rauthority, rpath, rquery, rfragment = reference_parts
|
||||||
|
|
||||||
|
if rscheme == scheme:
|
||||||
|
rscheme = None
|
||||||
|
|
||||||
|
if rscheme is not None:
|
||||||
|
tscheme, tauthority, tpath, tquery = (
|
||||||
|
rscheme, rauthority, remove_dot_segments(rpath), rquery)
|
||||||
|
else:
|
||||||
|
if rauthority is not None:
|
||||||
|
tauthority, tpath, tquery = (
|
||||||
|
rauthority, remove_dot_segments(rpath), rquery)
|
||||||
|
else:
|
||||||
|
if rpath == "":
|
||||||
|
tpath = path
|
||||||
|
if rquery is not None:
|
||||||
|
tquery = rquery
|
||||||
|
else:
|
||||||
|
tquery = query
|
||||||
|
else:
|
||||||
|
if rpath.startswith("/"):
|
||||||
|
tpath = remove_dot_segments(rpath)
|
||||||
|
else:
|
||||||
|
tpath = merge(authority, path, rpath)
|
||||||
|
tpath = remove_dot_segments(tpath)
|
||||||
|
tquery = rquery
|
||||||
|
tauthority = authority
|
||||||
|
tscheme = scheme
|
||||||
|
tfragment = rfragment
|
||||||
|
return (tscheme, tauthority, tpath, tquery, tfragment)
|
||||||
|
|
||||||
|
# um, something *vaguely* like this is what I want, but I have to generate
|
||||||
|
# lots of test cases first, if only to understand what it is that
|
||||||
|
# remove_dot_segments really does...
|
||||||
|
## def remove_dot_segments(path):
|
||||||
|
## if path == '':
|
||||||
|
## return ''
|
||||||
|
## comps = path.split('/')
|
||||||
|
## new_comps = []
|
||||||
|
## for comp in comps:
|
||||||
|
## if comp in ['.', '']:
|
||||||
|
## if not new_comps or new_comps[-1]:
|
||||||
|
## new_comps.append('')
|
||||||
|
## continue
|
||||||
|
## if comp != '..':
|
||||||
|
## new_comps.append(comp)
|
||||||
|
## elif new_comps:
|
||||||
|
## new_comps.pop()
|
||||||
|
## return '/'.join(new_comps)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_dot_segments(path):
|
||||||
|
r = []
|
||||||
|
while path:
|
||||||
|
# A
|
||||||
|
if path.startswith("../"):
|
||||||
|
path = path[3:]
|
||||||
|
continue
|
||||||
|
if path.startswith("./"):
|
||||||
|
path = path[2:]
|
||||||
|
continue
|
||||||
|
# B
|
||||||
|
if path.startswith("/./"):
|
||||||
|
path = path[2:]
|
||||||
|
continue
|
||||||
|
if path == "/.":
|
||||||
|
path = "/"
|
||||||
|
continue
|
||||||
|
# C
|
||||||
|
if path.startswith("/../"):
|
||||||
|
path = path[3:]
|
||||||
|
if r:
|
||||||
|
r.pop()
|
||||||
|
continue
|
||||||
|
if path == "/..":
|
||||||
|
path = "/"
|
||||||
|
if r:
|
||||||
|
r.pop()
|
||||||
|
continue
|
||||||
|
# D
|
||||||
|
if path == ".":
|
||||||
|
path = path[1:]
|
||||||
|
continue
|
||||||
|
if path == "..":
|
||||||
|
path = path[2:]
|
||||||
|
continue
|
||||||
|
# E
|
||||||
|
start = 0
|
||||||
|
if path.startswith("/"):
|
||||||
|
start = 1
|
||||||
|
ii = path.find("/", start)
|
||||||
|
if ii < 0:
|
||||||
|
ii = None
|
||||||
|
r.append(path[:ii])
|
||||||
|
if ii is None:
|
||||||
|
break
|
||||||
|
path = path[ii:]
|
||||||
|
return "".join(r)
|
||||||
|
|
||||||
|
def merge(base_authority, base_path, ref_path):
|
||||||
|
# XXXX Oddly, the sample Perl implementation of this by Roy Fielding
|
||||||
|
# doesn't even take base_authority as a parameter, despite the wording in
|
||||||
|
# the RFC suggesting otherwise. Perhaps I'm missing some obvious identity.
|
||||||
|
#if base_authority is not None and base_path == "":
|
||||||
|
if base_path == "":
|
||||||
|
return "/" + ref_path
|
||||||
|
ii = base_path.rfind("/")
|
||||||
|
if ii >= 0:
|
||||||
|
return base_path[:ii+1] + ref_path
|
||||||
|
return ref_path
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
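A few concrete examples of what the helpers above compute for sample inputs:

print urlsplit("http://example.com/p?q=1#frag")
# ('http', 'example.com', '/p', 'q=1', 'frag')
print remove_dot_segments("/a/b/../c/./d")            # /a/c/d
print urljoin("http://example.com/a/b/c", "../d")     # http://example.com/a/d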
16
src/calibre/utils/mechanize/_seek.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from urllib2 import BaseHandler
|
||||||
|
from _util import deprecation
|
||||||
|
from _response import response_seek_wrapper
|
||||||
|
|
||||||
|
|
||||||
|
class SeekableProcessor(BaseHandler):
|
||||||
|
"""Deprecated: Make responses seekable."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
deprecation(
|
||||||
|
"See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
|
||||||
|
|
||||||
|
def any_response(self, request, response):
|
||||||
|
if not hasattr(response, "seek"):
|
||||||
|
return response_seek_wrapper(response)
|
||||||
|
return response
|
40
src/calibre/utils/mechanize/_upgrade.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
from urllib2 import BaseHandler
|
||||||
|
|
||||||
|
from _request import Request
|
||||||
|
from _response import upgrade_response
|
||||||
|
from _util import deprecation
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPRequestUpgradeProcessor(BaseHandler):
|
||||||
|
# upgrade urllib2.Request to this module's Request
|
||||||
|
# yuck!
|
||||||
|
handler_order = 0 # before anything else
|
||||||
|
|
||||||
|
def http_request(self, request):
|
||||||
|
if not hasattr(request, "add_unredirected_header"):
|
||||||
|
newrequest = Request(request._Request__original, request.data,
|
||||||
|
request.headers)
|
||||||
|
try: newrequest.origin_req_host = request.origin_req_host
|
||||||
|
except AttributeError: pass
|
||||||
|
try: newrequest.unverifiable = request.unverifiable
|
||||||
|
except AttributeError: pass
|
||||||
|
try: newrequest.visit = request.visit
|
||||||
|
except AttributeError: pass
|
||||||
|
request = newrequest
|
||||||
|
return request
|
||||||
|
|
||||||
|
https_request = http_request
|
||||||
|
|
||||||
|
|
||||||
|
class ResponseUpgradeProcessor(BaseHandler):
|
||||||
|
# upgrade responses to be .close()able without becoming unusable
|
||||||
|
handler_order = 0 # before anything else
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
deprecation(
|
||||||
|
"See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
|
||||||
|
|
||||||
|
def any_response(self, request, response):
|
||||||
|
if not hasattr(response, 'closeable_response'):
|
||||||
|
response = upgrade_response(response)
|
||||||
|
return response
|
62
src/calibre/utils/mechanize/_urllib2.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
# urllib2 work-alike interface
|
||||||
|
# ...from urllib2...
|
||||||
|
from urllib2 import \
|
||||||
|
URLError, \
|
||||||
|
HTTPError, \
|
||||||
|
GopherError
|
||||||
|
# ...and from mechanize
|
||||||
|
from _opener import OpenerDirector, \
|
||||||
|
SeekableResponseOpener, \
|
||||||
|
build_opener, install_opener, urlopen
|
||||||
|
from _auth import \
|
||||||
|
HTTPPasswordMgr, \
|
||||||
|
HTTPPasswordMgrWithDefaultRealm, \
|
||||||
|
AbstractBasicAuthHandler, \
|
||||||
|
AbstractDigestAuthHandler, \
|
||||||
|
HTTPProxyPasswordMgr, \
|
||||||
|
ProxyHandler, \
|
||||||
|
ProxyBasicAuthHandler, \
|
||||||
|
ProxyDigestAuthHandler, \
|
||||||
|
HTTPBasicAuthHandler, \
|
||||||
|
HTTPDigestAuthHandler, \
|
||||||
|
HTTPSClientCertMgr
|
||||||
|
from _request import \
|
||||||
|
Request
|
||||||
|
from _http import \
|
||||||
|
RobotExclusionError
|
||||||
|
|
||||||
|
# handlers...
|
||||||
|
# ...from urllib2...
|
||||||
|
from urllib2 import \
|
||||||
|
BaseHandler, \
|
||||||
|
UnknownHandler, \
|
||||||
|
FTPHandler, \
|
||||||
|
CacheFTPHandler, \
|
||||||
|
FileHandler, \
|
||||||
|
GopherHandler
|
||||||
|
# ...and from mechanize
|
||||||
|
from _http import \
|
||||||
|
HTTPHandler, \
|
||||||
|
HTTPDefaultErrorHandler, \
|
||||||
|
HTTPRedirectHandler, \
|
||||||
|
HTTPEquivProcessor, \
|
||||||
|
HTTPCookieProcessor, \
|
||||||
|
HTTPRefererProcessor, \
|
||||||
|
HTTPRefreshProcessor, \
|
||||||
|
HTTPErrorProcessor, \
|
||||||
|
HTTPRobotRulesProcessor
|
||||||
|
from _upgrade import \
|
||||||
|
HTTPRequestUpgradeProcessor, \
|
||||||
|
ResponseUpgradeProcessor
|
||||||
|
from _debug import \
|
||||||
|
HTTPResponseDebugProcessor, \
|
||||||
|
HTTPRedirectDebugProcessor
|
||||||
|
from _seek import \
|
||||||
|
SeekableProcessor
|
||||||
|
# crap ATM
|
||||||
|
## from _gzip import \
|
||||||
|
## HTTPGzipProcessor
|
||||||
|
import httplib
|
||||||
|
if hasattr(httplib, 'HTTPS'):
|
||||||
|
from _http import HTTPSHandler
|
||||||
|
del httplib
|
348
src/calibre/utils/mechanize/_useragent.py
Normal file
@ -0,0 +1,348 @@
|
|||||||
|
"""Convenient HTTP UserAgent class.
|
||||||
|
|
||||||
|
This is a subclass of urllib2.OpenerDirector.
|
||||||
|
|
||||||
|
|
||||||
|
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
|
||||||
|
|
||||||
|
This code is free software; you can redistribute it and/or modify it under
|
||||||
|
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
||||||
|
included with the distribution).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys, warnings, urllib2
|
||||||
|
|
||||||
|
import _opener
|
||||||
|
import _urllib2
|
||||||
|
import _auth
|
||||||
|
import _gzip
|
||||||
|
import _response
|
||||||
|
|
||||||
|
|
||||||
|
class UserAgentBase(_opener.OpenerDirector):
|
||||||
|
"""Convenient user-agent class.
|
||||||
|
|
||||||
|
Do not use .add_handler() to add a handler for something already dealt with
|
||||||
|
by this code.
|
||||||
|
|
||||||
|
The only reason at present for the distinction between UserAgent and
|
||||||
|
UserAgentBase is so that classes that depend on .seek()able responses
|
||||||
|
(e.g. mechanize.Browser) can inherit from UserAgentBase. The subclass
|
||||||
|
UserAgent exposes a .set_seekable_responses() method that allows switching
|
||||||
|
off the adding of a .seek() method to responses.
|
||||||
|
|
||||||
|
Public attributes:
|
||||||
|
|
||||||
|
addheaders: list of (name, value) pairs specifying headers to send with
|
||||||
|
every request, unless they are overridden in the Request instance.
|
||||||
|
|
||||||
|
>>> ua = UserAgentBase()
|
||||||
|
>>> ua.addheaders = [
|
||||||
|
... ("User-agent", "Mozilla/5.0 (compatible)"),
|
||||||
|
... ("From", "responsible.person@example.com")]
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
handler_classes = {
|
||||||
|
# scheme handlers
|
||||||
|
"http": _urllib2.HTTPHandler,
|
||||||
|
# CacheFTPHandler is buggy, at least in 2.3, so we don't use it
|
||||||
|
"ftp": _urllib2.FTPHandler,
|
||||||
|
"file": _urllib2.FileHandler,
|
||||||
|
"gopher": _urllib2.GopherHandler,
|
||||||
|
|
||||||
|
# other handlers
|
||||||
|
"_unknown": _urllib2.UnknownHandler,
|
||||||
|
# HTTP{S,}Handler depend on HTTPErrorProcessor too
|
||||||
|
"_http_error": _urllib2.HTTPErrorProcessor,
|
||||||
|
"_http_request_upgrade": _urllib2.HTTPRequestUpgradeProcessor,
|
||||||
|
"_http_default_error": _urllib2.HTTPDefaultErrorHandler,
|
||||||
|
|
||||||
|
# feature handlers
|
||||||
|
"_basicauth": _urllib2.HTTPBasicAuthHandler,
|
||||||
|
"_digestauth": _urllib2.HTTPDigestAuthHandler,
|
||||||
|
"_redirect": _urllib2.HTTPRedirectHandler,
|
||||||
|
"_cookies": _urllib2.HTTPCookieProcessor,
|
||||||
|
"_refresh": _urllib2.HTTPRefreshProcessor,
|
||||||
|
"_equiv": _urllib2.HTTPEquivProcessor,
|
||||||
|
"_proxy": _urllib2.ProxyHandler,
|
||||||
|
"_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
|
||||||
|
"_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
|
||||||
|
"_robots": _urllib2.HTTPRobotRulesProcessor,
|
||||||
|
"_gzip": _gzip.HTTPGzipProcessor, # experimental!
|
||||||
|
|
||||||
|
# debug handlers
|
||||||
|
"_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
|
||||||
|
"_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
|
||||||
|
}
|
||||||
|
|
||||||
|
default_schemes = ["http", "ftp", "file", "gopher"]
|
||||||
|
default_others = ["_unknown", "_http_error", "_http_request_upgrade",
|
||||||
|
"_http_default_error",
|
||||||
|
]
|
||||||
|
default_features = ["_redirect", "_cookies",
|
||||||
|
"_refresh", "_equiv",
|
||||||
|
"_basicauth", "_digestauth",
|
||||||
|
"_proxy", "_proxy_basicauth", "_proxy_digestauth",
|
||||||
|
"_robots",
|
||||||
|
]
|
||||||
|
if hasattr(_urllib2, 'HTTPSHandler'):
|
||||||
|
handler_classes["https"] = _urllib2.HTTPSHandler
|
||||||
|
default_schemes.append("https")
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
_opener.OpenerDirector.__init__(self)
|
||||||
|
|
||||||
|
ua_handlers = self._ua_handlers = {}
|
||||||
|
for scheme in (self.default_schemes+
|
||||||
|
self.default_others+
|
||||||
|
self.default_features):
|
||||||
|
klass = self.handler_classes[scheme]
|
||||||
|
ua_handlers[scheme] = klass()
|
||||||
|
for handler in ua_handlers.itervalues():
|
||||||
|
self.add_handler(handler)
|
||||||
|
|
||||||
|
# Yuck.
|
||||||
|
# Ensure correct default constructor args were passed to
|
||||||
|
# HTTPRefreshProcessor and HTTPEquivProcessor.
|
||||||
|
if "_refresh" in ua_handlers:
|
||||||
|
self.set_handle_refresh(True)
|
||||||
|
if "_equiv" in ua_handlers:
|
||||||
|
self.set_handle_equiv(True)
|
||||||
|
# Ensure default password managers are installed.
|
||||||
|
pm = ppm = None
|
||||||
|
if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
|
||||||
|
pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
|
||||||
|
if ("_proxy_basicauth" in ua_handlers or
|
||||||
|
"_proxy_digestauth" in ua_handlers):
|
||||||
|
ppm = _auth.HTTPProxyPasswordMgr()
|
||||||
|
self.set_password_manager(pm)
|
||||||
|
self.set_proxy_password_manager(ppm)
|
||||||
|
# set default certificate manager
|
||||||
|
if "https" in ua_handlers:
|
||||||
|
cm = _urllib2.HTTPSClientCertMgr()
|
||||||
|
self.set_client_cert_manager(cm)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
_opener.OpenerDirector.close(self)
|
||||||
|
self._ua_handlers = None
|
||||||
|
|
||||||
|
# XXX
|
||||||
|
## def set_timeout(self, timeout):
|
||||||
|
## self._timeout = timeout
|
||||||
|
## def set_http_connection_cache(self, conn_cache):
|
||||||
|
## self._http_conn_cache = conn_cache
|
||||||
|
## def set_ftp_connection_cache(self, conn_cache):
|
||||||
|
## # XXX ATM, FTP has cache as part of handler; should it be separate?
|
||||||
|
## self._ftp_conn_cache = conn_cache
|
||||||
|
|
||||||
|
def set_handled_schemes(self, schemes):
|
||||||
|
"""Set sequence of URL scheme (protocol) strings.
|
||||||
|
|
||||||
|
For example: ua.set_handled_schemes(["http", "ftp"])
|
||||||
|
|
||||||
|
If this fails (with ValueError) because you've passed an unknown
|
||||||
|
scheme, the set of handled schemes will not be changed.
|
||||||
|
|
||||||
|
"""
|
||||||
|
want = {}
|
||||||
|
for scheme in schemes:
|
||||||
|
if scheme.startswith("_"):
|
||||||
|
raise ValueError("not a scheme '%s'" % scheme)
|
||||||
|
if scheme not in self.handler_classes:
|
||||||
|
raise ValueError("unknown scheme '%s'")
|
||||||
|
want[scheme] = None
|
||||||
|
|
||||||
|
# get rid of scheme handlers we don't want
|
||||||
|
for scheme, oldhandler in self._ua_handlers.items():
|
||||||
|
if scheme.startswith("_"): continue # not a scheme handler
|
||||||
|
if scheme not in want:
|
||||||
|
self._replace_handler(scheme, None)
|
||||||
|
else:
|
||||||
|
del want[scheme] # already got it
|
||||||
|
# add the scheme handlers that are missing
|
||||||
|
for scheme in want.keys():
|
||||||
|
self._set_handler(scheme, True)
|
||||||
|
|
||||||
|
def set_cookiejar(self, cookiejar):
|
||||||
|
"""Set a mechanize.CookieJar, or None."""
|
||||||
|
self._set_handler("_cookies", obj=cookiejar)
|
||||||
|
|
||||||
|
# XXX could use Greg Stein's httpx for some of this instead?
|
||||||
|
# or httplib2??
|
||||||
|
def set_proxies(self, proxies):
|
||||||
|
"""Set a dictionary mapping URL scheme to proxy specification, or None.
|
||||||
|
|
||||||
|
e.g. {"http": "joe:password@myproxy.example.com:3128",
|
||||||
|
"ftp": "proxy.example.com"}
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._set_handler("_proxy", obj=proxies)
|
||||||
|
|
||||||
|
def add_password(self, url, user, password, realm=None):
|
||||||
|
self._password_manager.add_password(realm, url, user, password)
|
||||||
|
def add_proxy_password(self, user, password, hostport=None, realm=None):
|
||||||
|
self._proxy_password_manager.add_password(
|
||||||
|
realm, hostport, user, password)
|
||||||
|
|
||||||
|
def add_client_certificate(self, url, key_file, cert_file):
|
||||||
|
"""Add an SSL client certificate, for HTTPS client auth.
|
||||||
|
|
||||||
|
key_file and cert_file must be filenames of the key and certificate
|
||||||
|
files, in PEM format. You can use e.g. OpenSSL to convert a p12 (PKCS
|
||||||
|
12) file to PEM format:
|
||||||
|
|
||||||
|
openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
|
||||||
|
openssl pkcs12 -nocerts -in cert.p12 -out key.pem
|
||||||
|
|
||||||
|
|
||||||
|
Note that client certificate password input is very inflexible ATM. At
|
||||||
|
the moment this seems to be console only, which is presumably the
|
||||||
|
default behaviour of libopenssl. In future mechanize may support
|
||||||
|
third-party libraries that (I assume) allow more options here.
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._client_cert_manager.add_key_cert(url, key_file, cert_file)
|
||||||
|
|
||||||
|
# the following are rarely useful -- use add_password / add_proxy_password
|
||||||
|
# instead
|
||||||
|
def set_password_manager(self, password_manager):
|
||||||
|
"""Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
|
||||||
|
self._password_manager = password_manager
|
||||||
|
self._set_handler("_basicauth", obj=password_manager)
|
||||||
|
self._set_handler("_digestauth", obj=password_manager)
|
||||||
|
def set_proxy_password_manager(self, password_manager):
|
||||||
|
"""Set a mechanize.HTTPProxyPasswordMgr, or None."""
|
||||||
|
self._proxy_password_manager = password_manager
|
||||||
|
self._set_handler("_proxy_basicauth", obj=password_manager)
|
||||||
|
self._set_handler("_proxy_digestauth", obj=password_manager)
|
||||||
|
def set_client_cert_manager(self, cert_manager):
|
||||||
|
"""Set a mechanize.HTTPClientCertMgr, or None."""
|
||||||
|
self._client_cert_manager = cert_manager
|
||||||
|
handler = self._ua_handlers["https"]
|
||||||
|
handler.client_cert_manager = cert_manager
|
||||||
|
|
||||||
|
# these methods all take a boolean parameter
|
||||||
|
def set_handle_robots(self, handle):
|
||||||
|
"""Set whether to observe rules from robots.txt."""
|
||||||
|
self._set_handler("_robots", handle)
|
||||||
|
def set_handle_redirect(self, handle):
|
||||||
|
"""Set whether to handle HTTP 30x redirections."""
|
||||||
|
self._set_handler("_redirect", handle)
|
||||||
|
def set_handle_refresh(self, handle, max_time=None, honor_time=True):
|
||||||
|
"""Set whether to handle HTTP Refresh headers."""
|
||||||
|
self._set_handler("_refresh", handle, constructor_kwds=
|
||||||
|
{"max_time": max_time, "honor_time": honor_time})
|
||||||
|
def set_handle_equiv(self, handle, head_parser_class=None):
|
||||||
|
"""Set whether to treat HTML http-equiv headers like HTTP headers.
|
||||||
|
|
||||||
|
Response objects may be .seek()able if this is set (currently returned
|
||||||
|
responses are, raised HTTPError exception responses are not).
|
||||||
|
|
||||||
|
"""
|
||||||
|
if head_parser_class is not None:
|
||||||
|
constructor_kwds = {"head_parser_class": head_parser_class}
|
||||||
|
else:
|
||||||
|
constructor_kwds={}
|
||||||
|
self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)
|
||||||
|
def set_handle_gzip(self, handle):
|
||||||
|
"""Handle gzip transfer encoding.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if handle:
|
||||||
|
warnings.warn(
|
||||||
|
"gzip transfer encoding is experimental!", stacklevel=2)
|
||||||
|
self._set_handler("_gzip", handle)
|
||||||
|
def set_debug_redirects(self, handle):
|
||||||
|
"""Log information about HTTP redirects (including refreshes).
|
||||||
|
|
||||||
|
Logging is performed using module logging. The logger name is
|
||||||
|
"mechanize.http_redirects". To actually print some debug output,
|
||||||
|
eg:
|
||||||
|
|
||||||
|
import sys, logging
|
||||||
|
logger = logging.getLogger("mechanize.http_redirects")
|
||||||
|
logger.addHandler(logging.StreamHandler(sys.stdout))
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
Other logger names relevant to this module:
|
||||||
|
|
||||||
|
"mechanize.http_responses"
|
||||||
|
"mechanize.cookies" (or "cookielib" if running Python 2.4)
|
||||||
|
|
||||||
|
To turn on everything:
|
||||||
|
|
||||||
|
import sys, logging
|
||||||
|
logger = logging.getLogger("mechanize")
|
||||||
|
logger.addHandler(logging.StreamHandler(sys.stdout))
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._set_handler("_debug_redirect", handle)
|
||||||
|
def set_debug_responses(self, handle):
|
||||||
|
"""Log HTTP response bodies.
|
||||||
|
|
||||||
|
See docstring for .set_debug_redirects() for details of logging.
|
||||||
|
|
||||||
|
Response objects may be .seek()able if this is set (currently returned
|
||||||
|
responses are, raised HTTPError exception responses are not).
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._set_handler("_debug_response_body", handle)
|
||||||
|
def set_debug_http(self, handle):
|
||||||
|
"""Print HTTP headers to sys.stdout."""
|
||||||
|
level = int(bool(handle))
|
||||||
|
for scheme in "http", "https":
|
||||||
|
h = self._ua_handlers.get(scheme)
|
||||||
|
if h is not None:
|
||||||
|
h.set_http_debuglevel(level)
|
||||||
|
|
||||||
|
def _set_handler(self, name, handle=None, obj=None,
|
||||||
|
constructor_args=(), constructor_kwds={}):
|
||||||
|
if handle is None:
|
||||||
|
handle = obj is not None
|
||||||
|
if handle:
|
||||||
|
handler_class = self.handler_classes[name]
|
||||||
|
if obj is not None:
|
||||||
|
newhandler = handler_class(obj)
|
||||||
|
else:
|
||||||
|
newhandler = handler_class(*constructor_args, **constructor_kwds)
|
||||||
|
else:
|
||||||
|
newhandler = None
|
||||||
|
self._replace_handler(name, newhandler)
|
||||||
|
|
||||||
|
def _replace_handler(self, name, newhandler=None):
|
||||||
|
# first, if handler was previously added, remove it
|
||||||
|
if name is not None:
|
||||||
|
handler = self._ua_handlers.get(name)
|
||||||
|
if handler:
|
||||||
|
try:
|
||||||
|
self.handlers.remove(handler)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
# then add the replacement, if any
|
||||||
|
if newhandler is not None:
|
||||||
|
self.add_handler(newhandler)
|
||||||
|
self._ua_handlers[name] = newhandler
|
||||||
|
|
||||||
|
|
||||||
|
class UserAgent(UserAgentBase):

    def __init__(self):
        UserAgentBase.__init__(self)
        self._seekable = False

    def set_seekable_responses(self, handle):
        """Make response objects .seek()able."""
        self._seekable = bool(handle)

    def open(self, fullurl, data=None):
        if self._seekable:
            def bound_open(fullurl, data=None):
                return UserAgentBase.open(self, fullurl, data)
            response = _opener.wrapped_open(
                bound_open, _response.seek_wrapped_response, fullurl, data)
        else:
            response = UserAgentBase.open(self, fullurl, data)
        return response

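# Illustrative usage sketch (hedged): the seekable-response switch lets the
# body be re-read, assuming UserAgent is re-exported at package level as in
# upstream mechanize:
#
#   >>> from calibre.utils import mechanize
#   >>> ua = mechanize.UserAgent()
#   >>> ua.set_seekable_responses(True)
#   >>> r = ua.open("http://example.com")   # doctest: +SKIP
#   >>> body = r.read(); r.seek(0)          # doctest: +SKIP
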
279
src/calibre/utils/mechanize/_util.py
Normal file
@ -0,0 +1,279 @@
"""Utility functions and date/time routines.

Copyright 2002-2006 John J Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import re, string, time, warnings

def deprecation(message):
    warnings.warn(message, DeprecationWarning, stacklevel=3)
def hide_deprecations():
    warnings.filterwarnings('ignore', category=DeprecationWarning)
def reset_deprecations():
    warnings.filterwarnings('default', category=DeprecationWarning)


def isstringlike(x):
    try: x+""
    except: return False
    else: return True

## def caller():
##     try:
##         raise SyntaxError
##     except:
##         import sys
##     return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name


from calendar import timegm

# Date/time conversion routines for formats used by the HTTP protocol.

EPOCH = 1970
def my_timegm(tt):
    year, month, mday, hour, min, sec = tt[:6]
    if ((year >= EPOCH) and (1 <= month <= 12) and (1 <= mday <= 31) and
        (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
        return timegm(tt)
    else:
        return None

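# Illustrative examples (hedged, doctest-style, assuming standard
# calendar.timegm semantics): a UTC time tuple maps to a Unix timestamp,
# and out-of-range fields are rejected with None:
#
#   >>> my_timegm((1970, 1, 1, 0, 0, 0))
#   0
#   >>> my_timegm((1969, 12, 31, 23, 59, 59)) is None   # pre-epoch year
#   True
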
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
months_lower = []
for month in months: months_lower.append(month.lower())


def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        year, mon, mday, hour, min, sec)

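# Illustrative example (hedged): time.gmtime(0) is 1970-01-01 00:00:00 UTC, so
#
#   >>> time2isoz(0)
#   '1970-01-01 00:00:00Z'
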
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
    return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
        days[wday], mday, months[mon-1], year, hour, min, sec)


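# Illustrative example (hedged): the epoch fell on a Thursday; note that this
# format string emits no comma after the weekday:
#
#   >>> time2netscape(0)
#   'Thu 01-Jan-1970 00:00:00 GMT'
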
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    offset = None
    if UTC_ZONES.has_key(tz):
        offset = 0
    else:
        m = timezone_re.search(tz)
        if m:
            offset = 3600 * int(m.group(2))
            if m.group(3):
                offset = offset + 60 * int(m.group(3))
            if m.group(1) == '-':
                offset = -offset
    return offset

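# Illustrative examples (hedged): only UTC-equivalent names are known;
# numeric offsets come back in seconds, anything else gives None:
#
#   >>> offset_from_tz_string("GMT")
#   0
#   >>> offset_from_tz_string("-0500")
#   -18000
#   >>> offset_from_tz_string("+01:30")
#   5400
#   >>> print offset_from_tz_string("EST")
#   None
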
def _str2time(day, mon, yr, hr, min, sec, tz):
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = months_lower.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = my_timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t


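# Worked example of the two-digit-year pivot above (hedged; the values assume
# the call happens while the current year is 2008): yr=94 first becomes 2094,
# the distance m = 8 - 94 = -86 exceeds 50, so a century is subtracted, giving
# 1994; yr=3 becomes 2003 and stays there, since |8 - 3| <= 50.
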
strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) (\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
wkday_re = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
loose_http_re = re.compile(
    r"""^
    (\d\d?)                   # day
    (?:\s+|[-\/])
    (\w+)                     # month
    (?:\s+|[-\/])
    (\d+)                     # year
    (?:
        (?:\s+|:)             # separator before clock
        (\d\d?):(\d\d)        # hour:min
        (?::(\d\d))?          # optional seconds
    )?                        # optional clock
    \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
    \s*
    (?:\(\w+\))?              # ASCII representation of timezone in parens.
    \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = strict_re.search(text)
    if m:
        g = m.groups()
        mon = months_lower.index(g[1].lower()) + 1
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return my_timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = wkday_re.sub("", text, 1)  # Useless weekday

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = loose_http_re.search(text)
    if m is not None:
        day, mon, yr, hr, min, sec, tz = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)


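# Illustrative examples (hedged): the strict and loose parsing paths agree on
# well-formed dates (both should give 760832612 here), and strings that match
# neither path give None:
#
#   >>> http2time("Wed, 09 Feb 1994 22:23:32 GMT") == http2time("09 Feb 1994 22:23:32 GMT")
#   True
#   >>> print http2time("not a date")
#   None
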
iso_re = re.compile(
    """^
    (\d{4})                   # year
    [-\/]?
    (\d\d?)                   # numerical month
    [-\/]?
    (\d\d?)                   # day
    (?:
        (?:\s+|[-:Tt])        # separator before clock
        (\d\d?):?(\d\d)       # hour:min
        (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
    )?                        # optional clock
    \s*
    ([-+]?\d\d?:?(:?\d\d)?
     |Z|z)?                   # timezone  (Z is "zero meridian", i.e. GMT)
    \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = iso_re.search(text)
    if m is not None:
        # XXX there's an extra bit of the timezone I'm ignoring here: is
        #   this the right thing to do?
        yr, mon, day, hr, min, sec, tz, _ = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)
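# Illustrative examples (hedged): the compact and extended ISO forms agree,
# and a numeric zone shifts the result by its offset in seconds:
#
#   >>> iso2time("1994-02-03 14:15:29") == iso2time("19940203T141529Z")
#   True
#   >>> iso2time("1994-02-03 14:15:29 -0100") - iso2time("1994-02-03 14:15:29")
#   3600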