IGN:Use patched mechanize implementation that correctly closes connections

Kovid Goyal 2008-09-11 17:07:21 -07:00
parent e7c7cc64eb
commit 6fee09b9d2
24 changed files with 8779 additions and 2 deletions


@@ -2,7 +2,7 @@
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-import sys, os, re, logging, time, subprocess, mechanize, atexit
+import sys, os, re, logging, time, subprocess, atexit
 from htmlentitydefs import name2codepoint
 from math import floor
 from logging import Formatter
@@ -14,7 +14,7 @@ from calibre.constants import iswindows, isosx, islinux, isfrozen, \
                               terminal_controller, preferred_encoding, \
                               __appname__, __version__, __author__, \
                               win32event, win32api, winerror, fcntl
-
+from calibre.utils import mechanize
 
 def unicode_path(path, abs=False):
     if not isinstance(path, unicode):
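
For context, a hedged usage sketch of the newly bundled package as imported above. The URL is a placeholder, and the explicit close() reflects the stated point of the patch, namely that responses release their connections when closed.

from calibre.utils import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)          # typical for feed/recipe fetching
response = br.open('http://example.com/feed')
try:
    data = response.read()
finally:
    response.close()                 # the patched implementation is meant to free the socket here
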


@@ -0,0 +1,125 @@
__all__ = [
'AbstractBasicAuthHandler',
'AbstractDigestAuthHandler',
'BaseHandler',
'Browser',
'BrowserStateError',
'CacheFTPHandler',
'ContentTooShortError',
'Cookie',
'CookieJar',
'CookiePolicy',
'DefaultCookiePolicy',
'DefaultFactory',
'FTPHandler',
'Factory',
'FileCookieJar',
'FileHandler',
'FormNotFoundError',
'FormsFactory',
'GopherError',
'GopherHandler',
'HTTPBasicAuthHandler',
'HTTPCookieProcessor',
'HTTPDefaultErrorHandler',
'HTTPDigestAuthHandler',
'HTTPEquivProcessor',
'HTTPError',
'HTTPErrorProcessor',
'HTTPHandler',
'HTTPPasswordMgr',
'HTTPPasswordMgrWithDefaultRealm',
'HTTPProxyPasswordMgr',
'HTTPRedirectDebugProcessor',
'HTTPRedirectHandler',
'HTTPRefererProcessor',
'HTTPRefreshProcessor',
'HTTPRequestUpgradeProcessor',
'HTTPResponseDebugProcessor',
'HTTPRobotRulesProcessor',
'HTTPSClientCertMgr',
'HTTPSHandler',
'HeadParser',
'History',
'LWPCookieJar',
'Link',
'LinkNotFoundError',
'LinksFactory',
'LoadError',
'MSIECookieJar',
'MozillaCookieJar',
'OpenerDirector',
'OpenerFactory',
'ParseError',
'ProxyBasicAuthHandler',
'ProxyDigestAuthHandler',
'ProxyHandler',
'Request',
'ResponseUpgradeProcessor',
'RobotExclusionError',
'RobustFactory',
'RobustFormsFactory',
'RobustLinksFactory',
'RobustTitleFactory',
'SeekableProcessor',
'SeekableResponseOpener',
'TitleFactory',
'URLError',
'USE_BARE_EXCEPT',
'UnknownHandler',
'UserAgent',
'UserAgentBase',
'XHTMLCompatibleHeadParser',
'__version__',
'build_opener',
'install_opener',
'lwp_cookie_str',
'make_response',
'request_host',
'response_seek_wrapper', # XXX deprecate in public interface?
'seek_wrapped_response', # XXX should probably use this internally in place of response_seek_wrapper()
'str2time',
'urlopen',
'urlretrieve']
from _mechanize import __version__
# high-level stateful browser-style interface
from _mechanize import \
Browser, History, \
BrowserStateError, LinkNotFoundError, FormNotFoundError
# configurable URL-opener interface
from _useragent import UserAgentBase, UserAgent
from _html import \
ParseError, \
Link, \
Factory, DefaultFactory, RobustFactory, \
FormsFactory, LinksFactory, TitleFactory, \
RobustFormsFactory, RobustLinksFactory, RobustTitleFactory
# urllib2 work-alike interface (part from mechanize, part from urllib2)
# This is a superset of the urllib2 interface.
from _urllib2 import *
# misc
from _opener import ContentTooShortError, OpenerFactory, urlretrieve
from _util import http2time as str2time
from _response import \
response_seek_wrapper, seek_wrapped_response, make_response
from _http import HeadParser
try:
from _http import XHTMLCompatibleHeadParser
except ImportError:
pass
# cookies
from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \
CookieJar, FileCookieJar, LoadError, request_host
from _lwpcookiejar import LWPCookieJar, lwp_cookie_str
from _mozillacookiejar import MozillaCookieJar
from _msiecookiejar import MSIECookieJar
# If you hate the idea of turning bugs into warnings, do:
# import mechanize; mechanize.USE_BARE_EXCEPT = False
USE_BARE_EXCEPT = True


@@ -0,0 +1,500 @@
"""HTTP Authentication and Proxy support.
All but HTTPProxyPasswordMgr come from Python 2.5.
Copyright 2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import re, base64, urlparse, posixpath, md5, sha, sys, copy, os, random, time
from urllib2 import BaseHandler, HTTPError, parse_keqv_list, parse_http_list
from urllib import getproxies, unquote, splittype, splituser, splitpasswd, \
    splitport
def _parse_proxy(proxy):
"""Return (scheme, user, password, host/port) given a URL or an authority.
If a URL is supplied, it must have an authority (host:port) component.
According to RFC 3986, having an authority component means the URL must
have two slashes after the scheme:
>>> _parse_proxy('file:/ftp.example.com/')
Traceback (most recent call last):
ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
The first three items of the returned tuple may be None.
Examples of authority parsing:
>>> _parse_proxy('proxy.example.com')
(None, None, None, 'proxy.example.com')
>>> _parse_proxy('proxy.example.com:3128')
(None, None, None, 'proxy.example.com:3128')
The authority component may optionally include userinfo (assumed to be
username:password):
>>> _parse_proxy('joe:password@proxy.example.com')
(None, 'joe', 'password', 'proxy.example.com')
>>> _parse_proxy('joe:password@proxy.example.com:3128')
(None, 'joe', 'password', 'proxy.example.com:3128')
Same examples, but with URLs instead:
>>> _parse_proxy('http://proxy.example.com/')
('http', None, None, 'proxy.example.com')
>>> _parse_proxy('http://proxy.example.com:3128/')
('http', None, None, 'proxy.example.com:3128')
>>> _parse_proxy('http://joe:password@proxy.example.com/')
('http', 'joe', 'password', 'proxy.example.com')
>>> _parse_proxy('http://joe:password@proxy.example.com:3128')
('http', 'joe', 'password', 'proxy.example.com:3128')
Everything after the authority is ignored:
>>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
('ftp', 'joe', 'password', 'proxy.example.com')
Test for no trailing '/' case:
>>> _parse_proxy('http://joe:password@proxy.example.com')
('http', 'joe', 'password', 'proxy.example.com')
"""
scheme, r_scheme = splittype(proxy)
if not r_scheme.startswith("/"):
# authority
scheme = None
authority = proxy
else:
# URL
if not r_scheme.startswith("//"):
raise ValueError("proxy URL with no authority: %r" % proxy)
# We have an authority, so for RFC 3986-compliant URLs (by ss 3.2.2
# and 3.3.), path is empty or starts with '/'
end = r_scheme.find("/", 2)
if end == -1:
end = None
authority = r_scheme[2:end]
userinfo, hostport = splituser(authority)
if userinfo is not None:
user, password = splitpasswd(userinfo)
else:
user = password = None
return scheme, user, password, hostport
class ProxyHandler(BaseHandler):
# Proxies must be in front
handler_order = 100
def __init__(self, proxies=None):
if proxies is None:
proxies = getproxies()
assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
self.proxies = proxies
for type, url in proxies.items():
setattr(self, '%s_open' % type,
lambda r, proxy=url, type=type, meth=self.proxy_open: \
meth(r, proxy, type))
def proxy_open(self, req, proxy, type):
orig_type = req.get_type()
proxy_type, user, password, hostport = _parse_proxy(proxy)
if proxy_type is None:
proxy_type = orig_type
if user and password:
user_pass = '%s:%s' % (unquote(user), unquote(password))
creds = base64.encodestring(user_pass).strip()
req.add_header('Proxy-authorization', 'Basic ' + creds)
hostport = unquote(hostport)
req.set_proxy(hostport, proxy_type)
if orig_type == proxy_type:
# let other handlers take care of it
return None
else:
# need to start over, because the other handlers don't
# grok the proxy's URL type
# e.g. if we have a constructor arg proxies like so:
# {'http': 'ftp://proxy.example.com'}, we may end up turning
# a request for http://acme.example.com/a into one for
# ftp://proxy.example.com/a
return self.parent.open(req)
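
A hedged usage sketch, not part of this file: passing an explicit proxies mapping (with userinfo, as in the doctests above) through the package-level build_opener. Host names are placeholders.

import mechanize

proxy_handler = mechanize.ProxyHandler(
    {'http': 'http://joe:password@proxy.example.com:3128'})
opener = mechanize.build_opener(proxy_handler)
# The Proxy-authorization header is added in proxy_open() above.
response = opener.open('http://acme.example.com/a')
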
class HTTPPasswordMgr:
def __init__(self):
self.passwd = {}
def add_password(self, realm, uri, user, passwd):
# uri could be a single URI or a sequence
if isinstance(uri, basestring):
uri = [uri]
if not realm in self.passwd:
self.passwd[realm] = {}
for default_port in True, False:
reduced_uri = tuple(
[self.reduce_uri(u, default_port) for u in uri])
self.passwd[realm][reduced_uri] = (user, passwd)
def find_user_password(self, realm, authuri):
domains = self.passwd.get(realm, {})
for default_port in True, False:
reduced_authuri = self.reduce_uri(authuri, default_port)
for uris, authinfo in domains.iteritems():
for uri in uris:
if self.is_suburi(uri, reduced_authuri):
return authinfo
return None, None
def reduce_uri(self, uri, default_port=True):
"""Accept authority or URI and extract only the authority and path."""
# note HTTP URLs do not have a userinfo component
parts = urlparse.urlsplit(uri)
if parts[1]:
# URI
scheme = parts[0]
authority = parts[1]
path = parts[2] or '/'
else:
# host or host:port
scheme = None
authority = uri
path = '/'
host, port = splitport(authority)
if default_port and port is None and scheme is not None:
dport = {"http": 80,
"https": 443,
}.get(scheme)
if dport is not None:
authority = "%s:%d" % (host, dport)
return authority, path
def is_suburi(self, base, test):
"""Check if test is below base in a URI tree
Both args must be URIs in reduced form.
"""
if base == test:
return True
if base[0] != test[0]:
return False
common = posixpath.commonprefix((base[1], test[1]))
if len(common) == len(base[1]):
return True
return False
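
An illustrative sketch (realm and URLs are assumptions) of how the reduced (authority, path) matching above behaves:

from mechanize import HTTPPasswordMgr

mgr = HTTPPasswordMgr()
mgr.add_password("My Realm", "http://example.com/private/", "joe", "secret")
# '/private/index.html' lies below '/private/', so is_suburi() matches and the
# stored credentials are returned.
print mgr.find_user_password("My Realm", "http://example.com/private/index.html")
# -> ('joe', 'secret')
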
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
def find_user_password(self, realm, authuri):
user, password = HTTPPasswordMgr.find_user_password(self, realm,
authuri)
if user is not None:
return user, password
return HTTPPasswordMgr.find_user_password(self, None, authuri)
class AbstractBasicAuthHandler:
rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
# XXX there can actually be multiple auth-schemes in a
# www-authenticate header. should probably be a lot more careful
# in parsing them to extract multiple alternatives
def __init__(self, password_mgr=None):
if password_mgr is None:
password_mgr = HTTPPasswordMgr()
self.passwd = password_mgr
self.add_password = self.passwd.add_password
def http_error_auth_reqed(self, authreq, host, req, headers):
# host may be an authority (without userinfo) or a URL with an
# authority
# XXX could be multiple headers
authreq = headers.get(authreq, None)
if authreq:
mo = AbstractBasicAuthHandler.rx.search(authreq)
if mo:
scheme, realm = mo.groups()
if scheme.lower() == 'basic':
return self.retry_http_basic_auth(host, req, realm)
def retry_http_basic_auth(self, host, req, realm):
user, pw = self.passwd.find_user_password(realm, host)
if pw is not None:
raw = "%s:%s" % (user, pw)
auth = 'Basic %s' % base64.encodestring(raw).strip()
if req.headers.get(self.auth_header, None) == auth:
return None
newreq = copy.copy(req)
newreq.add_header(self.auth_header, auth)
newreq.visit = False
return self.parent.open(newreq)
else:
return None
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
auth_header = 'Authorization'
def http_error_401(self, req, fp, code, msg, headers):
url = req.get_full_url()
return self.http_error_auth_reqed('www-authenticate',
url, req, headers)
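
A hedged sketch of wiring the handler into an opener; the realm, URL and credentials are assumptions.

import mechanize

auth = mechanize.HTTPBasicAuthHandler()
auth.add_password(realm='Protected Area', uri='http://example.com/',
                  user='joe', passwd='secret')
opener = mechanize.build_opener(auth)
# A 401 response triggers retry_http_basic_auth() above with the stored credentials.
response = opener.open('http://example.com/private/')
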
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
auth_header = 'Proxy-authorization'
def http_error_407(self, req, fp, code, msg, headers):
# http_error_auth_reqed requires that there is no userinfo component in
# authority. Assume there isn't one, since urllib2 does not (and
# should not, RFC 3986 s. 3.2.1) support requests for URLs containing
# userinfo.
authority = req.get_host()
return self.http_error_auth_reqed('proxy-authenticate',
authority, req, headers)
def randombytes(n):
"""Return n random bytes."""
# Use /dev/urandom if it is available. Fall back to random module
# if not. It might be worthwhile to extend this function to use
# other platform-specific mechanisms for getting random bytes.
if os.path.exists("/dev/urandom"):
f = open("/dev/urandom")
s = f.read(n)
f.close()
return s
else:
L = [chr(random.randrange(0, 256)) for i in range(n)]
return "".join(L)
class AbstractDigestAuthHandler:
# Digest authentication is specified in RFC 2617.
# XXX The client does not inspect the Authentication-Info header
# in a successful response.
# XXX It should be possible to test this implementation against
# a mock server that just generates a static set of challenges.
# XXX qop="auth-int" support is shaky
def __init__(self, passwd=None):
if passwd is None:
passwd = HTTPPasswordMgr()
self.passwd = passwd
self.add_password = self.passwd.add_password
self.retried = 0
self.nonce_count = 0
def reset_retry_count(self):
self.retried = 0
def http_error_auth_reqed(self, auth_header, host, req, headers):
authreq = headers.get(auth_header, None)
if self.retried > 5:
# Don't fail endlessly - if we failed once, we'll probably
# fail a second time. Hm. Unless the Password Manager is
# prompting for the information. Crap. This isn't great
# but it's better than the current 'repeat until recursion
# depth exceeded' approach <wink>
raise HTTPError(req.get_full_url(), 401, "digest auth failed",
headers, None)
else:
self.retried += 1
if authreq:
scheme = authreq.split()[0]
if scheme.lower() == 'digest':
return self.retry_http_digest_auth(req, authreq)
def retry_http_digest_auth(self, req, auth):
token, challenge = auth.split(' ', 1)
chal = parse_keqv_list(parse_http_list(challenge))
auth = self.get_authorization(req, chal)
if auth:
auth_val = 'Digest %s' % auth
if req.headers.get(self.auth_header, None) == auth_val:
return None
newreq = copy.copy(req)
newreq.add_unredirected_header(self.auth_header, auth_val)
newreq.visit = False
return self.parent.open(newreq)
def get_cnonce(self, nonce):
# The cnonce-value is an opaque
# quoted string value provided by the client and used by both client
# and server to avoid chosen plaintext attacks, to provide mutual
# authentication, and to provide some message integrity protection.
# This isn't a fabulous effort, but it's probably Good Enough.
dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
randombytes(8))).hexdigest()
return dig[:16]
def get_authorization(self, req, chal):
try:
realm = chal['realm']
nonce = chal['nonce']
qop = chal.get('qop')
algorithm = chal.get('algorithm', 'MD5')
# mod_digest doesn't send an opaque, even though it isn't
# supposed to be optional
opaque = chal.get('opaque', None)
except KeyError:
return None
H, KD = self.get_algorithm_impls(algorithm)
if H is None:
return None
user, pw = self.passwd.find_user_password(realm, req.get_full_url())
if user is None:
return None
# XXX not implemented yet
if req.has_data():
entdig = self.get_entity_digest(req.get_data(), chal)
else:
entdig = None
A1 = "%s:%s:%s" % (user, realm, pw)
A2 = "%s:%s" % (req.get_method(),
# XXX selector: what about proxies and full urls
req.get_selector())
if qop == 'auth':
self.nonce_count += 1
ncvalue = '%08x' % self.nonce_count
cnonce = self.get_cnonce(nonce)
noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
respdig = KD(H(A1), noncebit)
elif qop is None:
respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
else:
# XXX handle auth-int.
pass
# XXX should the partial digests be encoded too?
base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
'response="%s"' % (user, realm, nonce, req.get_selector(),
respdig)
if opaque:
base += ', opaque="%s"' % opaque
if entdig:
base += ', digest="%s"' % entdig
base += ', algorithm="%s"' % algorithm
if qop:
base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
return base
def get_algorithm_impls(self, algorithm):
# lambdas assume digest modules are imported at the top level
if algorithm == 'MD5':
H = lambda x: md5.new(x).hexdigest()
elif algorithm == 'SHA':
H = lambda x: sha.new(x).hexdigest()
# XXX MD5-sess
KD = lambda s, d: H("%s:%s" % (s, d))
return H, KD
def get_entity_digest(self, data, chal):
# XXX not implemented yet
return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
"""An authentication protocol defined by RFC 2069
Digest authentication improves on basic authentication because it
does not transmit passwords in the clear.
"""
auth_header = 'Authorization'
handler_order = 490
def http_error_401(self, req, fp, code, msg, headers):
host = urlparse.urlparse(req.get_full_url())[1]
retry = self.http_error_auth_reqed('www-authenticate',
host, req, headers)
self.reset_retry_count()
return retry
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
auth_header = 'Proxy-Authorization'
handler_order = 490
def http_error_407(self, req, fp, code, msg, headers):
host = req.get_host()
retry = self.http_error_auth_reqed('proxy-authenticate',
host, req, headers)
self.reset_retry_count()
return retry
# XXX ugly implementation, should probably not bother deriving
class HTTPProxyPasswordMgr(HTTPPasswordMgr):
# has default realm and host/port
def add_password(self, realm, uri, user, passwd):
# uri could be a single URI or a sequence
if uri is None or isinstance(uri, basestring):
uris = [uri]
else:
uris = uri
passwd_by_domain = self.passwd.setdefault(realm, {})
for uri in uris:
for default_port in True, False:
reduced_uri = self.reduce_uri(uri, default_port)
passwd_by_domain[reduced_uri] = (user, passwd)
def find_user_password(self, realm, authuri):
attempts = [(realm, authuri), (None, authuri)]
# bleh, want default realm to take precedence over default
# URI/authority, hence this outer loop
for default_uri in False, True:
for realm, authuri in attempts:
authinfo_by_domain = self.passwd.get(realm, {})
for default_port in True, False:
reduced_authuri = self.reduce_uri(authuri, default_port)
for uri, authinfo in authinfo_by_domain.iteritems():
if uri is None and not default_uri:
continue
if self.is_suburi(uri, reduced_authuri):
return authinfo
user, password = None, None
if user is not None:
break
return user, password
def reduce_uri(self, uri, default_port=True):
if uri is None:
return None
return HTTPPasswordMgr.reduce_uri(self, uri, default_port)
def is_suburi(self, base, test):
if base is None:
# default to the proxy's host/port
hostport, path = test
base = (hostport, "/")
return HTTPPasswordMgr.is_suburi(self, base, test)
class HTTPSClientCertMgr(HTTPPasswordMgr):
# implementation inheritance: this is not a proper subclass
def add_key_cert(self, uri, key_file, cert_file):
self.add_password(None, uri, key_file, cert_file)
def find_key_cert(self, authuri):
return HTTPPasswordMgr.find_user_password(self, None, authuri)

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,28 @@
import logging
from urllib2 import BaseHandler
from _response import response_seek_wrapper
class HTTPResponseDebugProcessor(BaseHandler):
handler_order = 900 # before redirections, after everything else
def http_response(self, request, response):
if not hasattr(response, "seek"):
response = response_seek_wrapper(response)
info = logging.getLogger("mechanize.http_responses").info
try:
info(response.read())
finally:
response.seek(0)
info("*****************************************************")
return response
https_response = http_response
class HTTPRedirectDebugProcessor(BaseHandler):
def http_request(self, request):
if hasattr(request, "redirect_dict"):
info = logging.getLogger("mechanize.http_redirects").info
info("redirecting to %s", request.get_full_url())
return request
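
A hedged sketch of enabling these processors; the logger names come from the module above, the URL is a placeholder.

import logging
import mechanize

logging.basicConfig(level=logging.INFO)   # "mechanize.http_responses" and
                                          # "mechanize.http_redirects" log at INFO
opener = mechanize.build_opener(
    mechanize.HTTPResponseDebugProcessor(),
    mechanize.HTTPRedirectDebugProcessor())
opener.open('http://example.com/')
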


@@ -0,0 +1,103 @@
import urllib2
from cStringIO import StringIO
import _response
# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library
class GzipConsumer:
def __init__(self, consumer):
self.__consumer = consumer
self.__decoder = None
self.__data = ""
def __getattr__(self, key):
return getattr(self.__consumer, key)
def feed(self, data):
if self.__decoder is None:
# check if we have a full gzip header
data = self.__data + data
try:
i = 10
flag = ord(data[3])
if flag & 4: # extra
x = ord(data[i]) + 256*ord(data[i+1])
i = i + 2 + x
if flag & 8: # filename
while ord(data[i]):
i = i + 1
i = i + 1
if flag & 16: # comment
while ord(data[i]):
i = i + 1
i = i + 1
if flag & 2: # crc
i = i + 2
if len(data) < i:
raise IndexError("not enough data")
if data[:3] != "\x1f\x8b\x08":
raise IOError("invalid gzip data")
data = data[i:]
except IndexError:
self.__data = data
return # need more data
import zlib
self.__data = ""
self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS)
data = self.__decoder.decompress(data)
if data:
self.__consumer.feed(data)
def close(self):
if self.__decoder:
data = self.__decoder.flush()
if data:
self.__consumer.feed(data)
self.__consumer.close()
# --------------------------------------------------------------------
# the rest of this module is John Lee's stupid code, not
# Fredrik's nice code :-)
class stupid_gzip_consumer:
def __init__(self): self.data = []
def feed(self, data): self.data.append(data)
class stupid_gzip_wrapper(_response.closeable_response):
def __init__(self, response):
self._response = response
c = stupid_gzip_consumer()
gzc = GzipConsumer(c)
gzc.feed(response.read())
self.__data = StringIO("".join(c.data))
def read(self, size=-1):
return self.__data.read(size)
def readline(self, size=-1):
return self.__data.readline(size)
def readlines(self, sizehint=-1):
return self.__data.readlines(sizehint)
def __getattr__(self, name):
# delegate unknown methods/attributes
return getattr(self._response, name)
class HTTPGzipProcessor(urllib2.BaseHandler):
handler_order = 200 # response processing before HTTPEquivProcessor
def http_request(self, request):
request.add_header("Accept-Encoding", "gzip")
return request
def http_response(self, request, response):
# post-process response
enc_hdrs = response.info().getheaders("Content-encoding")
for enc_hdr in enc_hdrs:
if ("gzip" in enc_hdr) or ("compress" in enc_hdr):
return stupid_gzip_wrapper(response)
return response
https_response = http_response
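
A hedged sketch, assuming the Browser convenience toggle set_handle_gzip() is available in this bundled copy to install HTTPGzipProcessor; the URL is a placeholder.

import mechanize

br = mechanize.Browser()
br.set_handle_gzip(True)   # assumed to install HTTPGzipProcessor: adds
                           # Accept-Encoding: gzip and unwraps gzipped bodies
response = br.open('http://example.com/')
print response.read()      # already decompressed by stupid_gzip_wrapper
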


@@ -0,0 +1,226 @@
"""Utility functions for HTTP header value parsing and construction.
Copyright 1997-1998, Gisle Aas
Copyright 2002-2006, John J. Lee
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import os, re
from types import StringType
from types import UnicodeType
STRING_TYPES = StringType, UnicodeType
from _util import http2time
import _rfc3986
def is_html(ct_headers, url, allow_xhtml=False):
"""
ct_headers: Sequence of Content-Type headers
url: Response URL
"""
if not ct_headers:
# guess
ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
html_exts = [".htm", ".html"]
if allow_xhtml:
html_exts += [".xhtml"]
return ext in html_exts
# use first header
ct = split_header_words(ct_headers)[0][0][0]
html_types = ["text/html"]
if allow_xhtml:
html_types += [
"text/xhtml", "text/xml",
"application/xml", "application/xhtml+xml",
]
return ct in html_types
def unmatched(match):
"""Return unmatched part of re.Match object."""
start, end = match.span(0)
return match.string[:start]+match.string[end:]
token_re = re.compile(r"^\s*([^=\s;,]+)")
quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
escape_re = re.compile(r"\\(.)")
def split_header_words(header_values):
r"""Parse header values into a list of lists containing key,value pairs.
The function knows how to deal with ",", ";" and "=" as well as quoted
values after "=". A list of space separated tokens are parsed as if they
were separated by ";".
If the header_values passed as argument contains multiple values, then they
are treated as if they were a single value separated by comma ",".
This means that this function is useful for parsing header fields that
follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
the requirement for tokens).
headers = #header
header = (token | parameter) *( [";"] (token | parameter))
token = 1*<any CHAR except CTLs or separators>
separators = "(" | ")" | "<" | ">" | "@"
| "," | ";" | ":" | "\" | <">
| "/" | "[" | "]" | "?" | "="
| "{" | "}" | SP | HT
quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
qdtext = <any TEXT except <">>
quoted-pair = "\" CHAR
parameter = attribute "=" value
attribute = token
value = token | quoted-string
Each header is represented by a list of key/value pairs. The value for a
simple token (not part of a parameter) is None. Syntactically incorrect
headers will not necessarily be parsed as you would want.
This is easier to describe with some examples:
>>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
[[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
>>> split_header_words(['text/html; charset="iso-8859-1"'])
[[('text/html', None), ('charset', 'iso-8859-1')]]
>>> split_header_words([r'Basic realm="\"foo\bar\""'])
[[('Basic', None), ('realm', '"foobar"')]]
"""
assert type(header_values) not in STRING_TYPES
result = []
for text in header_values:
orig_text = text
pairs = []
while text:
m = token_re.search(text)
if m:
text = unmatched(m)
name = m.group(1)
m = quoted_value_re.search(text)
if m: # quoted value
text = unmatched(m)
value = m.group(1)
value = escape_re.sub(r"\1", value)
else:
m = value_re.search(text)
if m: # unquoted value
text = unmatched(m)
value = m.group(1)
value = value.rstrip()
else:
# no value, a lone token
value = None
pairs.append((name, value))
elif text.lstrip().startswith(","):
# concatenated headers, as per RFC 2616 section 4.2
text = text.lstrip()[1:]
if pairs: result.append(pairs)
pairs = []
else:
# skip junk
non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
assert nr_junk_chars > 0, (
"split_header_words bug: '%s', '%s', %s" %
(orig_text, text, pairs))
text = non_junk
if pairs: result.append(pairs)
return result
join_escape_re = re.compile(r"([\"\\])")
def join_header_words(lists):
"""Do the inverse of the conversion done by split_header_words.
Takes a list of lists of (key, value) pairs and produces a single header
value. Attribute values are quoted if needed.
>>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
'text/plain; charset="iso-8859/1"'
>>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
'text/plain, charset="iso-8859/1"'
"""
headers = []
for pairs in lists:
attr = []
for k, v in pairs:
if v is not None:
if not re.search(r"^\w+$", v):
v = join_escape_re.sub(r"\\\1", v) # escape " and \
v = '"%s"' % v
if k is None: # Netscape cookies may have no name
k = v
else:
k = "%s=%s" % (k, v)
attr.append(k)
if attr: headers.append("; ".join(attr))
return ", ".join(headers)
def parse_ns_headers(ns_headers):
"""Ad-hoc parser for Netscape protocol cookie-attributes.
The old Netscape cookie format for Set-Cookie can for instance contain
an unquoted "," in the expires field, so we have to use this ad-hoc
parser instead of split_header_words.
XXX This may not make the best possible effort to parse all the crap
that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
parser is probably better, so could do worse than following that if
this ever gives any trouble.
Currently, this is also used for parsing RFC 2109 cookies.
"""
known_attrs = ("expires", "domain", "path", "secure",
# RFC 2109 attrs (may turn up in Netscape cookies, too)
"port", "max-age")
result = []
for ns_header in ns_headers:
pairs = []
version_set = False
params = re.split(r";\s*", ns_header)
for ii in range(len(params)):
param = params[ii]
param = param.rstrip()
if param == "": continue
if "=" not in param:
k, v = param, None
else:
k, v = re.split(r"\s*=\s*", param, 1)
k = k.lstrip()
if ii != 0:
lc = k.lower()
if lc in known_attrs:
k = lc
if k == "version":
# This is an RFC 2109 cookie.
version_set = True
if k == "expires":
# convert expires date to seconds since epoch
if v.startswith('"'): v = v[1:]
if v.endswith('"'): v = v[:-1]
v = http2time(v) # None if invalid
pairs.append((k, v))
if pairs:
if not version_set:
pairs.append(("version", "0"))
result.append(pairs)
return result
def _test():
import doctest, _headersutil
return doctest.testmod(_headersutil)
if __name__ == "__main__":
_test()
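
A small illustrative sketch; the cookie values are made up and the private-module import path assumes the package lives at calibre.utils.mechanize, as the first diff above suggests.

from calibre.utils.mechanize._headersutil import split_header_words, parse_ns_headers

print split_header_words(['text/html; charset="iso-8859-1"'])
# [[('text/html', None), ('charset', 'iso-8859-1')]]
print parse_ns_headers(['foo=bar; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT'])
# expires is converted to seconds since the epoch and a version attribute is appended:
# [[('foo', 'bar'), ('path', '/'), ('expires', 942189160), ('version', '0')]]
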


@@ -0,0 +1,607 @@
"""HTML handling.
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import re, copy, htmlentitydefs
import sgmllib, HTMLParser, ClientForm
import _request
from _headersutil import split_header_words, is_html as _is_html
import _rfc3986
DEFAULT_ENCODING = "latin-1"
# the base class is purely for backwards compatibility
class ParseError(ClientForm.ParseError): pass
class CachingGeneratorFunction(object):
"""Caching wrapper around a no-arguments iterable."""
def __init__(self, iterable):
self._cache = []
# wrap iterable to make it non-restartable (otherwise, repeated
# __call__ would give incorrect results)
self._iterator = iter(iterable)
def __call__(self):
cache = self._cache
for item in cache:
yield item
for item in self._iterator:
cache.append(item)
yield item
class EncodingFinder:
def __init__(self, default_encoding):
self._default_encoding = default_encoding
def encoding(self, response):
# HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
# headers may be in the response. HTTP-EQUIV headers come last,
# so try in order from first to last.
for ct in response.info().getheaders("content-type"):
for k, v in split_header_words([ct])[0]:
if k == "charset":
return v
return self._default_encoding
class ResponseTypeFinder:
def __init__(self, allow_xhtml):
self._allow_xhtml = allow_xhtml
def is_html(self, response, encoding):
ct_hdrs = response.info().getheaders("content-type")
url = response.geturl()
# XXX encoding
return _is_html(ct_hdrs, url, self._allow_xhtml)
# idea for this argument-processing trick is from Peter Otten
class Args:
def __init__(self, args_map):
self.dictionary = dict(args_map)
def __getattr__(self, key):
try:
return self.dictionary[key]
except KeyError:
return getattr(self.__class__, key)
def form_parser_args(
select_default=False,
form_parser_class=None,
request_class=None,
backwards_compat=False,
):
return Args(locals())
class Link:
def __init__(self, base_url, url, text, tag, attrs):
assert None not in [url, tag, attrs]
self.base_url = base_url
self.absolute_url = _rfc3986.urljoin(base_url, url)
self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
def __cmp__(self, other):
try:
for name in "url", "text", "tag", "attrs":
if getattr(self, name) != getattr(other, name):
return -1
except AttributeError:
return -1
return 0
def __repr__(self):
return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
self.base_url, self.url, self.text, self.tag, self.attrs)
class LinksFactory:
def __init__(self,
link_parser_class=None,
link_class=Link,
urltags=None,
):
import _pullparser
if link_parser_class is None:
link_parser_class = _pullparser.TolerantPullParser
self.link_parser_class = link_parser_class
self.link_class = link_class
if urltags is None:
urltags = {
"a": "href",
"area": "href",
"frame": "src",
"iframe": "src",
}
self.urltags = urltags
self._response = None
self._encoding = None
def set_response(self, response, base_url, encoding):
self._response = response
self._encoding = encoding
self._base_url = base_url
def links(self):
"""Return an iterator that provides links of the document."""
response = self._response
encoding = self._encoding
base_url = self._base_url
p = self.link_parser_class(response, encoding=encoding)
try:
for token in p.tags(*(self.urltags.keys()+["base"])):
if token.type == "endtag":
continue
if token.data == "base":
base_href = dict(token.attrs).get("href")
if base_href is not None:
base_url = base_href
continue
attrs = dict(token.attrs)
tag = token.data
name = attrs.get("name")
text = None
# XXX use attr_encoding for ref'd doc if that doc does not
# provide one by other means
#attr_encoding = attrs.get("charset")
url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
if not url:
# Probably an <A NAME="blah"> link or <AREA NOHREF...>.
# For our purposes a link is something with a URL, so
# ignore this.
continue
url = _rfc3986.clean_url(url, encoding)
if tag == "a":
if token.type != "startendtag":
# hmm, this'd break if end tag is missing
text = p.get_compressed_text(("endtag", tag))
# but this doesn't work for eg.
# <a href="blah"><b>Andy</b></a>
#text = p.get_compressed_text()
yield Link(base_url, url, text, tag, token.attrs)
except sgmllib.SGMLParseError, exc:
raise ParseError(exc)
class FormsFactory:
"""Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
After calling .forms(), the .global_form attribute is a form object
containing all controls not a descendant of any FORM element.
For constructor argument docs, see ClientForm.ParseResponse
argument docs.
"""
def __init__(self,
select_default=False,
form_parser_class=None,
request_class=None,
backwards_compat=False,
):
import ClientForm
self.select_default = select_default
if form_parser_class is None:
form_parser_class = ClientForm.FormParser
self.form_parser_class = form_parser_class
if request_class is None:
request_class = _request.Request
self.request_class = request_class
self.backwards_compat = backwards_compat
self._response = None
self.encoding = None
self.global_form = None
def set_response(self, response, encoding):
self._response = response
self.encoding = encoding
self.global_form = None
def forms(self):
import ClientForm
encoding = self.encoding
try:
forms = ClientForm.ParseResponseEx(
self._response,
select_default=self.select_default,
form_parser_class=self.form_parser_class,
request_class=self.request_class,
encoding=encoding,
_urljoin=_rfc3986.urljoin,
_urlparse=_rfc3986.urlsplit,
_urlunparse=_rfc3986.urlunsplit,
)
except ClientForm.ParseError, exc:
raise ParseError(exc)
self.global_form = forms[0]
return forms[1:]
class TitleFactory:
def __init__(self):
self._response = self._encoding = None
def set_response(self, response, encoding):
self._response = response
self._encoding = encoding
def title(self):
import _pullparser
p = _pullparser.TolerantPullParser(
self._response, encoding=self._encoding)
try:
try:
p.get_tag("title")
except _pullparser.NoMoreTokensError:
return None
else:
return p.get_text()
except sgmllib.SGMLParseError, exc:
raise ParseError(exc)
def unescape(data, entities, encoding):
if data is None or "&" not in data:
return data
def replace_entities(match):
ent = match.group()
if ent[1] == "#":
return unescape_charref(ent[2:-1], encoding)
repl = entities.get(ent[1:-1])
if repl is not None:
repl = unichr(repl)
if type(repl) != type(""):
try:
repl = repl.encode(encoding)
except UnicodeError:
repl = ent
else:
repl = ent
return repl
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
def unescape_charref(data, encoding):
name, base = data, 10
if name.startswith("x"):
name, base= name[1:], 16
uc = unichr(int(name, base))
if encoding is None:
return uc
else:
try:
repl = uc.encode(encoding)
except UnicodeError:
repl = "&#%s;" % data
return repl
# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
_beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
)
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
import sgmllib
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
class MechanizeBs(_beautifulsoup.BeautifulSoup):
_entitydefs = htmlentitydefs.name2codepoint
# don't want the magic Microsoft-char workaround
PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
lambda(x):x.group(1) + ' />'),
(re.compile('<!\s+([^<>]*)>'),
lambda(x):'<!' + x.group(1) + '>')
]
def __init__(self, encoding, text=None, avoidParserProblems=True,
initialTextIsEverything=True):
self._encoding = encoding
_beautifulsoup.BeautifulSoup.__init__(
self, text, avoidParserProblems, initialTextIsEverything)
def handle_charref(self, ref):
t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
self.handle_data(t)
def handle_entityref(self, ref):
t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
self.handle_data(t)
def unescape_attrs(self, attrs):
escaped_attrs = []
for key, val in attrs:
val = unescape(val, self._entitydefs, self._encoding)
escaped_attrs.append((key, val))
return escaped_attrs
class RobustLinksFactory:
compress_re = re.compile(r"\s+")
def __init__(self,
link_parser_class=None,
link_class=Link,
urltags=None,
):
import _beautifulsoup
if link_parser_class is None:
link_parser_class = MechanizeBs
self.link_parser_class = link_parser_class
self.link_class = link_class
if urltags is None:
urltags = {
"a": "href",
"area": "href",
"frame": "src",
"iframe": "src",
}
self.urltags = urltags
self._bs = None
self._encoding = None
self._base_url = None
def set_soup(self, soup, base_url, encoding):
self._bs = soup
self._base_url = base_url
self._encoding = encoding
def links(self):
import _beautifulsoup
bs = self._bs
base_url = self._base_url
encoding = self._encoding
for ch in bs.recursiveChildGenerator():
if (isinstance(ch, _beautifulsoup.Tag) and
ch.name in self.urltags.keys()+["base"]):
link = ch
attrs = bs.unescape_attrs(link.attrs)
attrs_dict = dict(attrs)
if link.name == "base":
base_href = attrs_dict.get("href")
if base_href is not None:
base_url = base_href
continue
url_attr = self.urltags[link.name]
url = attrs_dict.get(url_attr)
if not url:
continue
url = _rfc3986.clean_url(url, encoding)
text = link.firstText(lambda t: True)
if text is _beautifulsoup.Null:
# follow _pullparser's weird behaviour rigidly
if link.name == "a":
text = ""
else:
text = None
else:
text = self.compress_re.sub(" ", text.strip())
yield Link(base_url, url, text, link.name, attrs)
class RobustFormsFactory(FormsFactory):
def __init__(self, *args, **kwds):
import ClientForm
args = form_parser_args(*args, **kwds)
if args.form_parser_class is None:
args.form_parser_class = RobustFormParser
FormsFactory.__init__(self, **args.dictionary)
def set_response(self, response, encoding):
self._response = response
self.encoding = encoding
class RobustTitleFactory:
def __init__(self):
self._bs = self._encoding = None
def set_soup(self, soup, encoding):
self._bs = soup
self._encoding = encoding
def title(self):
import _beautifulsoup
title = self._bs.first("title")
if title == _beautifulsoup.Null:
return None
else:
return title.firstText(lambda t: True)
class Factory:
"""Factory for forms, links, etc.
This interface may expand in future.
Public methods:
set_request_class(request_class)
set_response(response)
forms()
links()
Public attributes:
Note that accessing these attributes may raise ParseError.
encoding: string specifying the encoding of response if it contains a text
document (this value is left unspecified for documents that do not have
an encoding, e.g. an image file)
is_html: true if response contains an HTML document (XHTML may be
regarded as HTML too)
title: page title, or None if no title or not HTML
global_form: form object containing all controls that are not descendants
of any FORM element, or None if the forms_factory does not support
supplying a global form
"""
LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]
def __init__(self, forms_factory, links_factory, title_factory,
encoding_finder=EncodingFinder(DEFAULT_ENCODING),
response_type_finder=ResponseTypeFinder(allow_xhtml=False),
):
"""
Pass keyword arguments only.
default_encoding: character encoding to use if encoding cannot be
determined (or guessed) from the response. You should turn on
HTTP-EQUIV handling if you want the best chance of getting this right
without resorting to this default. The default value of this
parameter (currently latin-1) may change in future.
"""
self._forms_factory = forms_factory
self._links_factory = links_factory
self._title_factory = title_factory
self._encoding_finder = encoding_finder
self._response_type_finder = response_type_finder
self.set_response(None)
def set_request_class(self, request_class):
"""Set urllib2.Request class.
ClientForm.HTMLForm instances returned by .forms() will return
instances of this class when .click()ed.
"""
self._forms_factory.request_class = request_class
def set_response(self, response):
"""Set response.
The response must either be None or implement the same interface as
objects returned by urllib2.urlopen().
"""
self._response = response
self._forms_genf = self._links_genf = None
self._get_title = None
for name in self.LAZY_ATTRS:
try:
delattr(self, name)
except AttributeError:
pass
def __getattr__(self, name):
if name not in self.LAZY_ATTRS:
return getattr(self.__class__, name)
if name == "encoding":
self.encoding = self._encoding_finder.encoding(
copy.copy(self._response))
return self.encoding
elif name == "is_html":
self.is_html = self._response_type_finder.is_html(
copy.copy(self._response), self.encoding)
return self.is_html
elif name == "title":
if self.is_html:
self.title = self._title_factory.title()
else:
self.title = None
return self.title
elif name == "global_form":
self.forms()
return self.global_form
def forms(self):
"""Return iterable over ClientForm.HTMLForm-like objects.
Raises mechanize.ParseError on failure.
"""
# this implementation sets .global_form as a side-effect, for benefit
# of __getattr__ impl
if self._forms_genf is None:
try:
self._forms_genf = CachingGeneratorFunction(
self._forms_factory.forms())
except: # XXXX define exception!
self.set_response(self._response)
raise
self.global_form = getattr(
self._forms_factory, "global_form", None)
return self._forms_genf()
def links(self):
"""Return iterable over mechanize.Link-like objects.
Raises mechanize.ParseError on failure.
"""
if self._links_genf is None:
try:
self._links_genf = CachingGeneratorFunction(
self._links_factory.links())
except: # XXXX define exception!
self.set_response(self._response)
raise
return self._links_genf()
class DefaultFactory(Factory):
"""Based on sgmllib."""
def __init__(self, i_want_broken_xhtml_support=False):
Factory.__init__(
self,
forms_factory=FormsFactory(),
links_factory=LinksFactory(),
title_factory=TitleFactory(),
response_type_finder=ResponseTypeFinder(
allow_xhtml=i_want_broken_xhtml_support),
)
def set_response(self, response):
Factory.set_response(self, response)
if response is not None:
self._forms_factory.set_response(
copy.copy(response), self.encoding)
self._links_factory.set_response(
copy.copy(response), response.geturl(), self.encoding)
self._title_factory.set_response(
copy.copy(response), self.encoding)
class RobustFactory(Factory):
"""Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
DefaultFactory.
"""
def __init__(self, i_want_broken_xhtml_support=False,
soup_class=None):
Factory.__init__(
self,
forms_factory=RobustFormsFactory(),
links_factory=RobustLinksFactory(),
title_factory=RobustTitleFactory(),
response_type_finder=ResponseTypeFinder(
allow_xhtml=i_want_broken_xhtml_support),
)
if soup_class is None:
soup_class = MechanizeBs
self._soup_class = soup_class
def set_response(self, response):
import _beautifulsoup
Factory.set_response(self, response)
if response is not None:
data = response.read()
soup = self._soup_class(self.encoding, data)
self._forms_factory.set_response(
copy.copy(response), self.encoding)
self._links_factory.set_soup(
soup, response.geturl(), self.encoding)
self._title_factory.set_soup(soup, self.encoding)
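
A hedged end-to-end sketch (the URL is a placeholder): choosing the BeautifulSoup-backed RobustFactory when the HTML is too broken for the sgmllib-based DefaultFactory.

import mechanize

br = mechanize.Browser(factory=mechanize.RobustFactory())
br.set_handle_robots(False)
response = br.open('http://example.com/')
print br.title()                 # via RobustTitleFactory
for link in br.links():          # via RobustLinksFactory
    print link.absolute_url, link.text
for form in br.forms():          # via RobustFormsFactory
    print form
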


@@ -0,0 +1,729 @@
"""HTTP related handlers.
Note that some other HTTP handlers live in more specific modules: _auth.py,
_gzip.py, etc.
Copyright 2002-2006 John J Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import copy, time, tempfile, htmlentitydefs, re, logging, socket, \
urllib2, urllib, httplib, sgmllib
from urllib2 import URLError, HTTPError, BaseHandler
from cStringIO import StringIO
from _request import Request
from _util import isstringlike
from _response import closeable_response, response_seek_wrapper
from _html import unescape, unescape_charref
from _headersutil import is_html
from _clientcookie import CookieJar, request_host
import _rfc3986
debug = logging.getLogger("mechanize").debug
# monkeypatch urllib2.HTTPError to show URL
## def urllib2_str(self):
## return 'HTTP Error %s: %s (%s)' % (
## self.code, self.msg, self.geturl())
## urllib2.HTTPError.__str__ = urllib2_str
CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
DEFAULT_ENCODING = 'latin-1'
# This adds "refresh" to the list of redirectables and provides a redirection
# algorithm that doesn't go into a loop in the presence of cookies
# (Python 2.4 has this new algorithm, 2.3 doesn't).
class HTTPRedirectHandler(BaseHandler):
# maximum number of redirections to any single URL
# this is needed because of the state that cookies introduce
max_repeats = 4
# maximum total number of redirections (regardless of URL) before
# assuming we're in a loop
max_redirections = 10
# Implementation notes:
# To avoid the server sending us into an infinite loop, the request
# object needs to track what URLs we have already seen. Do this by
# adding a handler-specific attribute to the Request object. The value
# of the dict is used to count the number of times the same URL has
# been visited. This is needed because visiting the same URL twice
# does not necessarily imply a loop, thanks to state introduced by
# cookies.
# Always unhandled redirection codes:
# 300 Multiple Choices: should not handle this here.
# 304 Not Modified: no need to handle here: only of interest to caches
# that do conditional GETs
# 305 Use Proxy: probably not worth dealing with here
# 306 Unused: what was this for in the previous versions of protocol??
def redirect_request(self, newurl, req, fp, code, msg, headers):
"""Return a Request or None in response to a redirect.
This is called by the http_error_30x methods when a redirection
response is received. If a redirection should take place, return a
new Request to allow http_error_30x to perform the redirect;
otherwise, return None to indicate that an HTTPError should be
raised.
"""
if code in (301, 302, 303, "refresh") or \
(code == 307 and not req.has_data()):
# Strictly (according to RFC 2616), 301 or 302 in response to
# a POST MUST NOT cause a redirection without confirmation
# from the user (of urllib2, in this case). In practice,
# essentially all clients do redirect in this case, so we do
# the same.
# XXX really refresh redirections should be visiting; tricky to
# fix, so this will wait until post-stable release
new = Request(newurl,
headers=req.headers,
origin_req_host=req.get_origin_req_host(),
unverifiable=True,
visit=False,
)
new._origin_req = getattr(req, "_origin_req", req)
return new
else:
raise HTTPError(req.get_full_url(), code, msg, headers, fp)
def http_error_302(self, req, fp, code, msg, headers):
# Some servers (incorrectly) return multiple Location headers
# (so probably same goes for URI). Use first header.
if headers.has_key('location'):
newurl = headers.getheaders('location')[0]
elif headers.has_key('uri'):
newurl = headers.getheaders('uri')[0]
else:
return
newurl = _rfc3986.clean_url(newurl, "latin-1")
newurl = _rfc3986.urljoin(req.get_full_url(), newurl)
# XXX Probably want to forget about the state of the current
# request, although that might interact poorly with other
# handlers that also use handler-specific request attributes
new = self.redirect_request(newurl, req, fp, code, msg, headers)
if new is None:
return
# loop detection
# .redirect_dict has a key url if url was previously visited.
if hasattr(req, 'redirect_dict'):
visited = new.redirect_dict = req.redirect_dict
if (visited.get(newurl, 0) >= self.max_repeats or
len(visited) >= self.max_redirections):
raise HTTPError(req.get_full_url(), code,
self.inf_msg + msg, headers, fp)
else:
visited = new.redirect_dict = req.redirect_dict = {}
visited[newurl] = visited.get(newurl, 0) + 1
# Don't close the fp until we are sure that we won't use it
# with HTTPError.
fp.read()
fp.close()
return self.parent.open(new)
http_error_301 = http_error_303 = http_error_307 = http_error_302
http_error_refresh = http_error_302
inf_msg = "The HTTP server returned a redirect error that would " \
"lead to an infinite loop.\n" \
"The last 30x error message was:\n"
# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception): pass
class AbstractHeadParser:
# only these elements are allowed in or before HEAD of document
head_elems = ("html", "head",
"title", "base",
"script", "style", "meta", "link", "object")
_entitydefs = htmlentitydefs.name2codepoint
_encoding = DEFAULT_ENCODING
def __init__(self):
self.http_equiv = []
def start_meta(self, attrs):
http_equiv = content = None
for key, value in attrs:
if key == "http-equiv":
http_equiv = self.unescape_attr_if_required(value)
elif key == "content":
content = self.unescape_attr_if_required(value)
if http_equiv is not None and content is not None:
self.http_equiv.append((http_equiv, content))
def end_head(self):
raise EndOfHeadError()
def handle_entityref(self, name):
#debug("%s", name)
self.handle_data(unescape(
'&%s;' % name, self._entitydefs, self._encoding))
def handle_charref(self, name):
#debug("%s", name)
self.handle_data(unescape_charref(name, self._encoding))
def unescape_attr(self, name):
#debug("%s", name)
return unescape(name, self._entitydefs, self._encoding)
def unescape_attrs(self, attrs):
#debug("%s", attrs)
escaped_attrs = {}
for key, val in attrs.items():
escaped_attrs[key] = self.unescape_attr(val)
return escaped_attrs
def unknown_entityref(self, ref):
self.handle_data("&%s;" % ref)
def unknown_charref(self, ref):
self.handle_data("&#%s;" % ref)
try:
import HTMLParser
except ImportError:
pass
else:
class XHTMLCompatibleHeadParser(AbstractHeadParser,
HTMLParser.HTMLParser):
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
AbstractHeadParser.__init__(self)
def handle_starttag(self, tag, attrs):
if tag not in self.head_elems:
raise EndOfHeadError()
try:
method = getattr(self, 'start_' + tag)
except AttributeError:
try:
method = getattr(self, 'do_' + tag)
except AttributeError:
pass # unknown tag
else:
method(attrs)
else:
method(attrs)
def handle_endtag(self, tag):
if tag not in self.head_elems:
raise EndOfHeadError()
try:
method = getattr(self, 'end_' + tag)
except AttributeError:
pass # unknown tag
else:
method()
def unescape(self, name):
# Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs.
return self.unescape_attr(name)
def unescape_attr_if_required(self, name):
return name # HTMLParser.HTMLParser already did it
class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
def _not_called(self):
assert False
def __init__(self):
sgmllib.SGMLParser.__init__(self)
AbstractHeadParser.__init__(self)
def handle_starttag(self, tag, method, attrs):
if tag not in self.head_elems:
raise EndOfHeadError()
if tag == "meta":
method(attrs)
def unknown_starttag(self, tag, attrs):
self.handle_starttag(tag, self._not_called, attrs)
def handle_endtag(self, tag, method):
if tag in self.head_elems:
method()
else:
raise EndOfHeadError()
def unescape_attr_if_required(self, name):
return self.unescape_attr(name)
def parse_head(fileobj, parser):
"""Return a list of key, value pairs."""
while 1:
data = fileobj.read(CHUNK)
try:
parser.feed(data)
except EndOfHeadError:
break
if len(data) != CHUNK:
# this should only happen if there is no HTML body, or if
# CHUNK is big
break
return parser.http_equiv
class HTTPEquivProcessor(BaseHandler):
"""Append META HTTP-EQUIV headers to regular HTTP headers."""
handler_order = 300 # before handlers that look at HTTP headers
def __init__(self, head_parser_class=HeadParser,
i_want_broken_xhtml_support=False,
):
self.head_parser_class = head_parser_class
self._allow_xhtml = i_want_broken_xhtml_support
def http_response(self, request, response):
if not hasattr(response, "seek"):
response = response_seek_wrapper(response)
http_message = response.info()
url = response.geturl()
ct_hdrs = http_message.getheaders("content-type")
if is_html(ct_hdrs, url, self._allow_xhtml):
try:
try:
html_headers = parse_head(response, self.head_parser_class())
finally:
response.seek(0)
except (HTMLParser.HTMLParseError,
sgmllib.SGMLParseError):
pass
else:
for hdr, val in html_headers:
# add a header
http_message.dict[hdr.lower()] = val
text = hdr + ": " + val
for line in text.split("\n"):
http_message.headers.append(line + "\n")
return response
https_response = http_response
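
A hedged sketch, assuming the Browser toggle set_handle_equiv() installs this processor; the URL is a placeholder.

import mechanize

br = mechanize.Browser()
br.set_handle_equiv(True)   # META HTTP-EQUIV values get folded into response.info()
response = br.open('http://example.com/')
print response.info().getheaders('refresh')
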
class HTTPCookieProcessor(BaseHandler):
"""Handle HTTP cookies.
Public attributes:
cookiejar: CookieJar instance
"""
def __init__(self, cookiejar=None):
if cookiejar is None:
cookiejar = CookieJar()
self.cookiejar = cookiejar
def http_request(self, request):
self.cookiejar.add_cookie_header(request)
return request
def http_response(self, request, response):
self.cookiejar.extract_cookies(response, request)
return response
https_request = http_request
https_response = http_response
try:
import robotparser
except ImportError:
pass
else:
class MechanizeRobotFileParser(robotparser.RobotFileParser):
def __init__(self, url='', opener=None):
import _opener
robotparser.RobotFileParser.__init__(self, url)
self._opener = opener
def set_opener(self, opener=None):
if opener is None:
opener = _opener.OpenerDirector()
self._opener = opener
def read(self):
"""Reads the robots.txt URL and feeds it to the parser."""
if self._opener is None:
self.set_opener()
req = Request(self.url, unverifiable=True, visit=False)
try:
f = self._opener.open(req)
except HTTPError, f:
pass
except (IOError, socket.error, OSError), exc:
robotparser._debug("ignoring error opening %r: %s" %
(self.url, exc))
return
lines = []
line = f.readline()
while line:
lines.append(line.strip())
line = f.readline()
status = f.code
if status == 401 or status == 403:
self.disallow_all = True
robotparser._debug("disallow all")
elif status >= 400:
self.allow_all = True
robotparser._debug("allow all")
elif status == 200 and lines:
robotparser._debug("parse lines")
self.parse(lines)
class RobotExclusionError(urllib2.HTTPError):
def __init__(self, request, *args):
apply(urllib2.HTTPError.__init__, (self,)+args)
self.request = request
class HTTPRobotRulesProcessor(BaseHandler):
# before redirections, after everything else
handler_order = 800
try:
from httplib import HTTPMessage
except:
from mimetools import Message
http_response_class = Message
else:
http_response_class = HTTPMessage
def __init__(self, rfp_class=MechanizeRobotFileParser):
self.rfp_class = rfp_class
self.rfp = None
self._host = None
def http_request(self, request):
scheme = request.get_type()
if scheme not in ["http", "https"]:
# robots exclusion only applies to HTTP
return request
if request.get_selector() == "/robots.txt":
# /robots.txt is always OK to fetch
return request
host = request.get_host()
# robots.txt requests don't need to be allowed by robots.txt :-)
origin_req = getattr(request, "_origin_req", None)
if (origin_req is not None and
origin_req.get_selector() == "/robots.txt" and
origin_req.get_host() == host
):
return request
if host != self._host:
self.rfp = self.rfp_class()
try:
self.rfp.set_opener(self.parent)
except AttributeError:
debug("%r instance does not support set_opener" %
self.rfp.__class__)
self.rfp.set_url(scheme+"://"+host+"/robots.txt")
self.rfp.read()
self._host = host
ua = request.get_header("User-agent", "")
if self.rfp.can_fetch(ua, request.get_full_url()):
return request
else:
# XXX This should really have raised URLError. Too late now...
msg = "request disallowed by robots.txt"
raise RobotExclusionError(
request,
request.get_full_url(),
403, msg,
self.http_response_class(StringIO()), StringIO(msg))
https_request = http_request
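
A hedged sketch of what this processor means for callers; the URL is a placeholder.

import mechanize

br = mechanize.Browser()
try:
    br.open('http://example.com/private/')
except mechanize.RobotExclusionError:
    # robots.txt disallowed the fetch; for a user-driven download it can be
    # acceptable to retry with robots handling switched off.
    br.set_handle_robots(False)
    br.open('http://example.com/private/')
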
class HTTPRefererProcessor(BaseHandler):
"""Add Referer header to requests.
This only makes sense if you use each RefererProcessor for a single
chain of requests only (so, for example, if you use a single
HTTPRefererProcessor to fetch a series of URLs extracted from a single
page, this will break).
There's a proper implementation of this in mechanize.Browser.
"""
def __init__(self):
self.referer = None
def http_request(self, request):
if ((self.referer is not None) and
not request.has_header("Referer")):
request.add_unredirected_header("Referer", self.referer)
return request
def http_response(self, request, response):
self.referer = response.geturl()
return response
https_request = http_request
https_response = http_response
def clean_refresh_url(url):
# e.g. Firefox 1.5 does (something like) this
if ((url.startswith('"') and url.endswith('"')) or
(url.startswith("'") and url.endswith("'"))):
url = url[1:-1]
return _rfc3986.clean_url(url, "latin-1") # XXX encoding
def parse_refresh_header(refresh):
"""
>>> parse_refresh_header("1; url=http://example.com/")
(1.0, 'http://example.com/')
>>> parse_refresh_header("1; url='http://example.com/'")
(1.0, 'http://example.com/')
>>> parse_refresh_header("1")
(1.0, None)
>>> parse_refresh_header("blah")
Traceback (most recent call last):
ValueError: invalid literal for float(): blah
"""
ii = refresh.find(";")
if ii != -1:
pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
jj = newurl_spec.find("=")
key = None
if jj != -1:
key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
newurl = clean_refresh_url(newurl)
if key is None or key.strip().lower() != "url":
raise ValueError()
else:
pause, newurl = float(refresh), None
return pause, newurl
class HTTPRefreshProcessor(BaseHandler):
"""Perform HTTP Refresh redirections.
Note that if a non-200 HTTP code has occurred (for example, a 30x
redirect), this processor will do nothing.
By default, only zero-time Refresh headers are redirected. Use the
max_time attribute / constructor argument to allow Refresh with longer
pauses. Use the honor_time attribute / constructor argument to control
whether the requested pause is honoured (with a time.sleep()) or
skipped in favour of immediate redirection.
Public attributes:
max_time: see above
honor_time: see above
"""
handler_order = 1000
def __init__(self, max_time=0, honor_time=True):
self.max_time = max_time
self.honor_time = honor_time
def http_response(self, request, response):
code, msg, hdrs = response.code, response.msg, response.info()
if code == 200 and hdrs.has_key("refresh"):
refresh = hdrs.getheaders("refresh")[0]
try:
pause, newurl = parse_refresh_header(refresh)
except ValueError:
debug("bad Refresh header: %r" % refresh)
return response
if newurl is None:
newurl = response.geturl()
if (self.max_time is None) or (pause <= self.max_time):
if pause > 1E-3 and self.honor_time:
time.sleep(pause)
hdrs["location"] = newurl
# hardcoded http is NOT a bug
response = self.parent.error(
"http", request, response,
"refresh", msg, hdrs)
return response
https_response = http_response
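# Illustrative sketch (assumed usage; the URL is a placeholder).  With the
# defaults only zero-time refreshes are followed; passing max_time=None and
# honor_time=False follows any Refresh header immediately without sleeping.
def _example_follow_refresh():
    import mechanize
    opener = mechanize.build_opener(
        mechanize.HTTPRefreshProcessor(max_time=None, honor_time=False))
    return opener.open("http://example.com/refreshing-page")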
class HTTPErrorProcessor(BaseHandler):
"""Process HTTP error responses.
The purpose of this handler is to allow other response processors a
look-in by removing the call to parent.error() from
AbstractHTTPHandler.
For non-200 error codes, this just passes the job on to the
Handler.<proto>_error_<code> methods, via the OpenerDirector.error
method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an
HTTPError if no other handler handles the error.
"""
handler_order = 1000 # after all other processors
def http_response(self, request, response):
code, msg, hdrs = response.code, response.msg, response.info()
if code != 200:
# hardcoded http is NOT a bug
response = self.parent.error(
"http", request, response, code, msg, hdrs)
return response
https_response = http_response
class HTTPDefaultErrorHandler(BaseHandler):
def http_error_default(self, req, fp, code, msg, hdrs):
# why these error methods took the code, msg, headers args in the first
# place rather than a response object, I don't know, but to avoid
# multiple wrapping, we're discarding them
if isinstance(fp, urllib2.HTTPError):
response = fp
else:
response = urllib2.HTTPError(
req.get_full_url(), code, msg, hdrs, fp)
assert code == response.code
assert msg == response.msg
assert hdrs == response.hdrs
raise response
class AbstractHTTPHandler(BaseHandler):
def __init__(self, debuglevel=0):
self._debuglevel = debuglevel
def set_http_debuglevel(self, level):
self._debuglevel = level
def do_request_(self, request):
host = request.get_host()
if not host:
raise URLError('no host given')
if request.has_data(): # POST
data = request.get_data()
if not request.has_header('Content-type'):
request.add_unredirected_header(
'Content-type',
'application/x-www-form-urlencoded')
scheme, sel = urllib.splittype(request.get_selector())
sel_host, sel_path = urllib.splithost(sel)
if not request.has_header('Host'):
request.add_unredirected_header('Host', sel_host or host)
for name, value in self.parent.addheaders:
name = name.capitalize()
if not request.has_header(name):
request.add_unredirected_header(name, value)
return request
def do_open(self, http_class, req):
"""Return an addinfourl object for the request, using http_class.
http_class must implement the HTTPConnection API from httplib.
The addinfourl return value is a file-like object. It also
has methods and attributes including:
- info(): return a mimetools.Message object for the headers
- geturl(): return the original request URL
- code: HTTP status code
"""
host = req.get_host()
if not host:
raise URLError('no host given')
h = http_class(host) # will parse host:port
h.set_debuglevel(self._debuglevel)
headers = dict(req.headers)
headers.update(req.unredirected_hdrs)
# We want to make an HTTP/1.1 request, but the addinfourl
# class isn't prepared to deal with a persistent connection.
# It will try to read all remaining data from the socket,
# which will block while the server waits for the next request.
# So make sure the connection gets closed after the (only)
# request.
headers["Connection"] = "close"
headers = dict(
[(name.title(), val) for name, val in headers.items()])
try:
h.request(req.get_method(), req.get_selector(), req.data, headers)
r = h.getresponse()
except socket.error, err: # XXX what error?
raise URLError(err)
# Pick apart the HTTPResponse object to get the addinfourl
# object initialized properly.
# Wrap the HTTPResponse object in socket's file object adapter
# for Windows. That adapter calls recv(), so delegate recv()
# to read(). This weird wrapping allows the returned object to
# have readline() and readlines() methods.
# XXX It might be better to extract the read buffering code
# out of socket._fileobject() and into a base class.
r.recv = r.read
fp = socket._fileobject(r)
resp = closeable_response(fp, r.msg, req.get_full_url(),
r.status, r.reason)
return resp
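# Illustrative sketch (assumed usage; the URL is a placeholder).  The value
# returned by do_open() behaves like the file-like object described in its
# docstring: read(), info(), geturl() and .code are available, and .close()
# really releases the connection (the "Connection: close" header above keeps
# the server from holding it open).
def _example_response_object():
    import mechanize
    response = mechanize.urlopen("http://example.com/")
    try:
        body = response.read()
        print response.code, response.geturl()
        print response.info().getheader("Content-Type")
    finally:
        response.close()
    return body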
class HTTPHandler(AbstractHTTPHandler):
def http_open(self, req):
return self.do_open(httplib.HTTPConnection, req)
http_request = AbstractHTTPHandler.do_request_
if hasattr(httplib, 'HTTPS'):
class HTTPSConnectionFactory:
def __init__(self, key_file, cert_file):
self._key_file = key_file
self._cert_file = cert_file
def __call__(self, hostport):
return httplib.HTTPSConnection(
hostport,
key_file=self._key_file, cert_file=self._cert_file)
class HTTPSHandler(AbstractHTTPHandler):
def __init__(self, client_cert_manager=None):
AbstractHTTPHandler.__init__(self)
self.client_cert_manager = client_cert_manager
def https_open(self, req):
if self.client_cert_manager is not None:
key_file, cert_file = self.client_cert_manager.find_key_cert(
req.get_full_url())
conn_factory = HTTPSConnectionFactory(key_file, cert_file)
else:
conn_factory = httplib.HTTPSConnection
return self.do_open(conn_factory, req)
https_request = AbstractHTTPHandler.do_request_

View File

@@ -0,0 +1,185 @@
"""Load / save to libwww-perl (LWP) format files.
Actually, the format is slightly extended from that used by LWP's
(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information
not recorded by LWP.
It uses the version string "2.0", though really there isn't an LWP Cookies
2.0 format. This indicates that there is extra information in here
(domain_dot and port_spec) while still being compatible with libwww-perl,
I hope.
Copyright 2002-2006 John J Lee <jjl@pobox.com>
Copyright 1997-1999 Gisle Aas (original libwww-perl code)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import time, re, logging
from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
MISSING_FILENAME_TEXT, LoadError
from _headersutil import join_header_words, split_header_words
from _util import iso2time, time2isoz
debug = logging.getLogger("mechanize").debug
def lwp_cookie_str(cookie):
"""Return string representation of Cookie in an the LWP cookie file format.
Actually, the format is extended a bit -- see module docstring.
"""
h = [(cookie.name, cookie.value),
("path", cookie.path),
("domain", cookie.domain)]
if cookie.port is not None: h.append(("port", cookie.port))
if cookie.path_specified: h.append(("path_spec", None))
if cookie.port_specified: h.append(("port_spec", None))
if cookie.domain_initial_dot: h.append(("domain_dot", None))
if cookie.secure: h.append(("secure", None))
if cookie.expires: h.append(("expires",
time2isoz(float(cookie.expires))))
if cookie.discard: h.append(("discard", None))
if cookie.comment: h.append(("comment", cookie.comment))
if cookie.comment_url: h.append(("commenturl", cookie.comment_url))
if cookie.rfc2109: h.append(("rfc2109", None))
keys = cookie.nonstandard_attr_keys()
keys.sort()
for k in keys:
h.append((k, str(cookie.get_nonstandard_attr(k))))
h.append(("version", str(cookie.version)))
return join_header_words([h])
class LWPCookieJar(FileCookieJar):
"""
The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
"Set-Cookie3" is the format used by the libwww-perl library, not known
to be compatible with any browser, but which is easy to read and
doesn't lose information about RFC 2965 cookies.
Additional methods
as_lwp_str(ignore_discard=True, ignore_expired=True)
"""
magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
"""Return cookies as a string of "\n"-separated "Set-Cookie3" headers.
ignore_discard and ignore_expires: see docstring for FileCookieJar.save
"""
now = time.time()
r = []
for cookie in self:
if not ignore_discard and cookie.discard:
debug(" Not saving %s: marked for discard", cookie.name)
continue
if not ignore_expires and cookie.is_expired(now):
debug(" Not saving %s: expired", cookie.name)
continue
r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
return "\n".join(r+[""])
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
if filename is None:
if self.filename is not None: filename = self.filename
else: raise ValueError(MISSING_FILENAME_TEXT)
f = open(filename, "w")
try:
debug("Saving LWP cookies file")
# There really isn't an LWP Cookies 2.0 format, but this indicates
# that there is extra information in here (domain_dot and
# port_spec) while still being compatible with libwww-perl, I hope.
f.write("#LWP-Cookies-2.0\n")
f.write(self.as_lwp_str(ignore_discard, ignore_expires))
finally:
f.close()
def _really_load(self, f, filename, ignore_discard, ignore_expires):
magic = f.readline()
if not re.search(self.magic_re, magic):
msg = "%s does not seem to contain cookies" % filename
raise LoadError(msg)
now = time.time()
header = "Set-Cookie3:"
boolean_attrs = ("port_spec", "path_spec", "domain_dot",
"secure", "discard", "rfc2109")
value_attrs = ("version",
"port", "path", "domain",
"expires",
"comment", "commenturl")
try:
while 1:
line = f.readline()
if line == "": break
if not line.startswith(header):
continue
line = line[len(header):].strip()
for data in split_header_words([line]):
name, value = data[0]
standard = {}
rest = {}
for k in boolean_attrs:
standard[k] = False
for k, v in data[1:]:
if k is not None:
lc = k.lower()
else:
lc = None
# don't lose case distinction for unknown fields
if (lc in value_attrs) or (lc in boolean_attrs):
k = lc
if k in boolean_attrs:
if v is None: v = True
standard[k] = v
elif k in value_attrs:
standard[k] = v
else:
rest[k] = v
h = standard.get
expires = h("expires")
discard = h("discard")
if expires is not None:
expires = iso2time(expires)
if expires is None:
discard = True
domain = h("domain")
domain_specified = domain.startswith(".")
c = Cookie(h("version"), name, value,
h("port"), h("port_spec"),
domain, domain_specified, h("domain_dot"),
h("path"), h("path_spec"),
h("secure"),
expires,
discard,
h("comment"),
h("commenturl"),
rest,
h("rfc2109"),
)
if not ignore_discard and c.discard:
continue
if not ignore_expires and c.is_expired(now):
continue
self.set_cookie(c)
except:
reraise_unmasked_exceptions((IOError,))
raise LoadError("invalid Set-Cookie3 format file %s" % filename)
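# Illustrative sketch (assumed usage; file name and URL are placeholders):
# attach an LWPCookieJar to a Browser, reload any previously saved cookies,
# and write them back out in Set-Cookie3 format afterwards.
def _example_lwp_cookie_round_trip():
    import os
    import mechanize
    cj = mechanize.LWPCookieJar("cookies.lwp")
    if os.path.exists("cookies.lwp"):
        cj.load(ignore_discard=True, ignore_expires=True)
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.open("http://example.com/")
    cj.save(ignore_discard=True, ignore_expires=True)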

View File

@@ -0,0 +1,656 @@
"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize.
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
Copyright 2003 Andy Lester (original Perl code)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import urllib2, sys, copy, re
from _useragent import UserAgentBase
from _html import DefaultFactory
import _response
import _request
import _rfc3986
__version__ = (0, 1, 7, "b", None) # 0.1.7b
class BrowserStateError(Exception): pass
class LinkNotFoundError(Exception): pass
class FormNotFoundError(Exception): pass
class History:
"""
Though this will become public, the implied interface is not yet stable.
"""
def __init__(self):
self._history = [] # LIFO
def add(self, request, response):
self._history.append((request, response))
def back(self, n, _response):
response = _response # XXX move Browser._response into this class?
while n > 0 or response is None:
try:
request, response = self._history.pop()
except IndexError:
raise BrowserStateError("already at start of history")
n -= 1
return request, response
def clear(self):
del self._history[:]
def close(self):
for request, response in self._history:
if response is not None:
response.close()
del self._history[:]
class HTTPRefererProcessor(urllib2.BaseHandler):
def http_request(self, request):
# See RFC 2616 14.36. The only times we know the source of the
# request URI has a URI associated with it are redirect, and
# Browser.click() / Browser.submit() / Browser.follow_link().
# Otherwise, it's the user's job to add any Referer header before
# .open()ing.
if hasattr(request, "redirect_dict"):
request = self.parent._add_referer_header(
request, origin_request=False)
return request
https_request = http_request
class Browser(UserAgentBase):
"""Browser-like class with support for history, forms and links.
BrowserStateError is raised whenever the browser is in the wrong state to
complete the requested operation - eg., when .back() is called when the
browser history is empty, or when .follow_link() is called when the current
response does not contain HTML data.
Public attributes:
request: current request (mechanize.Request or urllib2.Request)
form: currently selected form (see .select_form())
"""
handler_classes = copy.copy(UserAgentBase.handler_classes)
handler_classes["_referer"] = HTTPRefererProcessor
default_features = copy.copy(UserAgentBase.default_features)
default_features.append("_referer")
def __init__(self,
factory=None,
history=None,
request_class=None,
):
"""
Only named arguments should be passed to this constructor.
factory: object implementing the mechanize.Factory interface.
history: object implementing the mechanize.History interface. Note
this interface is still experimental and may change in future.
request_class: Request class to use. Defaults to mechanize.Request
for Pythons older than 2.4, urllib2.Request otherwise.
The Factory and History objects passed in are 'owned' by the Browser,
so they should not be shared across Browsers. In particular,
factory.set_response() should not be called except by the owning
Browser itself.
Note that the supplied factory's request_class is overridden by this
constructor, to ensure only one Request class is used.
"""
self._handle_referer = True
if history is None:
history = History()
self._history = history
if request_class is None:
if not hasattr(urllib2.Request, "add_unredirected_header"):
request_class = _request.Request
else:
request_class = urllib2.Request # Python >= 2.4
if factory is None:
factory = DefaultFactory()
factory.set_request_class(request_class)
self._factory = factory
self.request_class = request_class
self.request = None
self._set_response(None, False)
# do this last to avoid __getattr__ problems
UserAgentBase.__init__(self)
def close(self):
UserAgentBase.close(self)
if self._response is not None:
self._response.close()
if self._history is not None:
self._history.close()
self._history = None
# make use after .close easy to spot
self.form = None
self.request = self._response = None
self.request = self.response = self.set_response = None
self.geturl = self.reload = self.back = None
self.clear_history = self.set_cookie = self.links = self.forms = None
self.viewing_html = self.encoding = self.title = None
self.select_form = self.click = self.submit = self.click_link = None
self.follow_link = self.find_link = None
def set_handle_referer(self, handle):
"""Set whether to add Referer header to each request.
This base class does not implement this feature (so don't turn this on
if you're using this base class directly), but the subclass
mechanize.Browser does.
"""
self._set_handler("_referer", handle)
self._handle_referer = bool(handle)
def _add_referer_header(self, request, origin_request=True):
if self.request is None:
return request
scheme = request.get_type()
original_scheme = self.request.get_type()
if scheme not in ["http", "https"]:
return request
if not origin_request and not self.request.has_header("Referer"):
return request
if (self._handle_referer and
original_scheme in ["http", "https"] and
not (original_scheme == "https" and scheme != "https")):
# strip URL fragment (RFC 2616 14.36)
parts = _rfc3986.urlsplit(self.request.get_full_url())
parts = parts[:-1]+(None,)
referer = _rfc3986.urlunsplit(parts)
request.add_unredirected_header("Referer", referer)
return request
def open_novisit(self, url, data=None):
"""Open a URL without visiting it.
The browser state (including .request, .response(), history, forms and
links) is left unchanged by calling this function.
The interface is the same as for .open().
This is useful for things like fetching images.
See also .retrieve().
"""
return self._mech_open(url, data, visit=False)
def open(self, url, data=None):
return self._mech_open(url, data)
def _mech_open(self, url, data=None, update_history=True, visit=None):
try:
url.get_full_url
except AttributeError:
# string URL -- convert to absolute URL if required
scheme, authority = _rfc3986.urlsplit(url)[:2]
if scheme is None:
# relative URL
if self._response is None:
raise BrowserStateError(
"can't fetch relative reference: "
"not viewing any document")
url = _rfc3986.urljoin(self._response.geturl(), url)
request = self._request(url, data, visit)
visit = request.visit
if visit is None:
visit = True
if visit:
self._visit_request(request, update_history)
success = True
try:
response = UserAgentBase.open(self, request, data)
except urllib2.HTTPError, error:
success = False
if error.fp is None: # not a response
raise
response = error
## except (IOError, socket.error, OSError), error:
## # Yes, urllib2 really does raise all these :-((
## # See test_urllib2.py for examples of socket.gaierror and OSError,
## # plus note that FTPHandler raises IOError.
## # XXX I don't seem to have an example of exactly socket.error being
## # raised, only socket.gaierror...
## # I don't want to start fixing these here, though, since this is a
## # subclass of OpenerDirector, and it would break old code. Even in
## # Python core, a fix would need some backwards-compat. hack to be
## # acceptable.
## raise
if visit:
self._set_response(response, False)
response = copy.copy(self._response)
elif response is not None:
response = _response.upgrade_response(response)
if not success:
raise response
return response
def __str__(self):
text = []
text.append("<%s " % self.__class__.__name__)
if self._response:
text.append("visiting %s" % self._response.geturl())
else:
text.append("(not visiting a URL)")
if self.form:
text.append("\n selected form:\n %s\n" % str(self.form))
text.append(">")
return "".join(text)
def response(self):
"""Return a copy of the current response.
The returned object has the same interface as the object returned by
.open() (or urllib2.urlopen()).
"""
return copy.copy(self._response)
def set_response(self, response):
"""Replace current response with (a copy of) response.
response may be None.
This is intended mostly for HTML-preprocessing.
"""
self._set_response(response, True)
def _set_response(self, response, close_current):
# sanity check, necessary but far from sufficient
if not (response is None or
(hasattr(response, "info") and hasattr(response, "geturl") and
hasattr(response, "read")
)
):
raise ValueError("not a response object")
self.form = None
if response is not None:
response = _response.upgrade_response(response)
if close_current and self._response is not None:
self._response.close()
self._response = response
self._factory.set_response(response)
def visit_response(self, response, request=None):
"""Visit the response, as if it had been .open()ed.
Unlike .set_response(), this updates history rather than replacing the
current response.
"""
if request is None:
request = _request.Request(response.geturl())
self._visit_request(request, True)
self._set_response(response, False)
def _visit_request(self, request, update_history):
if self._response is not None:
self._response.close()
if self.request is not None and update_history:
self._history.add(self.request, self._response)
self._response = None
# we want self.request to be assigned even if UserAgentBase.open
# fails
self.request = request
def geturl(self):
"""Get URL of current document."""
if self._response is None:
raise BrowserStateError("not viewing any document")
return self._response.geturl()
def reload(self):
"""Reload current document, and return response object."""
if self.request is None:
raise BrowserStateError("no URL has yet been .open()ed")
if self._response is not None:
self._response.close()
return self._mech_open(self.request, update_history=False)
def back(self, n=1):
"""Go back n steps in history, and return response object.
n: go back this number of steps (default 1 step)
"""
if self._response is not None:
self._response.close()
self.request, response = self._history.back(n, self._response)
self.set_response(response)
if not response.read_complete:
return self.reload()
return copy.copy(response)
def clear_history(self):
self._history.clear()
def set_cookie(self, cookie_string):
"""Request to set a cookie.
Note that it is NOT necessary to call this method under ordinary
circumstances: cookie handling is normally entirely automatic. The
intended use case is rather to simulate the setting of a cookie by
client script in a web page (e.g. JavaScript). In that case, use of
this method is necessary because mechanize currently does not support
JavaScript, VBScript, etc.
The cookie is added in the same way as if it had arrived with the
current response, as a result of the current request. This means that,
for example, if it is not appropriate to set the cookie based on the
current request, no cookie will be set.
The cookie will be returned automatically with subsequent responses
made by the Browser instance whenever that's appropriate.
cookie_string should be a valid value of the Set-Cookie header.
For example:
browser.set_cookie(
"sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")
Currently, this method does not allow for adding RFC 2965 cookies.
This limitation will be lifted if anybody requests it.
"""
if self._response is None:
raise BrowserStateError("not viewing any document")
if self.request.get_type() not in ["http", "https"]:
raise BrowserStateError("can't set cookie for non-HTTP/HTTPS "
"transactions")
cookiejar = self._ua_handlers["_cookies"].cookiejar
response = self.response() # copy
headers = response.info()
headers["Set-cookie"] = cookie_string
cookiejar.extract_cookies(response, self.request)
def links(self, **kwds):
"""Return iterable over links (mechanize.Link objects)."""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
links = self._factory.links()
if kwds:
return self._filter_links(links, **kwds)
else:
return links
def forms(self):
"""Return iterable over forms.
The returned form objects implement the ClientForm.HTMLForm interface.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
return self._factory.forms()
def global_form(self):
"""Return the global form object, or None if the factory implementation
did not supply one.
The "global" form object contains all controls that are not descendants of
any FORM element.
The returned form object implements the ClientForm.HTMLForm interface.
This is a separate method since the global form is not regarded as part
of the sequence of forms in the document -- mostly for
backwards-compatibility.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
return self._factory.global_form
def viewing_html(self):
"""Return whether the current response contains HTML data."""
if self._response is None:
raise BrowserStateError("not viewing any document")
return self._factory.is_html
def encoding(self):
""""""
if self._response is None:
raise BrowserStateError("not viewing any document")
return self._factory.encoding
def title(self):
"""Return title, or None if there is no title element in the document.
Tags are stripped or textified as described in docs for
PullParser.get_text() method of pullparser module.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
return self._factory.title
def select_form(self, name=None, predicate=None, nr=None):
"""Select an HTML form for input.
This is a bit like giving a form the "input focus" in a browser.
If a form is selected, the Browser object supports the HTMLForm
interface, so you can call methods like .set_value(), .set(), and
.click().
Another way to select a form is to assign to the .form attribute. The
form assigned should be one of the objects returned by the .forms()
method.
At least one of the name, predicate and nr arguments must be supplied.
If no matching form is found, mechanize.FormNotFoundError is raised.
If name is specified, then the form must have the indicated name.
If predicate is specified, then the form must match that function. The
predicate function is passed the HTMLForm as its single argument, and
should return a boolean value indicating whether the form matched.
nr, if supplied, is the sequence number of the form (where 0 is the
first). Note that form 0 is the first form matching all the other
arguments (if supplied); it is not necessarily the first form in the
document.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
if (name is None) and (predicate is None) and (nr is None):
raise ValueError(
"at least one argument must be supplied to specify form")
orig_nr = nr
for form in self.forms():
if name is not None and name != form.name:
continue
if predicate is not None and not predicate(form):
continue
if nr:
nr -= 1
continue
self.form = form
break # success
else:
# failure
description = []
if name is not None: description.append("name '%s'" % name)
if predicate is not None:
description.append("predicate %s" % predicate)
if orig_nr is not None: description.append("nr %d" % orig_nr)
description = ", ".join(description)
raise FormNotFoundError("no form matching "+description)
def click(self, *args, **kwds):
"""See ClientForm.HTMLForm.click for documentation."""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
request = self.form.click(*args, **kwds)
return self._add_referer_header(request)
def submit(self, *args, **kwds):
"""Submit current form.
Arguments are as for ClientForm.HTMLForm.click().
Return value is same as for Browser.open().
"""
return self.open(self.click(*args, **kwds))
def click_link(self, link=None, **kwds):
"""Find a link and return a Request object for it.
Arguments are as for .find_link(), except that a link may be supplied
as the first argument.
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
if not link:
link = self.find_link(**kwds)
else:
if kwds:
raise ValueError(
"either pass a Link, or keyword arguments, not both")
request = self.request_class(link.absolute_url)
return self._add_referer_header(request)
def follow_link(self, link=None, **kwds):
"""Find a link and .open() it.
Arguments are as for .click_link().
Return value is same as for Browser.open().
"""
return self.open(self.click_link(link, **kwds))
def find_link(self, **kwds):
"""Find a link in current page.
Links are returned as mechanize.Link objects.
# Return third link that .search()-matches the regexp "python"
# (by ".search()-matches", I mean that the regular expression method
# .search() is used, rather than .match()).
find_link(text_regex=re.compile("python"), nr=2)
# Return first http link in the current page that points to somewhere
# on python.org whose link text (after tags have been removed) is
# exactly "monty python".
find_link(text="monty python",
url_regex=re.compile("http.*python.org"))
# Return first link with exactly three HTML attributes.
find_link(predicate=lambda link: len(link.attrs) == 3)
Links include anchors (<a>), image maps (<area>), and frames (<frame>,
<iframe>).
All arguments must be passed by keyword, not position. Zero or more
arguments may be supplied. In order to find a link, all arguments
supplied must match.
If a matching link is not found, mechanize.LinkNotFoundError is raised.
text: link text between link tags: eg. <a href="blah">this bit</a> (as
returned by pullparser.get_compressed_text(), ie. without tags but
with opening tags "textified" as per the pullparser docs) must compare
equal to this argument, if supplied
text_regex: link text between tag (as defined above) must match the
regular expression object or regular expression string passed as this
argument, if supplied
name, name_regex: as for text and text_regex, but matched against the
name HTML attribute of the link tag
url, url_regex: as for text and text_regex, but matched against the
URL of the link tag (note this matches against Link.url, which is a
relative or absolute URL according to how it was written in the HTML)
tag: element name of opening tag, eg. "a"
predicate: a function taking a Link object as its single argument,
returning a boolean result, indicating whether the link matches
nr: matches the nth link that matches all other criteria (default 0)
"""
try:
return self._filter_links(self._factory.links(), **kwds).next()
except StopIteration:
raise LinkNotFoundError()
def __getattr__(self, name):
# pass through ClientForm / DOMForm methods and attributes
form = self.__dict__.get("form")
if form is None:
raise AttributeError(
"%s instance has no attribute %s (perhaps you forgot to "
".select_form()?)" % (self.__class__, name))
return getattr(form, name)
def _filter_links(self, links,
text=None, text_regex=None,
name=None, name_regex=None,
url=None, url_regex=None,
tag=None,
predicate=None,
nr=0
):
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
found_links = []
orig_nr = nr
for link in links:
if url is not None and url != link.url:
continue
if url_regex is not None and not re.search(url_regex, link.url):
continue
if (text is not None and
(link.text is None or text != link.text)):
continue
if (text_regex is not None and
(link.text is None or not re.search(text_regex, link.text))):
continue
if name is not None and name != dict(link.attrs).get("name"):
continue
if name_regex is not None:
link_name = dict(link.attrs).get("name")
if link_name is None or not re.search(name_regex, link_name):
continue
if tag is not None and tag != link.tag:
continue
if predicate is not None and not predicate(link):
continue
if nr:
nr -= 1
continue
yield link
nr = orig_nr
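# Illustrative sketch (assumed usage; URL, control name and link text are
# placeholders): open a page, fill in and submit the first form, then follow
# a link on the resulting page.
def _example_browser_session():
    import re
    import mechanize
    br = mechanize.Browser()
    br.open("http://example.com/login")
    br.select_form(nr=0)             # first form in the page
    br.form["user"] = "someone"      # control name is an assumption
    response = br.submit()
    print response.geturl(), br.title()
    return br.follow_link(text_regex=re.compile("next", re.I))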

View File

@@ -0,0 +1,159 @@
"""Mozilla / Netscape cookie loading / saving.
Copyright 2002-2006 John J Lee <jjl@pobox.com>
Copyright 1997-1999 Gisle Aas (original libwww-perl code)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import re, time, logging
from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
MISSING_FILENAME_TEXT, LoadError
debug = logging.getLogger("ClientCookie").debug
class MozillaCookieJar(FileCookieJar):
"""
WARNING: you may want to backup your browser's cookies file if you use
this class to save cookies. I *think* it works, but there have been
bugs in the past!
This class differs from CookieJar only in the format it uses to save and
load cookies to and from a file. This class uses the Mozilla/Netscape
`cookies.txt' format. lynx uses this file format, too.
Don't expect cookies saved while the browser is running to be noticed by
the browser (in fact, Mozilla on unix will overwrite your saved cookies if
you change them on disk while it's running; on Windows, you probably can't
save at all while the browser is running).
Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
Netscape cookies on saving.
In particular, the cookie version and port number information is lost,
together with information about whether or not Path, Port and Discard were
specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
domain as set in the HTTP header started with a dot (yes, I'm aware some
domains in Netscape files start with a dot and some don't -- trust me, you
really don't want to know any more about this).
Note that though Mozilla and Netscape use the same format, they use
slightly different headers. The class saves cookies using the Netscape
header by default (Mozilla can cope with that).
"""
magic_re = "#( Netscape)? HTTP Cookie File"
header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.
"""
def _really_load(self, f, filename, ignore_discard, ignore_expires):
now = time.time()
magic = f.readline()
if not re.search(self.magic_re, magic):
f.close()
raise LoadError(
"%s does not look like a Netscape format cookies file" %
filename)
try:
while 1:
line = f.readline()
if line == "": break
# last field may be absent, so keep any trailing tab
if line.endswith("\n"): line = line[:-1]
# skip comments and blank lines XXX what is $ for?
if (line.strip().startswith("#") or
line.strip().startswith("$") or
line.strip() == ""):
continue
domain, domain_specified, path, secure, expires, name, value = \
line.split("\t")
secure = (secure == "TRUE")
domain_specified = (domain_specified == "TRUE")
if name == "":
name = value
value = None
initial_dot = domain.startswith(".")
assert domain_specified == initial_dot
discard = False
if expires == "":
expires = None
discard = True
# assume path_specified is false
c = Cookie(0, name, value,
None, False,
domain, domain_specified, initial_dot,
path, False,
secure,
expires,
discard,
None,
None,
{})
if not ignore_discard and c.discard:
continue
if not ignore_expires and c.is_expired(now):
continue
self.set_cookie(c)
except:
reraise_unmasked_exceptions((IOError,))
raise LoadError("invalid Netscape format file %s: %s" %
(filename, line))
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
if filename is None:
if self.filename is not None: filename = self.filename
else: raise ValueError(MISSING_FILENAME_TEXT)
f = open(filename, "w")
try:
debug("Saving Netscape cookies.txt file")
f.write(self.header)
now = time.time()
for cookie in self:
if not ignore_discard and cookie.discard:
debug(" Not saving %s: marked for discard", cookie.name)
continue
if not ignore_expires and cookie.is_expired(now):
debug(" Not saving %s: expired", cookie.name)
continue
if cookie.secure: secure = "TRUE"
else: secure = "FALSE"
if cookie.domain.startswith("."): initial_dot = "TRUE"
else: initial_dot = "FALSE"
if cookie.expires is not None:
expires = str(cookie.expires)
else:
expires = ""
if cookie.value is None:
# cookies.txt regards 'Set-Cookie: foo' as a cookie
# with no name, whereas cookielib regards it as a
# cookie with no value.
name = ""
value = cookie.name
else:
name = cookie.name
value = cookie.value
f.write(
"\t".join([cookie.domain, initial_dot, cookie.path,
secure, expires, name, value])+
"\n")
finally:
f.close()
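# Illustrative sketch (assumed usage; path and URL are placeholders): reuse an
# existing Netscape/Mozilla cookies.txt in a Browser session.  As the class
# docstring warns, avoid saving back over the browser's own file.
def _example_load_cookies_txt():
    import mechanize
    cj = mechanize.MozillaCookieJar()
    cj.load("cookies.txt", ignore_discard=True, ignore_expires=True)
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    return br.open("http://example.com/")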

View File

@@ -0,0 +1,387 @@
"""Microsoft Internet Explorer cookie loading on Windows.
Copyright 2002-2003 Johnny Lee <typo_pl@hotmail.com> (MSIE Perl code)
Copyright 2002-2006 John J Lee <jjl@pobox.com> (The Python port)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
# XXX names and comments are not great here
import os, re, time, struct, logging
if os.name == "nt":
import _winreg
from _clientcookie import FileCookieJar, CookieJar, Cookie, \
MISSING_FILENAME_TEXT, LoadError
debug = logging.getLogger("mechanize").debug
def regload(path, leaf):
key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0,
_winreg.KEY_ALL_ACCESS)
try:
value = _winreg.QueryValueEx(key, leaf)[0]
except WindowsError:
value = None
return value
WIN32_EPOCH = 0x019db1ded53e8000L # 1970 Jan 01 00:00:00 in Win32 FILETIME
def epoch_time_offset_from_win32_filetime(filetime):
"""Convert from win32 filetime to seconds-since-epoch value.
MSIE stores create and expire times as Win32 FILETIME, which is 64
bits of 100 nanosecond intervals since Jan 01 1601.
mechanize expects time in 32-bit value expressed in seconds since the
epoch (Jan 01 1970).
"""
if filetime < WIN32_EPOCH:
raise ValueError("filetime (%d) is before epoch (%d)" %
(filetime, WIN32_EPOCH))
return divmod((filetime - WIN32_EPOCH), 10000000L)[0]
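# Worked example: FILETIME counts 100-nanosecond intervals, so one second is
# 10,000,000 units and a value one second past WIN32_EPOCH maps to one second
# past the Unix epoch.
def _example_filetime_conversion():
    assert epoch_time_offset_from_win32_filetime(WIN32_EPOCH) == 0
    assert epoch_time_offset_from_win32_filetime(WIN32_EPOCH + 10000000L) == 1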
def binary_to_char(c): return "%02X" % ord(c)
def binary_to_str(d): return "".join(map(binary_to_char, list(d)))
class MSIEBase:
magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*")
padding = "\x0d\xf0\xad\x0b"
msie_domain_re = re.compile(r"^([^/]+)(/.*)$")
cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?"
"(.+\@[\x21-\xFF]+\.txt)")
# path under HKEY_CURRENT_USER from which to get location of index.dat
reg_path = r"software\microsoft\windows" \
r"\currentversion\explorer\shell folders"
reg_key = "Cookies"
def __init__(self):
self._delayload_domains = {}
def _delayload_domain(self, domain):
# if necessary, lazily load cookies for this domain
delayload_info = self._delayload_domains.get(domain)
if delayload_info is not None:
cookie_file, ignore_discard, ignore_expires = delayload_info
try:
self.load_cookie_data(cookie_file,
ignore_discard, ignore_expires)
except (LoadError, IOError):
debug("error reading cookie file, skipping: %s", cookie_file)
else:
del self._delayload_domains[domain]
def _load_cookies_from_file(self, filename):
debug("Loading MSIE cookies file: %s", filename)
cookies = []
cookies_fh = open(filename)
try:
while 1:
key = cookies_fh.readline()
if key == "": break
rl = cookies_fh.readline
def getlong(rl=rl): return long(rl().rstrip())
def getstr(rl=rl): return rl().rstrip()
key = key.rstrip()
value = getstr()
domain_path = getstr()
flags = getlong() # 0x2000 bit is for secure I think
lo_expire = getlong()
hi_expire = getlong()
lo_create = getlong()
hi_create = getlong()
sep = getstr()
if "" in (key, value, domain_path, flags, hi_expire, lo_expire,
hi_create, lo_create, sep) or (sep != "*"):
break
m = self.msie_domain_re.search(domain_path)
if m:
domain = m.group(1)
path = m.group(2)
cookies.append({"KEY": key, "VALUE": value, "DOMAIN": domain,
"PATH": path, "FLAGS": flags, "HIXP": hi_expire,
"LOXP": lo_expire, "HICREATE": hi_create,
"LOCREATE": lo_create})
finally:
cookies_fh.close()
return cookies
def load_cookie_data(self, filename,
ignore_discard=False, ignore_expires=False):
"""Load cookies from file containing actual cookie data.
Old cookies are kept unless overwritten by newly loaded ones.
You should not call this method if the delayload attribute is set.
I think each of these files contains all cookies for one user, domain,
and path.
filename: file containing cookies -- usually found in a file like
C:\WINNT\Profiles\joe\Cookies\joe@blah[1].txt
"""
now = int(time.time())
cookie_data = self._load_cookies_from_file(filename)
for cookie in cookie_data:
flags = cookie["FLAGS"]
secure = ((flags & 0x2000) != 0)
filetime = (cookie["HIXP"] << 32) + cookie["LOXP"]
expires = epoch_time_offset_from_win32_filetime(filetime)
if expires < now:
discard = True
else:
discard = False
domain = cookie["DOMAIN"]
initial_dot = domain.startswith(".")
if initial_dot:
domain_specified = True
else:
# MSIE 5 does not record whether the domain cookie-attribute
# was specified.
# Assuming it wasn't is conservative, because with strict
# domain matching this will match less frequently; with regular
# Netscape tail-matching, this will match at exactly the same
# times that domain_specified = True would. It also means we
# don't have to prepend a dot to achieve consistency with our
# own & Mozilla's domain-munging scheme.
domain_specified = False
# assume path_specified is false
# XXX is there other stuff in here? -- eg. comment, commentURL?
c = Cookie(0,
cookie["KEY"], cookie["VALUE"],
None, False,
domain, domain_specified, initial_dot,
cookie["PATH"], False,
secure,
expires,
discard,
None,
None,
{"flags": flags})
if not ignore_discard and c.discard:
continue
if not ignore_expires and c.is_expired(now):
continue
CookieJar.set_cookie(self, c)
def load_from_registry(self, ignore_discard=False, ignore_expires=False,
username=None):
"""
username: only required on win9x
"""
cookies_dir = regload(self.reg_path, self.reg_key)
filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT"))
self.load(filename, ignore_discard, ignore_expires, username)
def _really_load(self, index, filename, ignore_discard, ignore_expires,
username):
now = int(time.time())
if username is None:
username = os.environ['USERNAME'].lower()
cookie_dir = os.path.dirname(filename)
data = index.read(256)
if len(data) != 256:
raise LoadError("%s file is too short" % filename)
# Cookies' index.dat file starts with 32 bytes of signature
# followed by an offset to the first record, stored as a little-
# endian DWORD.
sig, size, data = data[:32], data[32:36], data[36:]
size = struct.unpack("<L", size)[0]
# check that sig is valid
if not self.magic_re.match(sig) or size != 0x4000:
raise LoadError("%s ['%s' %s] does not seem to contain cookies" %
(str(filename), sig, size))
# skip to start of first record
index.seek(size, 0)
sector = 128 # size of sector in bytes
while 1:
data = ""
# Cookies are usually in two contiguous sectors, so read in two
# sectors and adjust if not a Cookie.
to_read = 2 * sector
d = index.read(to_read)
if len(d) != to_read:
break
data = data + d
# Each record starts with a 4-byte signature and a count
# (little-endian DWORD) of sectors for the record.
sig, size, data = data[:4], data[4:8], data[8:]
size = struct.unpack("<L", size)[0]
to_read = (size - 2) * sector
## from urllib import quote
## print "data", quote(data)
## print "sig", quote(sig)
## print "size in sectors", size
## print "size in bytes", size*sector
## print "size in units of 16 bytes", (size*sector) / 16
## print "size to read in bytes", to_read
## print
if sig != "URL ":
assert sig in ("HASH", "LEAK",
        self.padding, "\x00\x00\x00\x00"), \
    "unrecognized MSIE index.dat record: %s" % binary_to_str(sig)
if sig == "\x00\x00\x00\x00":
# assume we've got all the cookies, and stop
break
if sig == self.padding:
continue
# skip the rest of this record
assert to_read >= 0
if size != 2:
assert to_read != 0
index.seek(to_read, 1)
continue
# read in rest of record if necessary
if size > 2:
more_data = index.read(to_read)
if len(more_data) != to_read: break
data = data + more_data
cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username +
"(%s\@[\x21-\xFF]+\.txt)" % username)
m = re.search(cookie_re, data, re.I)
if m:
cookie_file = os.path.join(cookie_dir, m.group(2))
if not self.delayload:
try:
self.load_cookie_data(cookie_file,
ignore_discard, ignore_expires)
except (LoadError, IOError):
debug("error reading cookie file, skipping: %s",
cookie_file)
else:
domain = m.group(1)
i = domain.find("/")
if i != -1:
domain = domain[:i]
self._delayload_domains[domain] = (
cookie_file, ignore_discard, ignore_expires)
class MSIECookieJar(MSIEBase, FileCookieJar):
"""FileCookieJar that reads from the Windows MSIE cookies database.
MSIECookieJar can read the cookie files of Microsoft Internet Explorer
(MSIE) for Windows version 5 on Windows NT and version 6 on Windows XP and
Windows 98. Other configurations may also work, but are untested. Saving
cookies in MSIE format is NOT supported. If you save cookies, they'll be
in the usual Set-Cookie3 format, which you can read back in using an
instance of the plain old CookieJar class. Don't save using the same
filename that you loaded cookies from, because you may succeed in
clobbering your MSIE cookies index file!
You should be able to have LWP share Internet Explorer's cookies like
this (note you need to supply a username to load_from_registry if you're on
Windows 9x or Windows ME):
cj = MSIECookieJar(delayload=1)
# find cookies index file in registry and load cookies from it
cj.load_from_registry()
opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
response = opener.open("http://example.com/")
Iterating over a delayloaded MSIECookieJar instance will not cause any
cookies to be read from disk. To force reading of all cookies from disk,
call read_all_cookies. Note that the following methods iterate over self:
clear_temporary_cookies, clear_expired_cookies, __len__, __repr__, __str__
and as_string.
Additional methods:
load_from_registry(ignore_discard=False, ignore_expires=False,
username=None)
load_cookie_data(filename, ignore_discard=False, ignore_expires=False)
read_all_cookies()
"""
def __init__(self, filename=None, delayload=False, policy=None):
MSIEBase.__init__(self)
FileCookieJar.__init__(self, filename, delayload, policy)
def set_cookie(self, cookie):
if self.delayload:
self._delayload_domain(cookie.domain)
CookieJar.set_cookie(self, cookie)
def _cookies_for_request(self, request):
"""Return a list of cookies to be returned to server."""
domains = self._cookies.copy()
domains.update(self._delayload_domains)
domains = domains.keys()
cookies = []
for domain in domains:
cookies.extend(self._cookies_for_domain(domain, request))
return cookies
def _cookies_for_domain(self, domain, request):
if not self._policy.domain_return_ok(domain, request):
return []
debug("Checking %s for cookies to return", domain)
if self.delayload:
self._delayload_domain(domain)
return CookieJar._cookies_for_domain(self, domain, request)
def read_all_cookies(self):
"""Eagerly read in all cookies."""
if self.delayload:
for domain in self._delayload_domains.keys():
self._delayload_domain(domain)
def load(self, filename, ignore_discard=False, ignore_expires=False,
username=None):
"""Load cookies from an MSIE 'index.dat' cookies index file.
filename: full path to cookie index file
username: only required on win9x
"""
if filename is None:
if self.filename is not None: filename = self.filename
else: raise ValueError(MISSING_FILENAME_TEXT)
index = open(filename, "rb")
try:
self._really_load(index, filename, ignore_discard, ignore_expires,
username)
finally:
index.close()

View File

@@ -0,0 +1,421 @@
"""Integration with Python standard library module urllib2: OpenerDirector
class.
Copyright 2004-2006 John J Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import os, urllib2, bisect, urllib, httplib, types, tempfile
try:
import threading as _threading
except ImportError:
import dummy_threading as _threading
try:
set
except NameError:
import sets
set = sets.Set
import _http
import _upgrade
import _rfc3986
import _response
from _util import isstringlike
from _request import Request
class ContentTooShortError(urllib2.URLError):
def __init__(self, reason, result):
urllib2.URLError.__init__(self, reason)
self.result = result
class OpenerDirector(urllib2.OpenerDirector):
def __init__(self):
urllib2.OpenerDirector.__init__(self)
# really none of these are (sanely) public -- the lack of initial
# underscore on some is just due to following urllib2
self.process_response = {}
self.process_request = {}
self._any_request = {}
self._any_response = {}
self._handler_index_valid = True
self._tempfiles = []
def add_handler(self, handler):
if handler in self.handlers:
return
# XXX why does self.handlers need to be sorted?
bisect.insort(self.handlers, handler)
handler.add_parent(self)
self._handler_index_valid = False
def _maybe_reindex_handlers(self):
if self._handler_index_valid:
return
handle_error = {}
handle_open = {}
process_request = {}
process_response = {}
any_request = set()
any_response = set()
unwanted = []
for handler in self.handlers:
added = False
for meth in dir(handler):
if meth in ["redirect_request", "do_open", "proxy_open"]:
# oops, coincidental match
continue
if meth == "any_request":
any_request.add(handler)
added = True
continue
elif meth == "any_response":
any_response.add(handler)
added = True
continue
ii = meth.find("_")
scheme = meth[:ii]
condition = meth[ii+1:]
if condition.startswith("error"):
jj = meth[ii+1:].find("_") + ii + 1
kind = meth[jj+1:]
try:
kind = int(kind)
except ValueError:
pass
lookup = handle_error.setdefault(scheme, {})
elif condition == "open":
kind = scheme
lookup = handle_open
elif condition == "request":
kind = scheme
lookup = process_request
elif condition == "response":
kind = scheme
lookup = process_response
else:
continue
lookup.setdefault(kind, set()).add(handler)
added = True
if not added:
unwanted.append(handler)
for handler in unwanted:
self.handlers.remove(handler)
# sort indexed methods
# XXX could be cleaned up
for lookup in [process_request, process_response]:
for scheme, handlers in lookup.iteritems():
lookup[scheme] = handlers
for scheme, lookup in handle_error.iteritems():
for code, handlers in lookup.iteritems():
handlers = list(handlers)
handlers.sort()
lookup[code] = handlers
for scheme, handlers in handle_open.iteritems():
handlers = list(handlers)
handlers.sort()
handle_open[scheme] = handlers
# cache the indexes
self.handle_error = handle_error
self.handle_open = handle_open
self.process_request = process_request
self.process_response = process_response
self._any_request = any_request
self._any_response = any_response
def _request(self, url_or_req, data, visit):
if isstringlike(url_or_req):
req = Request(url_or_req, data, visit=visit)
else:
# already a urllib2.Request or mechanize.Request instance
req = url_or_req
if data is not None:
req.add_data(data)
# XXX yuck, give request a .visit attribute if it doesn't have one
try:
req.visit
except AttributeError:
req.visit = None
if visit is not None:
req.visit = visit
return req
def open(self, fullurl, data=None):
req = self._request(fullurl, data, None)
req_scheme = req.get_type()
self._maybe_reindex_handlers()
# pre-process request
# XXX should we allow a Processor to change the URL scheme
# of the request?
request_processors = set(self.process_request.get(req_scheme, []))
request_processors.update(self._any_request)
request_processors = list(request_processors)
request_processors.sort()
for processor in request_processors:
for meth_name in ["any_request", req_scheme+"_request"]:
meth = getattr(processor, meth_name, None)
if meth:
req = meth(req)
# In Python >= 2.4, .open() supports processors already, so we must
# call ._open() instead.
urlopen = getattr(urllib2.OpenerDirector, "_open",
urllib2.OpenerDirector.open)
response = urlopen(self, req, data)
# post-process response
response_processors = set(self.process_response.get(req_scheme, []))
response_processors.update(self._any_response)
response_processors = list(response_processors)
response_processors.sort()
for processor in response_processors:
for meth_name in ["any_response", req_scheme+"_response"]:
meth = getattr(processor, meth_name, None)
if meth:
response = meth(req, response)
return response
def error(self, proto, *args):
if proto in ['http', 'https']:
# XXX http[s] protocols are special-cased
dict = self.handle_error['http'] # https is not different than http
proto = args[2] # YUCK!
meth_name = 'http_error_%s' % proto
http_err = 1
orig_args = args
else:
dict = self.handle_error
meth_name = proto + '_error'
http_err = 0
args = (dict, proto, meth_name) + args
result = apply(self._call_chain, args)
if result:
return result
if http_err:
args = (dict, 'default', 'http_error_default') + orig_args
return apply(self._call_chain, args)
BLOCK_SIZE = 1024*8
def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
"""Returns (filename, headers).
For remote objects, the default filename will refer to a temporary
file. Temporary files are removed when the OpenerDirector.close()
method is called.
For file: URLs, at present the returned filename is None. This may
change in future.
If the actual number of bytes read is less than indicated by the
Content-Length header, raises ContentTooShortError (a URLError
subclass). The exception's .result attribute contains the (filename,
headers) that would have been returned.
"""
req = self._request(fullurl, data, False)
scheme = req.get_type()
fp = self.open(req)
headers = fp.info()
if filename is None and scheme == 'file':
# XXX req.get_selector() seems broken here, return None,
# pending sanity :-/
return None, headers
#return urllib.url2pathname(req.get_selector()), headers
if filename:
tfp = open(filename, 'wb')
else:
path = _rfc3986.urlsplit(fullurl)[2]
suffix = os.path.splitext(path)[1]
fd, filename = tempfile.mkstemp(suffix)
self._tempfiles.append(filename)
tfp = os.fdopen(fd, 'wb')
result = filename, headers
bs = self.BLOCK_SIZE
size = -1
read = 0
blocknum = 0
if reporthook:
if "content-length" in headers:
size = int(headers["Content-Length"])
reporthook(blocknum, bs, size)
while 1:
block = fp.read(bs)
if block == "":
break
read += len(block)
tfp.write(block)
blocknum += 1
if reporthook:
reporthook(blocknum, bs, size)
fp.close()
tfp.close()
del fp
del tfp
# raise exception if actual size does not match content-length header
if size >= 0 and read < size:
raise ContentTooShortError(
"retrieval incomplete: "
"got only %i out of %i bytes" % (read, size),
result
)
return result
def close(self):
urllib2.OpenerDirector.close(self)
# make it very obvious this object is no longer supposed to be used
self.open = self.error = self.retrieve = self.add_handler = None
if self._tempfiles:
for filename in self._tempfiles:
try:
os.unlink(filename)
except OSError:
pass
del self._tempfiles[:]
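# Illustrative sketch (assumed usage; URL and file name are placeholders):
# retrieve() -- exposed as mechanize.urlretrieve() -- downloads to a file,
# reports progress through an optional callback, and raises
# ContentTooShortError on a short read.
def _example_retrieve():
    import mechanize
    def report(block_count, block_size, total_size):
        print "read %d blocks (total size %d)" % (block_count, total_size)
    try:
        filename, headers = mechanize.urlretrieve(
            "http://example.com/big-file.zip", "big-file.zip",
            reporthook=report)
    except mechanize.ContentTooShortError, exc:
        filename, headers = exc.result  # the partial (filename, headers)
    return filename, headers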
def wrapped_open(urlopen, process_response_object, fullurl, data=None):
success = True
try:
response = urlopen(fullurl, data)
except urllib2.HTTPError, error:
success = False
if error.fp is None: # not a response
raise
response = error
if response is not None:
response = process_response_object(response)
if not success:
raise response
return response
class ResponseProcessingOpener(OpenerDirector):
def open(self, fullurl, data=None):
def bound_open(fullurl, data=None):
return OpenerDirector.open(self, fullurl, data)
return wrapped_open(
bound_open, self.process_response_object, fullurl, data)
def process_response_object(self, response):
return response
class SeekableResponseOpener(ResponseProcessingOpener):
def process_response_object(self, response):
return _response.seek_wrapped_response(response)
class OpenerFactory:
"""This class's interface is quite likely to change."""
default_classes = [
# handlers
urllib2.ProxyHandler,
urllib2.UnknownHandler,
_http.HTTPHandler, # derived from new AbstractHTTPHandler
_http.HTTPDefaultErrorHandler,
_http.HTTPRedirectHandler, # bugfixed
urllib2.FTPHandler,
urllib2.FileHandler,
# processors
_upgrade.HTTPRequestUpgradeProcessor,
_http.HTTPCookieProcessor,
_http.HTTPErrorProcessor,
]
if hasattr(httplib, 'HTTPS'):
default_classes.append(_http.HTTPSHandler)
handlers = []
replacement_handlers = []
def __init__(self, klass=OpenerDirector):
self.klass = klass
def build_opener(self, *handlers):
"""Create an opener object from a list of handlers and processors.
The opener will use several default handlers and processors, including
support for HTTP and FTP.
If any of the handlers passed as arguments are subclasses of the
default handlers, the default handlers will not be used.
"""
opener = self.klass()
default_classes = list(self.default_classes)
skip = []
for klass in default_classes:
for check in handlers:
if type(check) == types.ClassType:
if issubclass(check, klass):
skip.append(klass)
elif type(check) == types.InstanceType:
if isinstance(check, klass):
skip.append(klass)
for klass in skip:
default_classes.remove(klass)
for klass in default_classes:
opener.add_handler(klass())
for h in handlers:
if type(h) == types.ClassType:
h = h()
opener.add_handler(h)
return opener
build_opener = OpenerFactory().build_opener
_opener = None
urlopen_lock = _threading.Lock()
def urlopen(url, data=None):
global _opener
if _opener is None:
urlopen_lock.acquire()
try:
if _opener is None:
_opener = build_opener()
finally:
urlopen_lock.release()
return _opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
global _opener
if _opener is None:
urlopen_lock.acquire()
try:
if _opener is None:
_opener = build_opener()
finally:
urlopen_lock.release()
return _opener.retrieve(url, filename, reporthook, data)
def install_opener(opener):
global _opener
_opener = opener
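
# A minimal usage sketch (not part of the original module) of the
# module-level convenience API defined above.  The URL and filename are
# placeholders.
##
## response = urlopen("http://example.com/")
## print response.geturl(), response.info()["Content-Type"]
## print response.read(200)
## filename, headers = urlretrieve("http://example.com/", "page.html")
##
## # install_opener() replaces the opener used by urlopen()/urlretrieve()
## opener = build_opener()
## install_opener(opener)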

View File

@ -0,0 +1,334 @@
"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
Examples
This program extracts all links from a document. It will print one
line for each link, containing the URL and the textual description
between the <A>...</A> tags:
import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
for token in p.tags("a"):
if token.type == "endtag": continue
url = dict(token.attrs).get("href", "-")
text = p.get_compressed_text(endat=("endtag", "a"))
print "%s\t%s" % (url, text)
This program extracts the <TITLE> from the document:
import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
if p.get_tag("title"):
title = p.get_compressed_text()
print "Title: %s" % title
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
Copyright 1998-2001 Gisle Aas (original libwww-perl code)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses.
"""
import re, htmlentitydefs
import sgmllib, HTMLParser
from _html import unescape, unescape_charref
class NoMoreTokensError(Exception): pass
class Token:
"""Represents an HTML tag, declaration, processing instruction etc.
    Behaves as a tuple-like object (ie. iterable) and also has attributes
.type, .data and .attrs.
>>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
>>> t == ("starttag", "a", [("href", "http://www.python.org/")])
True
>>> (t.type, t.data) == ("starttag", "a")
True
>>> t.attrs == [("href", "http://www.python.org/")]
True
Public attributes
type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
"data", "comment", "decl", "pi", after the corresponding methods of
HTMLParser.HTMLParser
data: For a tag, the tag name; otherwise, the relevant data carried by the
tag, as a string
attrs: list of (name, value) pairs representing HTML attributes
(or None if token does not represent an opening tag)
"""
def __init__(self, type, data, attrs=None):
self.type = type
self.data = data
self.attrs = attrs
def __iter__(self):
return iter((self.type, self.data, self.attrs))
def __eq__(self, other):
type, data, attrs = other
if (self.type == type and
self.data == data and
self.attrs == attrs):
return True
else:
return False
def __ne__(self, other): return not self.__eq__(other)
def __repr__(self):
args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
return self.__class__.__name__+"(%s)" % args
def iter_until_exception(fn, exception, *args, **kwds):
while 1:
try:
yield fn(*args, **kwds)
except exception:
raise StopIteration
class _AbstractParser:
chunk = 1024
compress_re = re.compile(r"\s+")
def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
encoding="ascii", entitydefs=None):
"""
fh: file-like object (only a .read() method is required) from which to
read HTML to be parsed
textify: mapping used by .get_text() and .get_compressed_text() methods
to represent opening tags as text
encoding: encoding used to encode numeric character references by
.get_text() and .get_compressed_text() ("ascii" by default)
entitydefs: mapping like {"amp": "&", ...} containing HTML entity
definitions (a sensible default is used). This is used to unescape
entities in .get_text() (and .get_compressed_text()) and attribute
values. If the encoding can not represent the character, the entity
reference is left unescaped. Note that entity references (both
numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
unescaped in attribute values and the return value of .get_text(), but
not in data outside of tags. Instead, entity references outside of
tags are represented as tokens. This is a bit odd, it's true :-/
If the element name of an opening tag matches a key in the textify
mapping then that tag is converted to text. The corresponding value is
used to specify which tag attribute to obtain the text from. textify
maps from element names to either:
- an HTML attribute name, in which case the HTML attribute value is
used as its text value along with the element name in square
        brackets (eg. "alt text goes here[IMG]", or, if the alt attribute
were missing, just "[IMG]")
- a callable object (eg. a function) which takes a Token and returns
the string to be used as its text value
If textify has no key for an element name, nothing is substituted for
the opening tag.
Public attributes:
encoding and textify: see above
"""
self._fh = fh
self._tokenstack = [] # FIFO
self.textify = textify
self.encoding = encoding
if entitydefs is None:
entitydefs = htmlentitydefs.name2codepoint
self._entitydefs = entitydefs
def __iter__(self): return self
def tags(self, *names):
return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
def tokens(self, *tokentypes):
return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)
def next(self):
try:
return self.get_token()
except NoMoreTokensError:
raise StopIteration()
def get_token(self, *tokentypes):
"""Pop the next Token object from the stack of parsed tokens.
If arguments are given, they are taken to be token types in which the
caller is interested: tokens representing other elements will be
skipped. Element names must be given in lower case.
Raises NoMoreTokensError.
"""
while 1:
while self._tokenstack:
token = self._tokenstack.pop(0)
if tokentypes:
if token.type in tokentypes:
return token
else:
return token
data = self._fh.read(self.chunk)
if not data:
raise NoMoreTokensError()
self.feed(data)
def unget_token(self, token):
"""Push a Token back onto the stack."""
self._tokenstack.insert(0, token)
def get_tag(self, *names):
"""Return the next Token that represents an opening or closing tag.
If arguments are given, they are taken to be element names in which the
caller is interested: tags representing other elements will be skipped.
Element names must be given in lower case.
Raises NoMoreTokensError.
"""
while 1:
tok = self.get_token()
if tok.type not in ["starttag", "endtag", "startendtag"]:
continue
if names:
if tok.data in names:
return tok
else:
return tok
def get_text(self, endat=None):
"""Get some text.
endat: stop reading text at this tag (the tag is included in the
returned text); endtag is a tuple (type, name) where type is
"starttag", "endtag" or "startendtag", and name is the element name of
the tag (element names must be given in lower case)
If endat is not given, .get_text() will stop at the next opening or
closing tag, or when there are no more tokens (no exception is raised).
Note that .get_text() includes the text representation (if any) of the
opening tag, but pushes the opening tag back onto the stack. As a
result, if you want to call .get_text() again, you need to call
.get_tag() first (unless you want an empty string returned when you
next call .get_text()).
Entity references are translated using the value of the entitydefs
constructor argument (a mapping from names to characters like that
provided by the standard module htmlentitydefs). Named entity
references that are not in this mapping are left unchanged.
The textify attribute is used to translate opening tags into text: see
the class docstring.
"""
text = []
tok = None
while 1:
try:
tok = self.get_token()
except NoMoreTokensError:
# unget last token (not the one we just failed to get)
if tok: self.unget_token(tok)
break
if tok.type == "data":
text.append(tok.data)
elif tok.type == "entityref":
t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
text.append(t)
elif tok.type == "charref":
t = unescape_charref(tok.data, self.encoding)
text.append(t)
elif tok.type in ["starttag", "endtag", "startendtag"]:
tag_name = tok.data
if tok.type in ["starttag", "startendtag"]:
alt = self.textify.get(tag_name)
if alt is not None:
if callable(alt):
text.append(alt(tok))
elif tok.attrs is not None:
for k, v in tok.attrs:
if k == alt:
text.append(v)
text.append("[%s]" % tag_name.upper())
if endat is None or endat == (tok.type, tag_name):
self.unget_token(tok)
break
return "".join(text)
def get_compressed_text(self, *args, **kwds):
"""
As .get_text(), but collapses each group of contiguous whitespace to a
single space character, and removes all initial and trailing
whitespace.
"""
text = self.get_text(*args, **kwds)
text = text.strip()
return self.compress_re.sub(" ", text)
def handle_startendtag(self, tag, attrs):
self._tokenstack.append(Token("startendtag", tag, attrs))
def handle_starttag(self, tag, attrs):
self._tokenstack.append(Token("starttag", tag, attrs))
def handle_endtag(self, tag):
self._tokenstack.append(Token("endtag", tag))
def handle_charref(self, name):
self._tokenstack.append(Token("charref", name))
def handle_entityref(self, name):
self._tokenstack.append(Token("entityref", name))
def handle_data(self, data):
self._tokenstack.append(Token("data", data))
def handle_comment(self, data):
self._tokenstack.append(Token("comment", data))
def handle_decl(self, decl):
self._tokenstack.append(Token("decl", decl))
def unknown_decl(self, data):
# XXX should this call self.error instead?
#self.error("unknown declaration: " + `data`)
self._tokenstack.append(Token("decl", data))
def handle_pi(self, data):
self._tokenstack.append(Token("pi", data))
def unescape_attr(self, name):
return unescape(name, self._entitydefs, self.encoding)
def unescape_attrs(self, attrs):
escaped_attrs = []
for key, val in attrs:
escaped_attrs.append((key, self.unescape_attr(val)))
return escaped_attrs
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
def __init__(self, *args, **kwds):
HTMLParser.HTMLParser.__init__(self)
_AbstractParser.__init__(self, *args, **kwds)
def unescape(self, name):
# Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs.
return self.unescape_attr(name)
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
def __init__(self, *args, **kwds):
sgmllib.SGMLParser.__init__(self)
_AbstractParser.__init__(self, *args, **kwds)
def unknown_starttag(self, tag, attrs):
attrs = self.unescape_attrs(attrs)
self._tokenstack.append(Token("starttag", tag, attrs))
def unknown_endtag(self, tag):
self._tokenstack.append(Token("endtag", tag))
def _test():
import doctest, _pullparser
return doctest.testmod(_pullparser)
if __name__ == "__main__":
_test()

View File

@ -0,0 +1,86 @@
"""Integration with Python standard library module urllib2: Request class.
Copyright 2004-2006 John J Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import urllib2, urllib, logging
from _clientcookie import request_host
import _rfc3986
warn = logging.getLogger("mechanize").warning
# don't complain about missing logging handler
logging.getLogger("mechanize").setLevel(logging.ERROR)
class Request(urllib2.Request):
def __init__(self, url, data=None, headers={},
origin_req_host=None, unverifiable=False, visit=None):
# In mechanize 0.2, the interpretation of a unicode url argument will
# change: A unicode url argument will be interpreted as an IRI, and a
# bytestring as a URI. For now, we accept unicode or bytestring. We
# don't insist that the value is always a URI (specifically, must only
# contain characters which are legal), because that might break working
# code (who knows what bytes some servers want to see, especially with
# browser plugins for internationalised URIs).
if not _rfc3986.is_clean_uri(url):
warn("url argument is not a URI "
"(contains illegal characters) %r" % url)
urllib2.Request.__init__(self, url, data, headers)
self.selector = None
self.unredirected_hdrs = {}
self.visit = visit
# All the terminology below comes from RFC 2965.
self.unverifiable = unverifiable
# Set request-host of origin transaction.
# The origin request-host is needed in order to decide whether
# unverifiable sub-requests (automatic redirects, images embedded
# in HTML, etc.) are to third-party hosts. If they are, the
# resulting transactions might need to be conducted with cookies
# turned off.
if origin_req_host is None:
origin_req_host = request_host(self)
self.origin_req_host = origin_req_host
def get_selector(self):
return urllib.splittag(self.__r_host)[0]
def get_origin_req_host(self):
return self.origin_req_host
def is_unverifiable(self):
return self.unverifiable
def add_unredirected_header(self, key, val):
"""Add a header that will not be added to a redirected request."""
self.unredirected_hdrs[key.capitalize()] = val
def has_header(self, header_name):
"""True iff request has named header (regular or unredirected)."""
return (header_name in self.headers or
header_name in self.unredirected_hdrs)
def get_header(self, header_name, default=None):
return self.headers.get(
header_name,
self.unredirected_hdrs.get(header_name, default))
def header_items(self):
hdrs = self.unredirected_hdrs.copy()
hdrs.update(self.headers)
return hdrs.items()
def __str__(self):
return "<Request for %s>" % self.get_full_url()
def get_method(self):
if self.has_data():
return "POST"
else:
return "GET"

View File

@ -0,0 +1,515 @@
"""Response classes.
The seek_wrapper code is not used if you're using UserAgent with
.set_seekable_responses(False), or if you're using the urllib2-level interface
without SeekableProcessor or HTTPEquivProcessor. Class closeable_response is
instantiated by some handlers (AbstractHTTPHandler), but the closeable_response
interface is only depended upon by Browser-level code. Function
upgrade_response is only used if you're using Browser or
ResponseUpgradeProcessor.
Copyright 2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import copy, mimetools
from cStringIO import StringIO
import urllib2
# XXX Andrew Dalke kindly sent me a similar class in response to my request on
# comp.lang.python, which I then proceeded to lose. I wrote this class
# instead, but I think he's released his code publicly since, could pinch the
# tests from it, at least...
# For testing seek_wrapper invariant (note that
# test_urllib2.HandlerTest.test_seekable is expected to fail when this
# invariant checking is turned on). The invariant checking is done by module
# ipdc, which is available here:
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
## from ipdbc import ContractBase
## class seek_wrapper(ContractBase):
class seek_wrapper:
"""Adds a seek method to a file object.
This is only designed for seeking on readonly file-like objects.
Wrapped file-like object must have a read method. The readline method is
only supported if that method is present on the wrapped object. The
readlines method is always supported. xreadlines and iteration are
supported only for Python 2.2 and above.
Public attributes:
wrapped: the wrapped file object
is_closed: true iff .close() has been called
WARNING: All other attributes of the wrapped object (ie. those that are not
one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
are passed through unaltered, which may or may not make sense for your
particular file object.
"""
# General strategy is to check that cache is full enough, then delegate to
# the cache (self.__cache, which is a cStringIO.StringIO instance). A seek
# position (self.__pos) is maintained independently of the cache, in order
# that a single cache may be shared between multiple seek_wrapper objects.
# Copying using module copy shares the cache in this way.
def __init__(self, wrapped):
self.wrapped = wrapped
self.__read_complete_state = [False]
self.__is_closed_state = [False]
self.__have_readline = hasattr(self.wrapped, "readline")
self.__cache = StringIO()
self.__pos = 0 # seek position
def invariant(self):
# The end of the cache is always at the same place as the end of the
# wrapped file.
return self.wrapped.tell() == len(self.__cache.getvalue())
def close(self):
self.wrapped.close()
self.is_closed = True
def __getattr__(self, name):
if name == "is_closed":
return self.__is_closed_state[0]
elif name == "read_complete":
return self.__read_complete_state[0]
wrapped = self.__dict__.get("wrapped")
if wrapped:
return getattr(wrapped, name)
return getattr(self.__class__, name)
def __setattr__(self, name, value):
if name == "is_closed":
self.__is_closed_state[0] = bool(value)
elif name == "read_complete":
if not self.is_closed:
self.__read_complete_state[0] = bool(value)
else:
self.__dict__[name] = value
def seek(self, offset, whence=0):
assert whence in [0,1,2]
# how much data, if any, do we need to read?
if whence == 2: # 2: relative to end of *wrapped* file
if offset < 0: raise ValueError("negative seek offset")
# since we don't know yet where the end of that file is, we must
# read everything
to_read = None
else:
if whence == 0: # 0: absolute
if offset < 0: raise ValueError("negative seek offset")
dest = offset
else: # 1: relative to current position
pos = self.__pos
                if pos + offset < 0:
                    raise ValueError("seek to before start of file")
dest = pos + offset
end = len(self.__cache.getvalue())
to_read = dest - end
if to_read < 0:
to_read = 0
if to_read != 0:
self.__cache.seek(0, 2)
if to_read is None:
assert whence == 2
self.__cache.write(self.wrapped.read())
self.read_complete = True
self.__pos = self.__cache.tell() - offset
else:
data = self.wrapped.read(to_read)
if not data:
self.read_complete = True
else:
self.__cache.write(data)
# Don't raise an exception even if we've seek()ed past the end
# of .wrapped, since fseek() doesn't complain in that case.
# Also like fseek(), pretend we have seek()ed past the end,
# i.e. not:
#self.__pos = self.__cache.tell()
# but rather:
self.__pos = dest
else:
self.__pos = dest
def tell(self):
return self.__pos
def __copy__(self):
cpy = self.__class__(self.wrapped)
cpy.__cache = self.__cache
cpy.__read_complete_state = self.__read_complete_state
cpy.__is_closed_state = self.__is_closed_state
return cpy
def get_data(self):
pos = self.__pos
try:
self.seek(0)
return self.read(-1)
finally:
self.__pos = pos
def read(self, size=-1):
pos = self.__pos
end = len(self.__cache.getvalue())
available = end - pos
# enough data already cached?
if size <= available and size != -1:
self.__cache.seek(pos)
self.__pos = pos+size
return self.__cache.read(size)
# no, so read sufficient data from wrapped file and cache it
self.__cache.seek(0, 2)
if size == -1:
self.__cache.write(self.wrapped.read())
self.read_complete = True
else:
to_read = size - available
assert to_read > 0
data = self.wrapped.read(to_read)
if not data:
self.read_complete = True
else:
self.__cache.write(data)
self.__cache.seek(pos)
data = self.__cache.read(size)
self.__pos = self.__cache.tell()
assert self.__pos == pos + len(data)
return data
def readline(self, size=-1):
if not self.__have_readline:
raise NotImplementedError("no readline method on wrapped object")
# line we're about to read might not be complete in the cache, so
# read another line first
pos = self.__pos
self.__cache.seek(0, 2)
data = self.wrapped.readline()
if not data:
self.read_complete = True
else:
self.__cache.write(data)
self.__cache.seek(pos)
data = self.__cache.readline()
if size != -1:
r = data[:size]
self.__pos = pos+size
else:
r = data
self.__pos = pos+len(data)
return r
def readlines(self, sizehint=-1):
pos = self.__pos
self.__cache.seek(0, 2)
self.__cache.write(self.wrapped.read())
self.read_complete = True
self.__cache.seek(pos)
data = self.__cache.readlines(sizehint)
self.__pos = self.__cache.tell()
return data
def __iter__(self): return self
def next(self):
line = self.readline()
if line == "": raise StopIteration
return line
xreadlines = __iter__
def __repr__(self):
return ("<%s at %s whose wrapped object = %r>" %
(self.__class__.__name__, hex(abs(id(self))), self.wrapped))
class response_seek_wrapper(seek_wrapper):
"""
Supports copying response objects and setting response body data.
"""
def __init__(self, wrapped):
seek_wrapper.__init__(self, wrapped)
self._headers = self.wrapped.info()
def __copy__(self):
cpy = seek_wrapper.__copy__(self)
# copy headers from delegate
cpy._headers = copy.copy(self.info())
return cpy
# Note that .info() and .geturl() (the only two urllib2 response methods
# that are not implemented by seek_wrapper) must be here explicitly rather
    # than by seek_wrapper's __getattr__ delegation, so that the nasty
# dynamically-created HTTPError classes in get_seek_wrapper_class() get the
# wrapped object's implementation, and not HTTPError's.
def info(self):
return self._headers
def geturl(self):
return self.wrapped.geturl()
def set_data(self, data):
self.seek(0)
self.read()
self.close()
cache = self._seek_wrapper__cache = StringIO()
cache.write(data)
self.seek(0)
class eoffile:
# file-like object that always claims to be at end-of-file...
def read(self, size=-1): return ""
def readline(self, size=-1): return ""
def __iter__(self): return self
def next(self): return ""
def close(self): pass
class eofresponse(eoffile):
def __init__(self, url, headers, code, msg):
self._url = url
self._headers = headers
self.code = code
self.msg = msg
def geturl(self): return self._url
def info(self): return self._headers
class closeable_response:
"""Avoids unnecessarily clobbering urllib.addinfourl methods on .close().
Only supports responses returned by mechanize.HTTPHandler.
After .close(), the following methods are supported:
.read()
.readline()
.info()
.geturl()
.__iter__()
.next()
.close()
and the following attributes are supported:
.code
.msg
Also supports pickling (but the stdlib currently does something to prevent
it: http://python.org/sf/1144636).
"""
    # presence of this attr indicates the response is usable after .close()
closeable_response = None
def __init__(self, fp, headers, url, code, msg):
self._set_fp(fp)
self._headers = headers
self._url = url
self.code = code
self.msg = msg
def _set_fp(self, fp):
self.fp = fp
self.read = self.fp.read
self.readline = self.fp.readline
if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
if hasattr(self.fp, "fileno"):
self.fileno = self.fp.fileno
else:
self.fileno = lambda: None
self.__iter__ = self.fp.__iter__
self.next = self.fp.next
def __repr__(self):
return '<%s at %s whose fp = %r>' % (
self.__class__.__name__, hex(abs(id(self))), self.fp)
def info(self):
return self._headers
def geturl(self):
return self._url
def close(self):
self.fp._close = True
wrapped = self.fp
wrapped.close()
new_wrapped = eofresponse(
self._url, self._headers, self.code, self.msg)
self._set_fp(new_wrapped)
def __getstate__(self):
# There are three obvious options here:
# 1. truncate
# 2. read to end
# 3. close socket, pickle state including read position, then open
# again on unpickle and use Range header
# XXXX um, 4. refuse to pickle unless .close()d. This is better,
# actually ("errors should never pass silently"). Pickling doesn't
# work anyway ATM, because of http://python.org/sf/1144636 so fix
# this later
# 2 breaks pickle protocol, because one expects the original object
# to be left unscathed by pickling. 3 is too complicated and
# surprising (and too much work ;-) to happen in a sane __getstate__.
# So we do 1.
state = self.__dict__.copy()
new_wrapped = eofresponse(
self._url, self._headers, self.code, self.msg)
state["wrapped"] = new_wrapped
return state
def test_response(data='test data', headers=[],
url="http://example.com/", code=200, msg="OK"):
return make_response(data, headers, url, code, msg)
def test_html_response(data='test data', headers=[],
url="http://example.com/", code=200, msg="OK"):
headers += [("Content-type", "text/html")]
return make_response(data, headers, url, code, msg)
def make_response(data, headers, url, code, msg):
"""Convenient factory for objects implementing response interface.
data: string containing response body data
headers: sequence of (name, value) pairs
url: URL of response
code: integer response code (e.g. 200)
msg: string response code message (e.g. "OK")
"""
mime_headers = make_headers(headers)
r = closeable_response(StringIO(data), mime_headers, url, code, msg)
return response_seek_wrapper(r)
def make_headers(headers):
"""
headers: sequence of (name, value) pairs
"""
hdr_text = []
for name_value in headers:
hdr_text.append("%s: %s" % name_value)
return mimetools.Message(StringIO("\n".join(hdr_text)))
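
# A minimal sketch (not part of the original module) of the response factory
# defined above; the body, header and URL values are placeholders.
##
## r = make_response("hello", [("Content-type", "text/plain")],
##                   "http://example.com/", 200, "OK")
## r.read(2)                    # -> "he"
## r.seek(0)                    # possible thanks to response_seek_wrapper
## r.get_data()                 # -> "hello"
## r.info()["Content-type"]     # -> "text/plain"
## r.geturl()                   # -> "http://example.com/"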
# Rest of this module is especially horrible, but needed, at least until we
# fork urllib2.  Even then, we may want to preserve urllib2 compatibility.
def get_seek_wrapper_class(response):
# in order to wrap response objects that are also exceptions, we must
# dynamically subclass the exception :-(((
if (isinstance(response, urllib2.HTTPError) and
not hasattr(response, "seek")):
if response.__class__.__module__ == "__builtin__":
exc_class_name = response.__class__.__name__
else:
exc_class_name = "%s.%s" % (
response.__class__.__module__, response.__class__.__name__)
class httperror_seek_wrapper(response_seek_wrapper, response.__class__):
# this only derives from HTTPError in order to be a subclass --
# the HTTPError behaviour comes from delegation
_exc_class_name = exc_class_name
def __init__(self, wrapped):
response_seek_wrapper.__init__(self, wrapped)
# be compatible with undocumented HTTPError attributes :-(
self.hdrs = wrapped.info()
self.filename = wrapped.geturl()
def __repr__(self):
return (
"<%s (%s instance) at %s "
"whose wrapped object = %r>" % (
self.__class__.__name__, self._exc_class_name,
hex(abs(id(self))), self.wrapped)
)
wrapper_class = httperror_seek_wrapper
else:
wrapper_class = response_seek_wrapper
return wrapper_class
def seek_wrapped_response(response):
"""Return a copy of response that supports seekable response interface.
Accepts responses from both mechanize and urllib2 handlers.
    Copes with both ordinary response instances and HTTPError instances (which
can't be simply wrapped due to the requirement of preserving the exception
base class).
"""
if not hasattr(response, "seek"):
wrapper_class = get_seek_wrapper_class(response)
response = wrapper_class(response)
assert hasattr(response, "get_data")
return response
def upgrade_response(response):
"""Return a copy of response that supports Browser response interface.
Browser response interface is that of "seekable responses"
(response_seek_wrapper), plus the requirement that responses must be
useable after .close() (closeable_response).
Accepts responses from both mechanize and urllib2 handlers.
Copes with both ordinary response instances and HTTPError instances (which
can't be simply wrapped due to the requirement of preserving the exception
base class).
"""
wrapper_class = get_seek_wrapper_class(response)
if hasattr(response, "closeable_response"):
if not hasattr(response, "seek"):
response = wrapper_class(response)
assert hasattr(response, "get_data")
return copy.copy(response)
# a urllib2 handler constructed the response, i.e. the response is an
# urllib.addinfourl or a urllib2.HTTPError, instead of a
# _Util.closeable_response as returned by e.g. mechanize.HTTPHandler
try:
code = response.code
except AttributeError:
code = None
try:
msg = response.msg
except AttributeError:
msg = None
# may have already-.read() data from .seek() cache
data = None
get_data = getattr(response, "get_data", None)
if get_data:
data = get_data()
response = closeable_response(
response.fp, response.info(), response.geturl(), code, msg)
response = wrapper_class(response)
if data:
response.set_data(data)
return response
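
# A minimal sketch (not part of the original module): giving a
# urllib2.HTTPError the seekable-response interface via
# seek_wrapped_response(), which relies on get_seek_wrapper_class() to
# subclass the exception dynamically.  The URL and body are placeholders.
##
## err = urllib2.HTTPError("http://example.com/missing", 404, "Not Found",
##                         make_headers([("Content-type", "text/html")]),
##                         StringIO("gone"))
## wrapped = seek_wrapped_response(err)
## wrapped.get_data()                      # -> "gone"
## wrapped.seek(0)
## isinstance(wrapped, urllib2.HTTPError)  # -> True, so it can still be raised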

View File

@ -0,0 +1,240 @@
"""RFC 3986 URI parsing and relative reference resolution / absolutization.
(aka splitting and joining)
Copyright 2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
import sys, re, posixpath, urllib
## def chr_range(a, b):
## return "".join(map(chr, range(ord(a), ord(b)+1)))
## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
## "abcdefghijklmnopqrstuvwxyz"
## "0123456789"
## "-_.~")
## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
# this re matches any character that's not in URI_CHARS
BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
def clean_url(url, encoding):
# percent-encode illegal URI characters
    # Trying to come up with test cases for this gave me a headache, revisit
    # when we do switch to unicode.
# Somebody else's comments (lost the attribution):
## - IE will return you the url in the encoding you send it
## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
## characters in your link. It will send you utf-8 however if there are...
if type(url) == type(""):
url = url.decode(encoding, "replace")
url = url.strip()
# for second param to urllib.quote(), we want URI_CHARS, minus the
# 'always_safe' characters that urllib.quote() never percent-encodes
return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
def is_clean_uri(uri):
"""
>>> is_clean_uri("ABC!")
True
>>> is_clean_uri(u"ABC!")
True
>>> is_clean_uri("ABC|")
False
>>> is_clean_uri(u"ABC|")
False
>>> is_clean_uri("http://example.com/0")
True
>>> is_clean_uri(u"http://example.com/0")
True
"""
    # note module re treats bytestrings as though they were decoded as latin-1
# so this function accepts both unicode and bytestrings
return not bool(BAD_URI_CHARS_RE.search(uri))
SPLIT_MATCH = re.compile(
r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match
def urlsplit(absolute_uri):
"""Return scheme, authority, path, query, fragment."""
match = SPLIT_MATCH(absolute_uri)
if match:
g = match.groups()
return g[1], g[3], g[4], g[6], g[8]
def urlunsplit(parts):
scheme, authority, path, query, fragment = parts
r = []
append = r.append
if scheme is not None:
append(scheme)
append(":")
if authority is not None:
append("//")
append(authority)
append(path)
if query is not None:
append("?")
append(query)
if fragment is not None:
append("#")
append(fragment)
return "".join(r)
def urljoin(base_uri, uri_reference):
return urlunsplit(urljoin_parts(urlsplit(base_uri),
urlsplit(uri_reference)))
# oops, this doesn't do the same thing as the literal translation
# from the RFC below
## def urljoin_parts(base_parts, reference_parts):
## scheme, authority, path, query, fragment = base_parts
## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
## # compute target URI path
## if rpath == "":
## tpath = path
## else:
## tpath = rpath
## if not tpath.startswith("/"):
## tpath = merge(authority, path, tpath)
## tpath = posixpath.normpath(tpath)
## if rscheme is not None:
## return (rscheme, rauthority, tpath, rquery, rfragment)
## elif rauthority is not None:
## return (scheme, rauthority, tpath, rquery, rfragment)
## elif rpath == "":
## if rquery is not None:
## tquery = rquery
## else:
## tquery = query
## return (scheme, authority, tpath, tquery, rfragment)
## else:
## return (scheme, authority, tpath, rquery, rfragment)
def urljoin_parts(base_parts, reference_parts):
scheme, authority, path, query, fragment = base_parts
rscheme, rauthority, rpath, rquery, rfragment = reference_parts
if rscheme == scheme:
rscheme = None
if rscheme is not None:
tscheme, tauthority, tpath, tquery = (
rscheme, rauthority, remove_dot_segments(rpath), rquery)
else:
if rauthority is not None:
tauthority, tpath, tquery = (
rauthority, remove_dot_segments(rpath), rquery)
else:
if rpath == "":
tpath = path
if rquery is not None:
tquery = rquery
else:
tquery = query
else:
if rpath.startswith("/"):
tpath = remove_dot_segments(rpath)
else:
tpath = merge(authority, path, rpath)
tpath = remove_dot_segments(tpath)
tquery = rquery
tauthority = authority
tscheme = scheme
tfragment = rfragment
return (tscheme, tauthority, tpath, tquery, tfragment)
# um, something *vaguely* like this is what I want, but I have to generate
# lots of test cases first, if only to understand what it is that
# remove_dot_segments really does...
## def remove_dot_segments(path):
## if path == '':
## return ''
## comps = path.split('/')
## new_comps = []
## for comp in comps:
## if comp in ['.', '']:
## if not new_comps or new_comps[-1]:
## new_comps.append('')
## continue
## if comp != '..':
## new_comps.append(comp)
## elif new_comps:
## new_comps.pop()
## return '/'.join(new_comps)
def remove_dot_segments(path):
r = []
while path:
# A
if path.startswith("../"):
path = path[3:]
continue
if path.startswith("./"):
path = path[2:]
continue
# B
if path.startswith("/./"):
path = path[2:]
continue
if path == "/.":
path = "/"
continue
# C
if path.startswith("/../"):
path = path[3:]
if r:
r.pop()
continue
if path == "/..":
path = "/"
if r:
r.pop()
continue
# D
if path == ".":
path = path[1:]
continue
if path == "..":
path = path[2:]
continue
# E
start = 0
if path.startswith("/"):
start = 1
ii = path.find("/", start)
if ii < 0:
ii = None
r.append(path[:ii])
if ii is None:
break
path = path[ii:]
return "".join(r)
def merge(base_authority, base_path, ref_path):
# XXXX Oddly, the sample Perl implementation of this by Roy Fielding
# doesn't even take base_authority as a parameter, despite the wording in
# the RFC suggesting otherwise. Perhaps I'm missing some obvious identity.
#if base_authority is not None and base_path == "":
if base_path == "":
return "/" + ref_path
ii = base_path.rfind("/")
if ii >= 0:
return base_path[:ii+1] + ref_path
return ref_path
if __name__ == "__main__":
import doctest
doctest.testmod()
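
# A few illustrative results (not part of the original module) for the
# functions defined above:
##
## urlsplit("http://example.com/a/b?q=1#frag")
##     # -> ('http', 'example.com', '/a/b', 'q=1', 'frag')
## urljoin("http://example.com/a/b/c", "../d")
##     # -> 'http://example.com/a/d'
## remove_dot_segments("/a/b/../c/./d")
##     # -> '/a/c/d'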

View File

@ -0,0 +1,16 @@
from urllib2 import BaseHandler
from _util import deprecation
from _response import response_seek_wrapper
class SeekableProcessor(BaseHandler):
"""Deprecated: Make responses seekable."""
def __init__(self):
deprecation(
"See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
def any_response(self, request, response):
if not hasattr(response, "seek"):
return response_seek_wrapper(response)
return response

View File

@ -0,0 +1,40 @@
from urllib2 import BaseHandler
from _request import Request
from _response import upgrade_response
from _util import deprecation
class HTTPRequestUpgradeProcessor(BaseHandler):
# upgrade urllib2.Request to this module's Request
# yuck!
handler_order = 0 # before anything else
def http_request(self, request):
if not hasattr(request, "add_unredirected_header"):
newrequest = Request(request._Request__original, request.data,
request.headers)
try: newrequest.origin_req_host = request.origin_req_host
except AttributeError: pass
try: newrequest.unverifiable = request.unverifiable
except AttributeError: pass
try: newrequest.visit = request.visit
except AttributeError: pass
request = newrequest
return request
https_request = http_request
class ResponseUpgradeProcessor(BaseHandler):
# upgrade responses to be .close()able without becoming unusable
handler_order = 0 # before anything else
def __init__(self):
deprecation(
"See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
def any_response(self, request, response):
if not hasattr(response, 'closeable_response'):
response = upgrade_response(response)
return response

View File

@ -0,0 +1,62 @@
# urllib2 work-alike interface
# ...from urllib2...
from urllib2 import \
URLError, \
HTTPError, \
GopherError
# ...and from mechanize
from _opener import OpenerDirector, \
SeekableResponseOpener, \
build_opener, install_opener, urlopen
from _auth import \
HTTPPasswordMgr, \
HTTPPasswordMgrWithDefaultRealm, \
AbstractBasicAuthHandler, \
AbstractDigestAuthHandler, \
HTTPProxyPasswordMgr, \
ProxyHandler, \
ProxyBasicAuthHandler, \
ProxyDigestAuthHandler, \
HTTPBasicAuthHandler, \
HTTPDigestAuthHandler, \
HTTPSClientCertMgr
from _request import \
Request
from _http import \
RobotExclusionError
# handlers...
# ...from urllib2...
from urllib2 import \
BaseHandler, \
UnknownHandler, \
FTPHandler, \
CacheFTPHandler, \
FileHandler, \
GopherHandler
# ...and from mechanize
from _http import \
HTTPHandler, \
HTTPDefaultErrorHandler, \
HTTPRedirectHandler, \
HTTPEquivProcessor, \
HTTPCookieProcessor, \
HTTPRefererProcessor, \
HTTPRefreshProcessor, \
HTTPErrorProcessor, \
HTTPRobotRulesProcessor
from _upgrade import \
HTTPRequestUpgradeProcessor, \
ResponseUpgradeProcessor
from _debug import \
HTTPResponseDebugProcessor, \
HTTPRedirectDebugProcessor
from _seek import \
SeekableProcessor
# crap ATM
## from _gzip import \
## HTTPGzipProcessor
import httplib
if hasattr(httplib, 'HTTPS'):
from _http import HTTPSHandler
del httplib

View File

@ -0,0 +1,348 @@
"""Convenient HTTP UserAgent class.
This is a subclass of urllib2.OpenerDirector.
Copyright 2003-2006 John J. Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import sys, warnings, urllib2
import _opener
import _urllib2
import _auth
import _gzip
import _response
class UserAgentBase(_opener.OpenerDirector):
"""Convenient user-agent class.
Do not use .add_handler() to add a handler for something already dealt with
by this code.
The only reason at present for the distinction between UserAgent and
UserAgentBase is so that classes that depend on .seek()able responses
(e.g. mechanize.Browser) can inherit from UserAgentBase. The subclass
UserAgent exposes a .set_seekable_responses() method that allows switching
off the adding of a .seek() method to responses.
Public attributes:
addheaders: list of (name, value) pairs specifying headers to send with
every request, unless they are overridden in the Request instance.
>>> ua = UserAgentBase()
>>> ua.addheaders = [
... ("User-agent", "Mozilla/5.0 (compatible)"),
... ("From", "responsible.person@example.com")]
"""
handler_classes = {
# scheme handlers
"http": _urllib2.HTTPHandler,
# CacheFTPHandler is buggy, at least in 2.3, so we don't use it
"ftp": _urllib2.FTPHandler,
"file": _urllib2.FileHandler,
"gopher": _urllib2.GopherHandler,
# other handlers
"_unknown": _urllib2.UnknownHandler,
# HTTP{S,}Handler depend on HTTPErrorProcessor too
"_http_error": _urllib2.HTTPErrorProcessor,
"_http_request_upgrade": _urllib2.HTTPRequestUpgradeProcessor,
"_http_default_error": _urllib2.HTTPDefaultErrorHandler,
# feature handlers
"_basicauth": _urllib2.HTTPBasicAuthHandler,
"_digestauth": _urllib2.HTTPDigestAuthHandler,
"_redirect": _urllib2.HTTPRedirectHandler,
"_cookies": _urllib2.HTTPCookieProcessor,
"_refresh": _urllib2.HTTPRefreshProcessor,
"_equiv": _urllib2.HTTPEquivProcessor,
"_proxy": _urllib2.ProxyHandler,
"_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
"_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
"_robots": _urllib2.HTTPRobotRulesProcessor,
"_gzip": _gzip.HTTPGzipProcessor, # experimental!
# debug handlers
"_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
"_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
}
default_schemes = ["http", "ftp", "file", "gopher"]
default_others = ["_unknown", "_http_error", "_http_request_upgrade",
"_http_default_error",
]
default_features = ["_redirect", "_cookies",
"_refresh", "_equiv",
"_basicauth", "_digestauth",
"_proxy", "_proxy_basicauth", "_proxy_digestauth",
"_robots",
]
if hasattr(_urllib2, 'HTTPSHandler'):
handler_classes["https"] = _urllib2.HTTPSHandler
default_schemes.append("https")
def __init__(self):
_opener.OpenerDirector.__init__(self)
ua_handlers = self._ua_handlers = {}
for scheme in (self.default_schemes+
self.default_others+
self.default_features):
klass = self.handler_classes[scheme]
ua_handlers[scheme] = klass()
for handler in ua_handlers.itervalues():
self.add_handler(handler)
# Yuck.
# Ensure correct default constructor args were passed to
# HTTPRefreshProcessor and HTTPEquivProcessor.
if "_refresh" in ua_handlers:
self.set_handle_refresh(True)
if "_equiv" in ua_handlers:
self.set_handle_equiv(True)
# Ensure default password managers are installed.
pm = ppm = None
if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
if ("_proxy_basicauth" in ua_handlers or
"_proxy_digestauth" in ua_handlers):
ppm = _auth.HTTPProxyPasswordMgr()
self.set_password_manager(pm)
self.set_proxy_password_manager(ppm)
# set default certificate manager
if "https" in ua_handlers:
cm = _urllib2.HTTPSClientCertMgr()
self.set_client_cert_manager(cm)
def close(self):
_opener.OpenerDirector.close(self)
self._ua_handlers = None
# XXX
## def set_timeout(self, timeout):
## self._timeout = timeout
## def set_http_connection_cache(self, conn_cache):
## self._http_conn_cache = conn_cache
## def set_ftp_connection_cache(self, conn_cache):
## # XXX ATM, FTP has cache as part of handler; should it be separate?
## self._ftp_conn_cache = conn_cache
def set_handled_schemes(self, schemes):
"""Set sequence of URL scheme (protocol) strings.
For example: ua.set_handled_schemes(["http", "ftp"])
If this fails (with ValueError) because you've passed an unknown
scheme, the set of handled schemes will not be changed.
"""
want = {}
for scheme in schemes:
if scheme.startswith("_"):
raise ValueError("not a scheme '%s'" % scheme)
if scheme not in self.handler_classes:
raise ValueError("unknown scheme '%s'")
want[scheme] = None
# get rid of scheme handlers we don't want
for scheme, oldhandler in self._ua_handlers.items():
if scheme.startswith("_"): continue # not a scheme handler
if scheme not in want:
self._replace_handler(scheme, None)
else:
del want[scheme] # already got it
# add the scheme handlers that are missing
for scheme in want.keys():
self._set_handler(scheme, True)
def set_cookiejar(self, cookiejar):
"""Set a mechanize.CookieJar, or None."""
self._set_handler("_cookies", obj=cookiejar)
# XXX could use Greg Stein's httpx for some of this instead?
# or httplib2??
def set_proxies(self, proxies):
"""Set a dictionary mapping URL scheme to proxy specification, or None.
e.g. {"http": "joe:password@myproxy.example.com:3128",
"ftp": "proxy.example.com"}
"""
self._set_handler("_proxy", obj=proxies)
def add_password(self, url, user, password, realm=None):
self._password_manager.add_password(realm, url, user, password)
def add_proxy_password(self, user, password, hostport=None, realm=None):
self._proxy_password_manager.add_password(
realm, hostport, user, password)
def add_client_certificate(self, url, key_file, cert_file):
"""Add an SSL client certificate, for HTTPS client auth.
key_file and cert_file must be filenames of the key and certificate
files, in PEM format. You can use e.g. OpenSSL to convert a p12 (PKCS
12) file to PEM format:
openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
openssl pkcs12 -nocerts -in cert.p12 -out key.pem
Note that client certificate password input is very inflexible ATM. At
the moment this seems to be console only, which is presumably the
default behaviour of libopenssl. In future mechanize may support
third-party libraries that (I assume) allow more options here.
"""
self._client_cert_manager.add_key_cert(url, key_file, cert_file)
# the following are rarely useful -- use add_password / add_proxy_password
# instead
def set_password_manager(self, password_manager):
"""Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
self._password_manager = password_manager
self._set_handler("_basicauth", obj=password_manager)
self._set_handler("_digestauth", obj=password_manager)
def set_proxy_password_manager(self, password_manager):
"""Set a mechanize.HTTPProxyPasswordMgr, or None."""
self._proxy_password_manager = password_manager
self._set_handler("_proxy_basicauth", obj=password_manager)
self._set_handler("_proxy_digestauth", obj=password_manager)
def set_client_cert_manager(self, cert_manager):
"""Set a mechanize.HTTPClientCertMgr, or None."""
self._client_cert_manager = cert_manager
handler = self._ua_handlers["https"]
handler.client_cert_manager = cert_manager
# these methods all take a boolean parameter
def set_handle_robots(self, handle):
"""Set whether to observe rules from robots.txt."""
self._set_handler("_robots", handle)
def set_handle_redirect(self, handle):
"""Set whether to handle HTTP 30x redirections."""
self._set_handler("_redirect", handle)
def set_handle_refresh(self, handle, max_time=None, honor_time=True):
"""Set whether to handle HTTP Refresh headers."""
self._set_handler("_refresh", handle, constructor_kwds=
{"max_time": max_time, "honor_time": honor_time})
def set_handle_equiv(self, handle, head_parser_class=None):
"""Set whether to treat HTML http-equiv headers like HTTP headers.
Response objects may be .seek()able if this is set (currently returned
responses are, raised HTTPError exception responses are not).
"""
if head_parser_class is not None:
constructor_kwds = {"head_parser_class": head_parser_class}
else:
constructor_kwds={}
self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)
def set_handle_gzip(self, handle):
"""Handle gzip transfer encoding.
"""
if handle:
warnings.warn(
"gzip transfer encoding is experimental!", stacklevel=2)
self._set_handler("_gzip", handle)
def set_debug_redirects(self, handle):
"""Log information about HTTP redirects (including refreshes).
Logging is performed using module logging. The logger name is
"mechanize.http_redirects". To actually print some debug output,
eg:
import sys, logging
logger = logging.getLogger("mechanize.http_redirects")
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.INFO)
Other logger names relevant to this module:
"mechanize.http_responses"
"mechanize.cookies" (or "cookielib" if running Python 2.4)
To turn on everything:
import sys, logging
logger = logging.getLogger("mechanize")
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.INFO)
"""
self._set_handler("_debug_redirect", handle)
def set_debug_responses(self, handle):
"""Log HTTP response bodies.
See docstring for .set_debug_redirects() for details of logging.
Response objects may be .seek()able if this is set (currently returned
responses are, raised HTTPError exception responses are not).
"""
self._set_handler("_debug_response_body", handle)
def set_debug_http(self, handle):
"""Print HTTP headers to sys.stdout."""
level = int(bool(handle))
for scheme in "http", "https":
h = self._ua_handlers.get(scheme)
if h is not None:
h.set_http_debuglevel(level)
def _set_handler(self, name, handle=None, obj=None,
constructor_args=(), constructor_kwds={}):
if handle is None:
handle = obj is not None
if handle:
handler_class = self.handler_classes[name]
if obj is not None:
newhandler = handler_class(obj)
else:
newhandler = handler_class(*constructor_args, **constructor_kwds)
else:
newhandler = None
self._replace_handler(name, newhandler)
def _replace_handler(self, name, newhandler=None):
# first, if handler was previously added, remove it
if name is not None:
handler = self._ua_handlers.get(name)
if handler:
try:
self.handlers.remove(handler)
except ValueError:
pass
# then add the replacement, if any
if newhandler is not None:
self.add_handler(newhandler)
self._ua_handlers[name] = newhandler
class UserAgent(UserAgentBase):
def __init__(self):
UserAgentBase.__init__(self)
self._seekable = False
def set_seekable_responses(self, handle):
"""Make response objects .seek()able."""
self._seekable = bool(handle)
def open(self, fullurl, data=None):
if self._seekable:
def bound_open(fullurl, data=None):
return UserAgentBase.open(self, fullurl, data)
response = _opener.wrapped_open(
bound_open, _response.seek_wrapped_response, fullurl, data)
else:
response = UserAgentBase.open(self, fullurl, data)
return response
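
# A minimal configuration sketch (not part of the original module): the host
# names, credentials and proxy below are placeholders.
##
## ua = UserAgent()
## ua.set_handled_schemes(["http", "ftp"])
## ua.set_proxies({"http": "proxy.example.com:3128"})
## ua.add_password("http://example.com/protected/", "joe", "secret")
## ua.set_handle_robots(False)
## ua.set_seekable_responses(True)
## ua.set_debug_http(True)
## response = ua.open("http://example.com/")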

View File

@ -0,0 +1,279 @@
"""Utility functions and date/time routines.
Copyright 2002-2006 John J Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
import re, string, time, warnings
def deprecation(message):
warnings.warn(message, DeprecationWarning, stacklevel=3)
def hide_deprecations():
warnings.filterwarnings('ignore', category=DeprecationWarning)
def reset_deprecations():
warnings.filterwarnings('default', category=DeprecationWarning)
def isstringlike(x):
try: x+""
except: return False
else: return True
## def caller():
## try:
## raise SyntaxError
## except:
## import sys
## return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name
from calendar import timegm
# Date/time conversion routines for formats used by the HTTP protocol.
EPOCH = 1970
def my_timegm(tt):
year, month, mday, hour, min, sec = tt[:6]
if ((year >= EPOCH) and (1 <= month <= 12) and (1 <= mday <= 31) and
(0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
return timegm(tt)
else:
return None
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
months_lower = []
for month in months: months_lower.append(month.lower())
def time2isoz(t=None):
"""Return a string representing time in seconds since epoch, t.
If the function is called without an argument, it will use the current
time.
The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
representing Universal Time (UTC, aka GMT). An example of this format is:
1994-11-24 08:49:37Z
"""
if t is None: t = time.time()
year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
year, mon, mday, hour, min, sec)
def time2netscape(t=None):
"""Return a string representing time in seconds since epoch, t.
If the function is called without an argument, it will use the current
time.
The format of the returned string is like this:
Wed, DD-Mon-YYYY HH:MM:SS GMT
"""
if t is None: t = time.time()
year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
days[wday], mday, months[mon-1], year, hour, min, sec)
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
offset = None
if UTC_ZONES.has_key(tz):
offset = 0
else:
m = timezone_re.search(tz)
if m:
offset = 3600 * int(m.group(2))
if m.group(3):
offset = offset + 60 * int(m.group(3))
if m.group(1) == '-':
offset = -offset
return offset
def _str2time(day, mon, yr, hr, min, sec, tz):
# translate month name to number
# month numbers start with 1 (January)
try:
mon = months_lower.index(mon.lower())+1
except ValueError:
# maybe it's already a number
try:
imon = int(mon)
except ValueError:
return None
if 1 <= imon <= 12:
mon = imon
else:
return None
# make sure clock elements are defined
if hr is None: hr = 0
if min is None: min = 0
if sec is None: sec = 0
yr = int(yr)
day = int(day)
hr = int(hr)
min = int(min)
sec = int(sec)
if yr < 1000:
# find "obvious" year
cur_yr = time.localtime(time.time())[0]
m = cur_yr % 100
tmp = yr
yr = yr + cur_yr - m
m = m - tmp
if abs(m) > 50:
if m > 0: yr = yr + 100
else: yr = yr - 100
# convert UTC time tuple to seconds since epoch (not timezone-adjusted)
t = my_timegm((yr, mon, day, hr, min, sec, tz))
if t is not None:
# adjust time using timezone string, to get absolute time since epoch
if tz is None:
tz = "UTC"
tz = tz.upper()
offset = offset_from_tz_string(tz)
if offset is None:
return None
t = t - offset
return t
strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) (\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
wkday_re = re.compile(
r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
loose_http_re = re.compile(
r"""^
(\d\d?) # day
(?:\s+|[-\/])
(\w+) # month
(?:\s+|[-\/])
(\d+) # year
(?:
(?:\s+|:) # separator before clock
(\d\d?):(\d\d) # hour:min
(?::(\d\d))? # optional seconds
)? # optional clock
\s*
([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
\s*
(?:\(\w+\))? # ASCII representation of timezone in parens.
\s*$""", re.X)
def http2time(text):
"""Returns time in seconds since epoch of time represented by a string.
Return value is an integer.
    None is returned if the format of the string is unrecognized, the time is outside
the representable range, or the timezone string is not recognized. If the
string contains no timezone, UTC is assumed.
The timezone in the string may be numerical (like "-0800" or "+0100") or a
string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the
timezone strings equivalent to UTC (zero offset) are known to the function.
The function loosely parses the following formats:
Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format
Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format
Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format
09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday)
08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday)
08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday)
The parser ignores leading and trailing whitespace. The time may be
absent.
If the year is given with only 2 digits, the function will select the
century that makes the year closest to the current date.
"""
# fast exit for strictly conforming string
m = strict_re.search(text)
if m:
g = m.groups()
mon = months_lower.index(g[1].lower()) + 1
tt = (int(g[2]), mon, int(g[0]),
int(g[3]), int(g[4]), float(g[5]))
return my_timegm(tt)
# No, we need some messy parsing...
# clean up
text = text.lstrip()
text = wkday_re.sub("", text, 1) # Useless weekday
# tz is time zone specifier string
day, mon, yr, hr, min, sec, tz = [None]*7
# loose regexp parse
m = loose_http_re.search(text)
if m is not None:
day, mon, yr, hr, min, sec, tz = m.groups()
else:
return None # bad format
return _str2time(day, mon, yr, hr, min, sec, tz)
iso_re = re.compile(
"""^
(\d{4}) # year
[-\/]?
(\d\d?) # numerical month
[-\/]?
(\d\d?) # day
(?:
(?:\s+|[-:Tt]) # separator before clock
(\d\d?):?(\d\d) # hour:min
(?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional)
)? # optional clock
\s*
([-+]?\d\d?:?(:?\d\d)?
|Z|z)? # timezone (Z is "zero meridian", i.e. GMT)
\s*$""", re.X)
def iso2time(text):
"""
As for http2time, but parses the ISO 8601 formats:
1994-02-03 14:15:29 -0100 -- ISO 8601 format
1994-02-03 14:15:29 -- zone is optional
1994-02-03 -- only date
1994-02-03T14:15:29 -- Use T as separator
19940203T141529Z -- ISO 8601 compact format
19940203 -- only date
"""
# clean up
text = text.lstrip()
# tz is time zone specifier string
day, mon, yr, hr, min, sec, tz = [None]*7
# loose regexp parse
m = iso_re.search(text)
if m is not None:
# XXX there's an extra bit of the timezone I'm ignoring here: is
# this the right thing to do?
yr, mon, day, hr, min, sec, tz, _ = m.groups()
else:
return None # bad format
return _str2time(day, mon, yr, hr, min, sec, tz)
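
# A minimal sketch (not part of the original module): round-tripping the
# date/time helpers defined above.
##
## time2isoz(http2time("Wed, 09 Feb 1994 22:23:32 GMT"))
##     # -> '1994-02-09 22:23:32Z'
## time2isoz(iso2time("1994-02-03T14:15:29Z"))
##     # -> '1994-02-03 14:15:29Z'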