News download: Add support for turning off SSL certificate verification

This commit is contained in:
Kovid Goyal 2015-06-02 09:37:49 +05:30
parent e03e50730d
commit 5fbc95dea0
3 changed files with 39 additions and 6 deletions

View File

@ -377,20 +377,21 @@ def random_user_agent(choose=None):
choose = random.randint(0, len(choices)-1)
return choices[choose]
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, use_robust_parser=False):
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, use_robust_parser=False, verify_ssl_certificates=True):
'''
Create a mechanize browser for web scraping. The browser handles cookies,
refresh requests and ignores robots.txt. Also uses proxy if available.
:param honor_time: If True honors pause time in refresh requests
:param max_time: Maximum time in seconds to wait during a refresh request
:param verify_ssl_certificates: If False, SSL certificate errors are ignored
'''
from calibre.utils.browser import Browser
if use_robust_parser:
import mechanize
opener = Browser(factory=mechanize.RobustFactory())
opener = Browser(factory=mechanize.RobustFactory(), verify_ssl=verify_ssl_certificates)
else:
opener = Browser()
opener = Browser(verify_ssl=verify_ssl_certificates)
opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
opener.set_handle_robots(False)
if user_agent is None:

View File

@ -11,6 +11,9 @@ import httplib
class JobQueueFull(Exception):
    '''
    Exception type that adds nothing to :class:`Exception` beyond its name.

    NOTE(review): presumably raised when a job queue cannot accept more
    work -- confirm against the raise sites.
    '''
class RouteError(ValueError):
    '''
    :class:`ValueError` subclass with no extra state or behaviour, so it is
    also caught by handlers written for ``ValueError``.

    NOTE(review): presumably signals an invalid route -- confirm against the
    raise sites.
    '''
class HTTPSimpleResponse(Exception):
def __init__(self, http_code, http_message='', close_connection=False, location=None):

View File

@ -5,23 +5,51 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import copy
import copy, httplib, ssl
from cookielib import CookieJar
from mechanize import Browser as B
from mechanize import Browser as B, HTTPSHandler
class ModernHTTPSHandler(HTTPSHandler):
    '''
    HTTPS handler that creates its connections with an explicit SSL context.

    The context is read from the ``ssl_context`` attribute every time a
    connection is made, so assigning a different context to the handler
    affects all connections opened afterwards.
    '''

    # Context passed to every HTTPSConnection created by this handler.
    # Stays None until something assigns one to the handler.
    ssl_context = None

    def https_open(self, req):
        '''
        Open ``req`` over HTTPS via ``self.do_open``.

        If a client certificate manager is set and it returns a certificate
        file for the request URL, that certificate and key are loaded into
        ``self.ssl_context`` before connecting.

        :param req: The request object to open
        :return: Whatever ``self.do_open`` returns for the request
        '''
        if self.client_cert_manager is not None:
            key_file, cert_file = self.client_cert_manager.find_key_cert(
                req.get_full_url())
            if cert_file:
                # The certificate is loaded into the handler's own context
                # object, so it remains loaded for later requests as well.
                self.ssl_context.load_cert_chain(cert_file, key_file)

        def conn_factory(hostport, **kwargs):
            # Forward any keyword arguments the caller supplies instead of
            # rejecting them with a TypeError, so that options such as a
            # per-connection timeout are honoured.
            # NOTE(review): mechanize's do_open is believed to pass
            # ``timeout`` -- confirm for the bundled mechanize version.
            kwargs['context'] = self.ssl_context
            return httplib.HTTPSConnection(hostport, **kwargs)

        return self.do_open(conn_factory, req)
class Browser(B):
'''
A cloneable mechanize browser. Useful for multithreading. The idea is that
each thread has a browser clone. Every clone uses the same thread safe
cookie jar. All clones share the same browser configuration.
Also adds support for fine-tuning SSL verification via an SSL context object.
'''
handler_classes = B.handler_classes.copy()
handler_classes['https'] = ModernHTTPSHandler
def __init__(self, *args, **kwargs):
    '''
    Create the browser, give it a fresh cookie jar and attach an SSL
    context to its https handler.

    Two keyword arguments are consumed here; everything else is passed
    unchanged to ``B.__init__``.

    :param ssl_context: Context object to store on the https handler. If
                        omitted or None, one is created according to
                        ``verify_ssl``.
    :param verify_ssl: Defaults to True. Only used when no ``ssl_context``
                       is supplied: if true a default context is created,
                       otherwise an unverified one.
    '''
    self._clone_actions = {}
    # Remove both private options unconditionally. Popping verify_ssl only
    # when no context was supplied would leave it in kwargs whenever the
    # caller passes both, and it would then be forwarded to B.__init__.
    verify_ssl = kwargs.pop('verify_ssl', True)
    sc = kwargs.pop('ssl_context', None)
    if sc is None:
        if verify_ssl:
            sc = ssl.create_default_context()
        else:
            sc = ssl._create_unverified_context(cert_reqs=ssl.CERT_NONE)
    B.__init__(self, *args, **kwargs)
    self.set_cookiejar(CookieJar())
    self._ua_handlers['https'].ssl_context = sc
@property
def https_handler(self):
    '''The handler stored under the ``'https'`` key of ``_ua_handlers``.'''
    handlers = self._ua_handlers
    return handlers['https']
def set_handle_refresh(self, *args, **kwargs):
B.set_handle_refresh(self, *args, **kwargs)
@ -89,6 +117,7 @@ class Browser(B):
def clone_browser(self):
clone = Browser()
clone.https_handler.ssl_context = self.https_handler.ssl_context
clone.addheaders = copy.deepcopy(self.addheaders)
for func, args, kwargs in self._clone_actions.values():
func = getattr(clone, func)
@ -100,7 +129,7 @@ if __name__ == '__main__':
from pprint import pprint
orig = browser()
clone = orig.clone_browser()
pprint( orig._ua_handlers)
pprint(orig._ua_handlers)
pprint(clone._ua_handlers)
assert orig._ua_handlers.keys() == clone._ua_handlers.keys()
assert orig._ua_handlers['_cookies'].cookiejar is \