News download: Add support for turning off SSL certificate verification

This commit is contained in:
Kovid Goyal 2015-06-02 09:37:49 +05:30
parent e03e50730d
commit 5fbc95dea0
3 changed files with 39 additions and 6 deletions

View File

@ -377,20 +377,21 @@ def random_user_agent(choose=None):
choose = random.randint(0, len(choices)-1) choose = random.randint(0, len(choices)-1)
return choices[choose] return choices[choose]
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, use_robust_parser=False): def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, use_robust_parser=False, verify_ssl_certificates=True):
''' '''
Create a mechanize browser for web scraping. The browser handles cookies, Create a mechanize browser for web scraping. The browser handles cookies,
refresh requests and ignores robots.txt. Also uses proxy if available. refresh requests and ignores robots.txt. Also uses proxy if available.
:param honor_time: If True honors pause time in refresh requests :param honor_time: If True honors pause time in refresh requests
:param max_time: Maximum time in seconds to wait during a refresh request :param max_time: Maximum time in seconds to wait during a refresh request
:param verify_ssl_certificates: If False, SSL certificate errors are ignored
''' '''
from calibre.utils.browser import Browser from calibre.utils.browser import Browser
if use_robust_parser: if use_robust_parser:
import mechanize import mechanize
opener = Browser(factory=mechanize.RobustFactory()) opener = Browser(factory=mechanize.RobustFactory(), verify_ssl=verify_ssl_certificates)
else: else:
opener = Browser() opener = Browser(verify_ssl=verify_ssl_certificates)
opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time) opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
opener.set_handle_robots(False) opener.set_handle_robots(False)
if user_agent is None: if user_agent is None:

View File

@ -11,6 +11,9 @@ import httplib
class JobQueueFull(Exception):
    '''
    Marker exception that adds nothing to :class:`Exception`.

    NOTE(review): presumably raised when a job queue cannot accept more
    work; the raising code is not visible here -- confirm against callers.
    '''
class RouteError(ValueError):
    '''
    Marker subclass of :class:`ValueError` for route related failures.

    It adds no behaviour of its own, so ``except ValueError`` handlers
    also catch it.
    '''
class HTTPSimpleResponse(Exception): class HTTPSimpleResponse(Exception):
def __init__(self, http_code, http_message='', close_connection=False, location=None): def __init__(self, http_code, http_message='', close_connection=False, location=None):

View File

@ -5,23 +5,51 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import copy import copy, httplib, ssl
from cookielib import CookieJar from cookielib import CookieJar
from mechanize import Browser as B from mechanize import Browser as B, HTTPSHandler
class ModernHTTPSHandler(HTTPSHandler):
    '''
    HTTPS handler that opens its connections with an explicit SSL context.

    The context is read from :attr:`ssl_context`, which the owner of the
    handler is expected to assign. This lets certificate verification be
    configured per handler.
    '''

    # SSL context passed to every HTTPSConnection made by this handler.
    # None means "nothing assigned yet"; httplib then receives context=None.
    ssl_context = None

    def https_open(self, req):
        '''
        Open ``req`` over HTTPS using :attr:`ssl_context`.

        If a client certificate manager is set and it returns a certificate
        file for the request URL, that certificate (and its key) is loaded
        into the SSL context before connecting.

        :param req: The request to open; only ``get_full_url()`` is used here
        :return: Whatever ``do_open()`` returns for the request
        '''
        if self.client_cert_manager is not None:
            key_file, cert_file = self.client_cert_manager.find_key_cert(
                req.get_full_url())
            if cert_file:
                if self.ssl_context is None:
                    # The class level default is None; without a real
                    # context load_cert_chain() would fail with an
                    # AttributeError, so create a verifying one on demand.
                    self.ssl_context = ssl.create_default_context()
                # NOTE(review): this mutates the context object in place. If
                # the same context is shared with other handlers they will
                # see the loaded certificate too -- confirm that is intended.
                self.ssl_context.load_cert_chain(cert_file, key_file)

        def conn_factory(hostport):
            return httplib.HTTPSConnection(hostport, context=self.ssl_context)

        return self.do_open(conn_factory, req)
class Browser(B): class Browser(B):
''' '''
A cloneable mechanize browser. Useful for multithreading. The idea is that A cloneable mechanize browser. Useful for multithreading. The idea is that
each thread has a browser clone. Every clone uses the same thread safe each thread has a browser clone. Every clone uses the same thread safe
cookie jar. All clones share the same browser configuration. cookie jar. All clones share the same browser configuration.
Also adds support for fine-tuning SSL verification via an SSL context object.
''' '''
# Work on a copy of the base class mapping so that the assignment below does
# not modify B.handler_classes itself.
handler_classes = B.handler_classes.copy()
# Replace the 'https' entry with the SSL context aware handler.
handler_classes['https'] = ModernHTTPSHandler
def __init__(self, *args, **kwargs):
    '''
    Create the browser and attach an SSL context to its https handler.

    Two keyword arguments are consumed here and never forwarded to the
    base class; everything else is passed on to ``B.__init__`` unchanged.

    :param ssl_context: SSL context object to use for https connections.
                        If missing or None a context is created based on
                        ``verify_ssl``
    :param verify_ssl: If True (the default) a default, verifying context
                       is created, otherwise an unverified one. Ignored
                       when ``ssl_context`` is given
    '''
    self._clone_actions = {}
    sc = kwargs.pop('ssl_context', None)
    # Always remove verify_ssl, even when an explicit context makes it
    # irrelevant, so that it is never forwarded to B.__init__ as an
    # unexpected keyword argument.
    verify_ssl = kwargs.pop('verify_ssl', True)
    if sc is None:
        if verify_ssl:
            sc = ssl.create_default_context()
        else:
            sc = ssl._create_unverified_context(cert_reqs=ssl.CERT_NONE)
    B.__init__(self, *args, **kwargs)
    self.set_cookiejar(CookieJar())
    self._ua_handlers['https'].ssl_context = sc
@property
def https_handler(self):
    '''
    The handler object stored under the ``'https'`` key of
    ``self._ua_handlers``.
    '''
    handlers = self._ua_handlers
    return handlers['https']
def set_handle_refresh(self, *args, **kwargs): def set_handle_refresh(self, *args, **kwargs):
B.set_handle_refresh(self, *args, **kwargs) B.set_handle_refresh(self, *args, **kwargs)
@ -89,6 +117,7 @@ class Browser(B):
def clone_browser(self): def clone_browser(self):
clone = Browser() clone = Browser()
clone.https_handler.ssl_context = self.https_handler.ssl_context
clone.addheaders = copy.deepcopy(self.addheaders) clone.addheaders = copy.deepcopy(self.addheaders)
for func, args, kwargs in self._clone_actions.values(): for func, args, kwargs in self._clone_actions.values():
func = getattr(clone, func) func = getattr(clone, func)
@ -100,7 +129,7 @@ if __name__ == '__main__':
from pprint import pprint from pprint import pprint
orig = browser() orig = browser()
clone = orig.clone_browser() clone = orig.clone_browser()
pprint( orig._ua_handlers) pprint(orig._ua_handlers)
pprint(clone._ua_handlers) pprint(clone._ua_handlers)
assert orig._ua_handlers.keys() == clone._ua_handlers.keys() assert orig._ua_handlers.keys() == clone._ua_handlers.keys()
assert orig._ua_handlers['_cookies'].cookiejar is \ assert orig._ua_handlers['_cookies'].cookiejar is \