From 3875cd176ca447d7f8d411836144f27eaa3e69ba Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 23 Jan 2013 12:34:10 +0530 Subject: [PATCH] News download: Add support for logging in to sites that require javascript for their logins. Fixes #1101809 (Private bug) --- recipes/barrons.recipe | 22 ++++++------ src/calibre/utils/browser.py | 4 +++ src/calibre/web/feeds/news.py | 53 +++++++++++++++++++++++++--- src/calibre/web/jsbrowser/browser.py | 10 ++++++ src/calibre/web/jsbrowser/test.py | 27 ++++++++++++++ 5 files changed, 99 insertions(+), 17 deletions(-) diff --git a/recipes/barrons.recipe b/recipes/barrons.recipe index 41ed7e26ec..58c62e20e9 100644 --- a/recipes/barrons.recipe +++ b/recipes/barrons.recipe @@ -28,6 +28,8 @@ class Barrons(BasicNewsRecipe): ## Don't grab articles more than 7 days old oldest_article = 7 + use_javascript_to_login = True + requires_version = (0, 9, 16) extra_css = ''' .datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;} @@ -40,7 +42,7 @@ class Barrons(BasicNewsRecipe): .insettipUnit{font-size: x-small;} ''' remove_tags = [ - dict(name ='div', attrs={'class':['tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}), + dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}), dict(name = 'a', attrs ={'class':'insetClose'}) ] @@ -60,21 +62,17 @@ class Barrons(BasicNewsRecipe): ] ] - def get_browser(self): - br = BasicNewsRecipe.get_browser() - if self.username is not None and self.password is not None: - br.open('http://commerce.barrons.com/auth/login') - br.select_form(nr=0) - br['username'] = self.username - br['password'] = self.password - br.submit() - return br + def javascript_login(self, br, username, password): + br.visit('http://commerce.barrons.com/auth/login') + f = br.select_form(nr=0) + f['username'] = username + f['password'] = password + br.submit(timeout=120) ## Use the print version of a page when available. 
- def print_version(self, url): main, sep, rest = url.rpartition('?') - return main + '#printmode' + return main + '#text.print' def postprocess_html(self, soup, first): diff --git a/src/calibre/utils/browser.py b/src/calibre/utils/browser.py index de21158ed7..fc04044ad3 100644 --- a/src/calibre/utils/browser.py +++ b/src/calibre/utils/browser.py @@ -32,6 +32,10 @@ class Browser(B): B.set_cookiejar(self, *args, **kwargs) self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs) + def copy_cookies_from_jsbrowser(self, jsbrowser): + for cookie in jsbrowser.cookies: + self.cookiejar.set_cookie(cookie) + @property def cookiejar(self): return self._clone_actions['set_cookiejar'][1][0] diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 14834ff88c..22901f3ccc 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -332,6 +332,12 @@ class BasicNewsRecipe(Recipe): #: ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = None + #: If you set this True, then calibre will use javascript to login to the + #: website. This is needed for some websites that require the use of + #: javascript to login. If you set this to True you must implement the + #: :meth:`javascript_login` method, to do the actual logging in. + use_javascript_to_login = False + # See the built-in profiles for examples of these settings. def short_title(self): @@ -404,8 +410,7 @@ class BasicNewsRecipe(Recipe): ''' return url - @classmethod - def get_browser(cls, *args, **kwargs): + def get_browser(self, *args, **kwargs): ''' Return a browser instance used to fetch documents from the web. 
By default it returns a `mechanize `_ @@ -427,9 +432,47 @@ class BasicNewsRecipe(Recipe): return br ''' - br = browser(*args, **kwargs) - br.addheaders += [('Accept', '*/*')] - return br + if self.use_javascript_to_login: + if getattr(self, 'browser', None) is not None: + return self.clone_browser(self.browser) + from calibre.web.jsbrowser.browser import Browser + br = Browser() + with br: + self.javascript_login(br, self.username, self.password) + kwargs['user_agent'] = br.user_agent + ans = browser(*args, **kwargs) + ans.copy_cookies_from_jsbrowser(br) + return ans + else: + br = browser(*args, **kwargs) + br.addheaders += [('Accept', '*/*')] + return br + + def javascript_login(self, browser, username, password): + ''' + This method is used to login to a website that uses javascript for its + login form. After the login is complete, the cookies returned from the + website are copied to a normal (non-javascript) browser and the + download proceeds using those cookies. + + An example implementation:: + + def javascript_login(self, browser, username, password): + browser.visit('http://some-page-that-has-a-login') + form = browser.select_form(nr=0) # Select the first form on the page + form['username'] = username + form['password'] = password + browser.submit(timeout=120) # Submit the form and wait at most two minutes for loading to complete + + Note that you can also select forms with CSS2 selectors, like this:: + + browser.select_form('form#login_form') + browser.select_form('form[name="someform"]') + + ''' + raise NotImplementedError('You must implement the javascript_login()' + ' method if you set use_javascript_to_login' + ' to True') def clone_browser(self, br): ''' diff --git a/src/calibre/web/jsbrowser/browser.py b/src/calibre/web/jsbrowser/browser.py index dd87b000a7..d8f0e79bc4 100644 --- a/src/calibre/web/jsbrowser/browser.py +++ b/src/calibre/web/jsbrowser/browser.py @@ -303,6 +303,10 @@ class Browser(QObject, FormsMixin): self.nam = 
NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self) self.page.setNetworkAccessManager(self.nam) + @property + def user_agent(self): + return self.page.user_agent + def _wait_for_load(self, timeout, url=None): loop = QEventLoop(self) start_time = time.time() @@ -422,3 +426,9 @@ class Browser(QObject, FormsMixin): pass self.nam = self.page = None + def __enter__(self): + pass + + def __exit__(self, *args): + self.close() + diff --git a/src/calibre/web/jsbrowser/test.py b/src/calibre/web/jsbrowser/test.py index 8527f3ec92..6f18d7b850 100644 --- a/src/calibre/web/jsbrowser/test.py +++ b/src/calibre/web/jsbrowser/test.py @@ -11,6 +11,7 @@ import unittest, pprint, threading, time import cherrypy +from calibre import browser from calibre.web.jsbrowser.browser import Browser from calibre.library.server.utils import (cookie_max_age_to_expires, cookie_time_fmt) @@ -105,6 +106,12 @@ class Server(object): import traceback traceback.print_exc() + @cherrypy.expose + def receive_cookies(self): + self.received_cookies = {n:(c.value, dict(c)) for n, c in + dict(cherrypy.request.cookie).iteritems()} + return pprint.pformat(self.received_cookies) + class Test(unittest.TestCase): @classmethod @@ -202,6 +209,26 @@ class Test(unittest.TestCase): if fexp: self.assertEqual(fexp, cexp) + def test_cookie_copy(self): + 'Test copying of cookies from jsbrowser to mechanize' + self.assertEqual(self.browser.visit('http://127.0.0.1:%d/cookies'%self.port), + True) + sent_cookies = self.server.sent_cookies.copy() + self.browser.visit('http://127.0.0.1:%d/receive_cookies'%self.port) + orig_rc = self.server.received_cookies.copy() + br = browser(user_agent=self.browser.user_agent) + br.copy_cookies_from_jsbrowser(self.browser) + br.open('http://127.0.0.1:%d/receive_cookies'%self.port) + for name, vals in sent_cookies.iteritems(): + val = vals[0] + try: + rval = self.server.received_cookies[name][0] + except: + self.fail('The cookie: %s was not received by the server') + 
self.assertEqual(val, rval, + 'The received value for the cookie: %s, %s != %s'%( + name, rval, val)) + self.assertEqual(orig_rc, self.server.received_cookies) def tests(): return unittest.TestLoader().loadTestsFromTestCase(Test)