News download: Add support for logging in to sites that require javascript for their logins. Fixes #1101809 (Private bug)

This commit is contained in:
Kovid Goyal 2013-01-23 12:34:10 +05:30
parent 771856c83e
commit 3875cd176c
5 changed files with 99 additions and 17 deletions

View File

@ -28,6 +28,8 @@ class Barrons(BasicNewsRecipe):
## Don't grab articles more than 7 days old ## Don't grab articles more than 7 days old
oldest_article = 7 oldest_article = 7
use_javascript_to_login = True
requires_version = (0, 9, 16)
extra_css = ''' extra_css = '''
.datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;} .datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
@ -40,7 +42,7 @@ class Barrons(BasicNewsRecipe):
.insettipUnit{font-size: x-small;} .insettipUnit{font-size: x-small;}
''' '''
remove_tags = [ remove_tags = [
dict(name ='div', attrs={'class':['tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}), dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
dict(name = 'a', attrs ={'class':'insetClose'}) dict(name = 'a', attrs ={'class':'insetClose'})
] ]
@ -60,21 +62,17 @@ class Barrons(BasicNewsRecipe):
] ]
] ]
def get_browser(self): def javascript_login(self, br, username, password):
br = BasicNewsRecipe.get_browser() br.visit('http://commerce.barrons.com/auth/login')
if self.username is not None and self.password is not None: f = br.select_form(nr=0)
br.open('http://commerce.barrons.com/auth/login') f['username'] = username
br.select_form(nr=0) f['password'] = password
br['username'] = self.username br.submit(timeout=120)
br['password'] = self.password
br.submit()
return br
## Use the print version of a page when available. ## Use the print version of a page when available.
def print_version(self, url): def print_version(self, url):
main, sep, rest = url.rpartition('?') main, sep, rest = url.rpartition('?')
return main + '#printmode' return main + '#text.print'
def postprocess_html(self, soup, first): def postprocess_html(self, soup, first):

View File

@ -32,6 +32,10 @@ class Browser(B):
B.set_cookiejar(self, *args, **kwargs) B.set_cookiejar(self, *args, **kwargs)
self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs) self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
def copy_cookies_from_jsbrowser(self, jsbrowser):
    '''
    Copy every cookie held by ``jsbrowser`` into this browser's cookie jar.

    :param jsbrowser: Object whose ``cookies`` attribute is iterated; each
                      item is handed unchanged to
                      ``self.cookiejar.set_cookie()``.
    '''
    for js_cookie in jsbrowser.cookies:
        # The jar is looked up per cookie, so nothing is touched on this
        # browser when there are no cookies to copy.
        jar = self.cookiejar
        jar.set_cookie(js_cookie)
@property @property
def cookiejar(self): def cookiejar(self):
return self._clone_actions['set_cookiejar'][1][0] return self._clone_actions['set_cookiejar'][1][0]

View File

@ -332,6 +332,12 @@ class BasicNewsRecipe(Recipe):
#: ignore_duplicate_articles = {'title', 'url'} #: ignore_duplicate_articles = {'title', 'url'}
ignore_duplicate_articles = None ignore_duplicate_articles = None
#: If you set this to True, then calibre will use javascript to login to the
#: website. This is needed for some websites that require the use of
#: javascript to login. If you set this to True you must implement the
#: :meth:`javascript_login` method, to do the actual logging in.
use_javascript_to_login = False
# See the built-in profiles for examples of these settings. # See the built-in profiles for examples of these settings.
def short_title(self): def short_title(self):
@ -404,8 +410,7 @@ class BasicNewsRecipe(Recipe):
''' '''
return url return url
@classmethod def get_browser(self, *args, **kwargs):
def get_browser(cls, *args, **kwargs):
''' '''
Return a browser instance used to fetch documents from the web. By default Return a browser instance used to fetch documents from the web. By default
it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_ it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_
@ -427,10 +432,48 @@ class BasicNewsRecipe(Recipe):
return br return br
''' '''
if self.use_javascript_to_login:
if getattr(self, 'browser', None) is not None:
return self.clone_browser(self.browser)
from calibre.web.jsbrowser.browser import Browser
br = Browser()
with br:
self.javascript_login(br, self.username, self.password)
kwargs['user_agent'] = br.user_agent
ans = browser(*args, **kwargs)
ans.copy_cookies_from_jsbrowser(br)
return ans
else:
br = browser(*args, **kwargs) br = browser(*args, **kwargs)
br.addheaders += [('Accept', '*/*')] br.addheaders += [('Accept', '*/*')]
return br return br
def javascript_login(self, browser, username, password):
    '''
    This method is used to login to a website that uses javascript for its
    login form. After the login is complete, the cookies returned from the
    website are copied to a normal (non-javascript) browser and the
    download proceeds using those cookies.

    An example implementation::

        def javascript_login(self, browser, username, password):
            browser.visit('http://some-page-that-has-a-login')
            form = browser.select_form(nr=0) # Select the first form on the page
            form['username'] = username
            form['password'] = password
            browser.submit(timeout=120) # Submit the form and wait at most two minutes for loading to complete

    Note that you can also select forms with CSS2 selectors, like this::

        browser.select_form('form#login_form')
        browser.select_form('form[name="someform"]')

    :param browser: The javascript capable browser to perform the login with.
    :param username: The username to log in with.
    :param password: The password to log in with.
    :raises NotImplementedError: Always, in this default implementation;
        recipes that set ``use_javascript_to_login`` must override it.
    '''
    raise NotImplementedError('You must implement the javascript_login()'
                              ' method if you set use_javascript_to_login'
                              ' to True')
def clone_browser(self, br): def clone_browser(self, br):
''' '''
Clone the browser br. Cloned browsers are used for multi-threaded Clone the browser br. Cloned browsers are used for multi-threaded

View File

@ -303,6 +303,10 @@ class Browser(QObject, FormsMixin):
self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self) self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self)
self.page.setNetworkAccessManager(self.nam) self.page.setNetworkAccessManager(self.nam)
@property
def user_agent(self):
    '''The ``user_agent`` value exposed by this browser's page object.'''
    page = self.page
    return page.user_agent
def _wait_for_load(self, timeout, url=None): def _wait_for_load(self, timeout, url=None):
loop = QEventLoop(self) loop = QEventLoop(self)
start_time = time.time() start_time = time.time()
@ -422,3 +426,9 @@ class Browser(QObject, FormsMixin):
pass pass
self.nam = self.page = None self.nam = self.page = None
def __enter__(self):
    '''
    Enter the runtime context of this browser.

    Returns the browser itself so that ``with Browser() as br:`` binds the
    browser rather than ``None``. Code that uses the plain ``with br:`` form
    is unaffected.
    '''
    return self
def __exit__(self, *args):
    '''
    Leave the runtime context by closing this browser.

    The exception details passed in ``args`` are ignored and ``None`` is
    returned, so an exception raised inside the ``with`` block still
    propagates after the browser has been closed.
    '''
    self.close()

View File

@ -11,6 +11,7 @@ import unittest, pprint, threading, time
import cherrypy import cherrypy
from calibre import browser
from calibre.web.jsbrowser.browser import Browser from calibre.web.jsbrowser.browser import Browser
from calibre.library.server.utils import (cookie_max_age_to_expires, from calibre.library.server.utils import (cookie_max_age_to_expires,
cookie_time_fmt) cookie_time_fmt)
@ -105,6 +106,12 @@ class Server(object):
import traceback import traceback
traceback.print_exc() traceback.print_exc()
@cherrypy.expose
def receive_cookies(self):
    '''
    Record the cookies sent with the current request and echo them back.

    Each cookie is stored in ``self.received_cookies`` under its name as a
    ``(value, attributes_dict)`` tuple; the pretty-printed mapping is
    returned as the response body.
    '''
    incoming = dict(cherrypy.request.cookie)
    received = {}
    for cookie_name, cookie in incoming.iteritems():
        received[cookie_name] = (cookie.value, dict(cookie))
    self.received_cookies = received
    return pprint.pformat(received)
class Test(unittest.TestCase): class Test(unittest.TestCase):
@classmethod @classmethod
@ -202,6 +209,26 @@ class Test(unittest.TestCase):
if fexp: if fexp:
self.assertEqual(fexp, cexp) self.assertEqual(fexp, cexp)
def test_cookie_copy(self):
    'Test copying of cookies from jsbrowser to mechanize'
    # Load the page that makes the server send its cookies to the
    # javascript browser.
    self.assertEqual(self.browser.visit('http://127.0.0.1:%d/cookies'%self.port),
            True)
    sent_cookies = self.server.sent_cookies.copy()
    # Baseline: what the server gets back from the javascript browser itself.
    self.browser.visit('http://127.0.0.1:%d/receive_cookies'%self.port)
    orig_rc = self.server.received_cookies.copy()
    # Repeat the request from a non-javascript browser that has been given
    # the javascript browser's cookies.
    br = browser(user_agent=self.browser.user_agent)
    br.copy_cookies_from_jsbrowser(self.browser)
    br.open('http://127.0.0.1:%d/receive_cookies'%self.port)
    for name, vals in sent_cookies.iteritems():
        val = vals[0]
        try:
            rval = self.server.received_cookies[name][0]
        except KeyError:
            # Name the missing cookie in the failure message.
            self.fail('The cookie: %s was not received by the server' % name)
        self.assertEqual(val, rval,
            'The received value for the cookie: %s, %s != %s'%(
                name, rval, val))
    # Both browsers must have presented exactly the same cookies.
    self.assertEqual(orig_rc, self.server.received_cookies)
def tests(): def tests():
return unittest.TestLoader().loadTestsFromTestCase(Test) return unittest.TestLoader().loadTestsFromTestCase(Test)