News download: Add support for logging in to sites that require javascript for their logins. Fixes #1101809 (Private bug)

This commit is contained in:
Kovid Goyal 2013-01-23 12:34:10 +05:30
parent 771856c83e
commit 3875cd176c
5 changed files with 99 additions and 17 deletions

View File

@ -28,6 +28,8 @@ class Barrons(BasicNewsRecipe):
## Don't grab articles more than 7 days old
oldest_article = 7
use_javascript_to_login = True
requires_version = (0, 9, 16)
extra_css = '''
.datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
@ -40,7 +42,7 @@ class Barrons(BasicNewsRecipe):
.insettipUnit{font-size: x-small;}
'''
remove_tags = [
dict(name ='div', attrs={'class':['tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
dict(name = 'a', attrs ={'class':'insetClose'})
]
@ -60,21 +62,17 @@ class Barrons(BasicNewsRecipe):
]
]
# Build a browser and, when both a username and a password are configured,
# log in to Barrons through the plain (non-javascript) login form before
# returning it. Without credentials the browser is returned as created.
# NOTE(review): this listing also contains javascript_login() directly after
# this method and sets use_javascript_to_login = True on the class, so this
# looks like the superseded login path -- confirm before relying on it.
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://commerce.barrons.com/auth/login')
# nr=0 selects the first form on the login page
br.select_form(nr=0)
br['username'] = self.username
br['password'] = self.password
br.submit()
return br
def javascript_login(self, br, username, password):
    '''
    Log in to Barrons using the javascript capable browser ``br``.

    The login page is visited, ``username`` and ``password`` are filled
    into the first form on that page and the form is submitted with a
    timeout of 120 seconds.
    '''
    br.visit('http://commerce.barrons.com/auth/login')
    login_form = br.select_form(nr=0)
    # Fill the fields in the same order as before: username, then password
    for field, value in (('username', username), ('password', password)):
        login_form[field] = value
    br.submit(timeout=120)
## Use the print version of a page when available.
def print_version(self, url):
# Map an article url to its print view: keep the part of the url before the
# last '?' and append a fragment to it.
# NOTE(review): str.rpartition puts the whole string in `rest` and leaves
# `main` empty when url contains no '?', so such a url yields only the
# fragment -- confirm that every article url carries a query string.
main, sep, rest = url.rpartition('?')
return main + '#printmode'
# NOTE(review): the statement below can never run as written because of the
# return above. The two returns look like the old and new variants of the
# same line -- confirm which fragment is intended and drop the other.
return main + '#text.print'
def postprocess_html(self, soup, first):

View File

@ -32,6 +32,10 @@ class Browser(B):
B.set_cookiejar(self, *args, **kwargs)
self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
def copy_cookies_from_jsbrowser(self, jsbrowser):
    '''
    Add every cookie exposed by ``jsbrowser.cookies`` to the cookie jar of
    this browser, in iteration order.
    '''
    for js_cookie in jsbrowser.cookies:
        # The jar is looked up for each cookie, exactly as before, so
        # nothing is touched on this browser when there are no cookies.
        jar = self.cookiejar
        jar.set_cookie(js_cookie)
@property
def cookiejar(self):
    '''
    The cookie jar that was handed to set_cookiejar(): the first positional
    argument stored in the recorded 'set_cookiejar' clone action.
    '''
    recorded_action = self._clone_actions['set_cookiejar']
    positional_args = recorded_action[1]
    return positional_args[0]

View File

@ -332,6 +332,12 @@ class BasicNewsRecipe(Recipe):
#: ignore_duplicate_articles = {'title', 'url'}
ignore_duplicate_articles = None
#: If you set this True, then calibre will use javascript to login to the
#: website. This is needed for some websites that require the use of
#: javascript to login. If you set this to True you must implement the
#: :meth:`javascript_login` method, to do the actual logging in.
use_javascript_to_login = False
# See the built-in profiles for examples of these settings.
def short_title(self):
@ -404,8 +410,7 @@ class BasicNewsRecipe(Recipe):
'''
return url
@classmethod
def get_browser(cls, *args, **kwargs):
def get_browser(self, *args, **kwargs):
'''
Return a browser instance used to fetch documents from the web. By default
it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_
@ -427,10 +432,48 @@ class BasicNewsRecipe(Recipe):
return br
'''
if self.use_javascript_to_login:
if getattr(self, 'browser', None) is not None:
return self.clone_browser(self.browser)
from calibre.web.jsbrowser.browser import Browser
br = Browser()
with br:
self.javascript_login(br, self.username, self.password)
kwargs['user_agent'] = br.user_agent
ans = browser(*args, **kwargs)
ans.copy_cookies_from_jsbrowser(br)
return ans
else:
br = browser(*args, **kwargs)
br.addheaders += [('Accept', '*/*')]
return br
def javascript_login(self, browser, username, password):
    '''
    This method is used to login to a website that uses javascript for its
    login form. After the login is complete, the cookies returned from the
    website are copied to a normal (non-javascript) browser and the
    download proceeds using those cookies.

    An example implementation::

        def javascript_login(self, browser, username, password):
            browser.visit('http://some-page-that-has-a-login')
            form = browser.select_form(nr=0) # Select the first form on the page
            form['username'] = username
            form['password'] = password
            browser.submit(timeout=120) # Submit the form and wait at most two minutes for loading to complete

    Note that you can also select forms with CSS2 selectors, like this::

        browser.select_form('form#login_form')
        browser.select_form('form[name="someform"]')

    :param browser: The javascript capable browser to perform the login with
    :param username: The username to log in with
    :param password: The password to log in with
    :raises NotImplementedError: Always, in this base implementation.
        Recipes that set use_javascript_to_login to True must override it.
    '''
    raise NotImplementedError('You must implement the javascript_login()'
                              ' method if you set use_javascript_to_login'
                              ' to True')
def clone_browser(self, br):
'''
Clone the browser br. Cloned browsers are used for multi-threaded

View File

@ -303,6 +303,10 @@ class Browser(QObject, FormsMixin):
self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self)
self.page.setNetworkAccessManager(self.nam)
@property
def user_agent(self):
    '''The user agent string reported by the page of this browser.'''
    current_page = self.page
    return current_page.user_agent
def _wait_for_load(self, timeout, url=None):
loop = QEventLoop(self)
start_time = time.time()
@ -422,3 +426,9 @@ class Browser(QObject, FormsMixin):
pass
self.nam = self.page = None
def __enter__(self):
    '''
    Enter the context manager.

    Returns this browser so that ``with Browser() as br:`` binds the
    browser itself rather than None. Code that uses the plain
    ``with br:`` form is unaffected.
    '''
    return self
def __exit__(self, *args):
    '''
    Leave the context manager by closing this browser.

    The exception details passed in ``args`` are not inspected and None is
    returned, so an exception raised inside the with block propagates.
    '''
    self.close()
    return None

View File

@ -11,6 +11,7 @@ import unittest, pprint, threading, time
import cherrypy
from calibre import browser
from calibre.web.jsbrowser.browser import Browser
from calibre.library.server.utils import (cookie_max_age_to_expires,
cookie_time_fmt)
@ -105,6 +106,12 @@ class Server(object):
import traceback
traceback.print_exc()
@cherrypy.expose
def receive_cookies(self):
    '''
    Record the cookies sent with the current request in
    self.received_cookies, as a map of cookie name to a
    (value, dict-of-the-cookie-object) tuple, and return that map pretty
    printed.
    '''
    received = {}
    for name, cookie in dict(cherrypy.request.cookie).iteritems():
        received[name] = (cookie.value, dict(cookie))
    # Assign only once the whole map has been built, as before
    self.received_cookies = received
    return pprint.pformat(self.received_cookies)
class Test(unittest.TestCase):
@classmethod
@ -202,6 +209,26 @@ class Test(unittest.TestCase):
if fexp:
self.assertEqual(fexp, cexp)
def test_cookie_copy(self):
    'Test copying of cookies from jsbrowser to mechanize'
    # Have the server hand cookies to the javascript browser
    self.assertEqual(self.browser.visit('http://127.0.0.1:%d/cookies'%self.port),
            True)
    sent_cookies = self.server.sent_cookies.copy()
    # Record what the server receives back from the javascript browser
    self.browser.visit('http://127.0.0.1:%d/receive_cookies'%self.port)
    orig_rc = self.server.received_cookies.copy()
    # Copy the cookies into a mechanize browser and repeat the request
    br = browser(user_agent=self.browser.user_agent)
    br.copy_cookies_from_jsbrowser(self.browser)
    br.open('http://127.0.0.1:%d/receive_cookies'%self.port)
    for name, vals in sent_cookies.iteritems():
        val = vals[0]
        try:
            rval = self.server.received_cookies[name][0]
        except KeyError:
            # Only a missing cookie is reported as this failure, and the
            # message now actually names the cookie
            self.fail('The cookie: %s was not received by the server' % name)
        self.assertEqual(val, rval,
            'The received value for the cookie: %s, %s != %s'%(
                name, rval, val))
    # Both browsers must have sent the server the very same cookies
    self.assertEqual(orig_rc, self.server.received_cookies)
def tests():
    '''Return a test suite holding every test defined on the Test class.'''
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(Test)