mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
News download: Add support for logging in to sites that require javascript for their logins. Fixes #1101809 (Private bug)
This commit is contained in:
parent
771856c83e
commit
3875cd176c
@ -28,6 +28,8 @@ class Barrons(BasicNewsRecipe):
|
||||
|
||||
## Don't grab articles more than 7 days old
|
||||
oldest_article = 7
|
||||
use_javascript_to_login = True
|
||||
requires_version = (0, 9, 16)
|
||||
|
||||
extra_css = '''
|
||||
.datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
|
||||
@ -40,7 +42,7 @@ class Barrons(BasicNewsRecipe):
|
||||
.insettipUnit{font-size: x-small;}
|
||||
'''
|
||||
remove_tags = [
|
||||
dict(name ='div', attrs={'class':['tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
|
||||
dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
|
||||
dict(name = 'a', attrs ={'class':'insetClose'})
|
||||
]
|
||||
|
||||
@ -60,21 +62,17 @@ class Barrons(BasicNewsRecipe):
|
||||
]
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
if self.username is not None and self.password is not None:
|
||||
br.open('http://commerce.barrons.com/auth/login')
|
||||
br.select_form(nr=0)
|
||||
br['username'] = self.username
|
||||
br['password'] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
def javascript_login(self, br, username, password):
    '''
    Log in to Barron's using the javascript-capable browser ``br``.

    Loads the login page, fills in the ``username`` and ``password`` fields
    of the first form on that page and submits it with a 120 second timeout.
    '''
    login_url = 'http://commerce.barrons.com/auth/login'
    br.visit(login_url)
    login_form = br.select_form(nr=0)
    for field, value in (('username', username), ('password', password)):
        login_form[field] = value
    br.submit(timeout=120)
|
||||
|
||||
## Use the print version of a page when available.
|
||||
|
||||
def print_version(self, url):
|
||||
main, sep, rest = url.rpartition('?')
|
||||
return main + '#printmode'
|
||||
return main + '#text.print'
|
||||
|
||||
def postprocess_html(self, soup, first):
|
||||
|
||||
|
@ -32,6 +32,10 @@ class Browser(B):
|
||||
B.set_cookiejar(self, *args, **kwargs)
|
||||
self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
|
||||
|
||||
def copy_cookies_from_jsbrowser(self, jsbrowser):
    '''
    Add every cookie exposed by ``jsbrowser.cookies`` to this browser's
    cookie jar.
    '''
    for js_cookie in jsbrowser.cookies:
        # Look the jar up per cookie, so nothing is touched when there are
        # no cookies to copy.
        jar = self.cookiejar
        jar.set_cookie(js_cookie)
|
||||
|
||||
@property
def cookiejar(self):
    '''
    The cookie jar most recently passed to :meth:`set_cookiejar`, read back
    from the recorded clone action.
    '''
    recorded_call = self._clone_actions['set_cookiejar']
    positional_args = recorded_call[1]
    return positional_args[0]
|
||||
|
@ -332,6 +332,12 @@ class BasicNewsRecipe(Recipe):
|
||||
#: ignore_duplicate_articles = {'title', 'url'}
|
||||
ignore_duplicate_articles = None
|
||||
|
||||
#: If you set this to True, then calibre will use javascript to login to the
|
||||
#: website. This is needed for some websites that require the use of
|
||||
#: javascript to login. If you set this to True you must implement the
|
||||
#: :meth:`javascript_login` method, to do the actual logging in.
|
||||
use_javascript_to_login = False
|
||||
|
||||
# See the built-in profiles for examples of these settings.
|
||||
|
||||
def short_title(self):
|
||||
@ -404,8 +410,7 @@ class BasicNewsRecipe(Recipe):
|
||||
'''
|
||||
return url
|
||||
|
||||
@classmethod
|
||||
def get_browser(cls, *args, **kwargs):
|
||||
def get_browser(self, *args, **kwargs):
|
||||
'''
|
||||
Return a browser instance used to fetch documents from the web. By default
|
||||
it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_
|
||||
@ -427,9 +432,47 @@ class BasicNewsRecipe(Recipe):
|
||||
return br
|
||||
|
||||
'''
|
||||
br = browser(*args, **kwargs)
|
||||
br.addheaders += [('Accept', '*/*')]
|
||||
return br
|
||||
if self.use_javascript_to_login:
|
||||
if getattr(self, 'browser', None) is not None:
|
||||
return self.clone_browser(self.browser)
|
||||
from calibre.web.jsbrowser.browser import Browser
|
||||
br = Browser()
|
||||
with br:
|
||||
self.javascript_login(br, self.username, self.password)
|
||||
kwargs['user_agent'] = br.user_agent
|
||||
ans = browser(*args, **kwargs)
|
||||
ans.copy_cookies_from_jsbrowser(br)
|
||||
return ans
|
||||
else:
|
||||
br = browser(*args, **kwargs)
|
||||
br.addheaders += [('Accept', '*/*')]
|
||||
return br
|
||||
|
||||
def javascript_login(self, browser, username, password):
    '''
    This method is used to login to a website that uses javascript for its
    login form. After the login is complete, the cookies returned from the
    website are copied to a normal (non-javascript) browser and the
    download proceeds using those cookies.

    An example implementation::

        def javascript_login(self, browser, username, password):
            browser.visit('http://some-page-that-has-a-login')
            form = browser.select_form(nr=0) # Select the first form on the page
            form['username'] = username
            form['password'] = password
            browser.submit(timeout=120) # Submit the form and wait at most two minutes for loading to complete

    Note that you can also select forms with CSS2 selectors, like this::

        browser.select_form('form#login_form')
        browser.select_form('form[name="someform"]')

    :param browser: The javascript capable browser to perform the login with
    :param username: The username to login with
    :param password: The password to login with
    :raises NotImplementedError: Always, in this default implementation.
        Recipes that set use_javascript_to_login to True must override
        this method.
    '''
    raise NotImplementedError('You must implement the javascript_login()'
                              ' method if you set use_javascript_to_login'
                              ' to True')
|
||||
|
||||
def clone_browser(self, br):
|
||||
'''
|
||||
|
@ -303,6 +303,10 @@ class Browser(QObject, FormsMixin):
|
||||
self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self)
|
||||
self.page.setNetworkAccessManager(self.nam)
|
||||
|
||||
@property
def user_agent(self):
    'The ``user_agent`` value of the page object this browser wraps'
    page = self.page
    return page.user_agent
|
||||
|
||||
def _wait_for_load(self, timeout, url=None):
|
||||
loop = QEventLoop(self)
|
||||
start_time = time.time()
|
||||
@ -422,3 +426,9 @@ class Browser(QObject, FormsMixin):
|
||||
pass
|
||||
self.nam = self.page = None
|
||||
|
||||
def __enter__(self):
    '''
    Enter a ``with`` block.

    Returns the browser itself so that ``with Browser() as br:`` binds
    ``br`` to the browser instead of to None.
    '''
    return self
|
||||
|
||||
def __exit__(self, *args):
    '''
    Leave a ``with`` block by closing the browser. Nothing is returned, so
    an exception raised inside the block is not suppressed.
    '''
    self.close()
|
||||
|
||||
|
@ -11,6 +11,7 @@ import unittest, pprint, threading, time
|
||||
|
||||
import cherrypy
|
||||
|
||||
from calibre import browser
|
||||
from calibre.web.jsbrowser.browser import Browser
|
||||
from calibre.library.server.utils import (cookie_max_age_to_expires,
|
||||
cookie_time_fmt)
|
||||
@ -105,6 +106,12 @@ class Server(object):
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
@cherrypy.expose
def receive_cookies(self):
    '''
    Record the cookies that came with the current request in
    ``self.received_cookies``, as a mapping of cookie name to a
    ``(value, attributes dict)`` pair, and return that mapping
    pretty-printed.
    '''
    incoming = dict(cherrypy.request.cookie)
    received = {}
    for cookie_name, cookie in incoming.iteritems():
        received[cookie_name] = (cookie.value, dict(cookie))
    self.received_cookies = received
    return pprint.pformat(self.received_cookies)
|
||||
|
||||
class Test(unittest.TestCase):
|
||||
|
||||
@classmethod
|
||||
@ -202,6 +209,26 @@ class Test(unittest.TestCase):
|
||||
if fexp:
|
||||
self.assertEqual(fexp, cexp)
|
||||
|
||||
def test_cookie_copy(self):
    'Test copying of cookies from jsbrowser to mechanize'
    self.assertEqual(self.browser.visit('http://127.0.0.1:%d/cookies'%self.port),
            True)
    sent_cookies = self.server.sent_cookies.copy()
    # Have the jsbrowser send its cookies back, to record what the server
    # receives from it directly.
    self.browser.visit('http://127.0.0.1:%d/receive_cookies'%self.port)
    orig_rc = self.server.received_cookies.copy()
    # Now repeat the request from a browser that only has the copied cookies
    br = browser(user_agent=self.browser.user_agent)
    br.copy_cookies_from_jsbrowser(self.browser)
    br.open('http://127.0.0.1:%d/receive_cookies'%self.port)
    for name, vals in sent_cookies.iteritems():
        val = vals[0]
        try:
            rval = self.server.received_cookies[name][0]
        except Exception:
            # Name the missing cookie in the failure message
            self.fail('The cookie: %s was not received by the server'%name)
        self.assertEqual(val, rval,
                'The received value for the cookie: %s, %s != %s'%(
                    name, rval, val))
    # Both browsers must have delivered exactly the same cookies
    self.assertEqual(orig_rc, self.server.received_cookies)
|
||||
|
||||
def tests():
    'Build the test suite containing every test method of :class:`Test`'
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(Test)
|
||||
|
Loading…
x
Reference in New Issue
Block a user