mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
News download: Add support for logging in to sites that require javascript for their logins. Fixes #1101809 (Private bug)
This commit is contained in:
parent
771856c83e
commit
3875cd176c
@ -28,6 +28,8 @@ class Barrons(BasicNewsRecipe):
|
|||||||
|
|
||||||
## Don't grab articles more than 7 days old
|
## Don't grab articles more than 7 days old
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
|
use_javascript_to_login = True
|
||||||
|
requires_version = (0, 9, 16)
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
.datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
|
.datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
|
||||||
@ -40,7 +42,7 @@ class Barrons(BasicNewsRecipe):
|
|||||||
.insettipUnit{font-size: x-small;}
|
.insettipUnit{font-size: x-small;}
|
||||||
'''
|
'''
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name ='div', attrs={'class':['tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
|
dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
|
||||||
dict(name = 'a', attrs ={'class':'insetClose'})
|
dict(name = 'a', attrs ={'class':'insetClose'})
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -60,21 +62,17 @@ class Barrons(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_browser(self):
|
def javascript_login(self, br, username, password):
|
||||||
br = BasicNewsRecipe.get_browser()
|
br.visit('http://commerce.barrons.com/auth/login')
|
||||||
if self.username is not None and self.password is not None:
|
f = br.select_form(nr=0)
|
||||||
br.open('http://commerce.barrons.com/auth/login')
|
f['username'] = username
|
||||||
br.select_form(nr=0)
|
f['password'] = password
|
||||||
br['username'] = self.username
|
br.submit(timeout=120)
|
||||||
br['password'] = self.password
|
|
||||||
br.submit()
|
|
||||||
return br
|
|
||||||
|
|
||||||
## Use the print version of a page when available.
|
## Use the print version of a page when available.
|
||||||
|
|
||||||
def print_version(self, url):
|
def print_version(self, url):
|
||||||
main, sep, rest = url.rpartition('?')
|
main, sep, rest = url.rpartition('?')
|
||||||
return main + '#printmode'
|
return main + '#text.print'
|
||||||
|
|
||||||
def postprocess_html(self, soup, first):
|
def postprocess_html(self, soup, first):
|
||||||
|
|
||||||
|
@ -32,6 +32,10 @@ class Browser(B):
|
|||||||
B.set_cookiejar(self, *args, **kwargs)
|
B.set_cookiejar(self, *args, **kwargs)
|
||||||
self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
|
self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
|
||||||
|
|
||||||
|
def copy_cookies_from_jsbrowser(self, jsbrowser):
|
||||||
|
for cookie in jsbrowser.cookies:
|
||||||
|
self.cookiejar.set_cookie(cookie)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def cookiejar(self):
|
def cookiejar(self):
|
||||||
return self._clone_actions['set_cookiejar'][1][0]
|
return self._clone_actions['set_cookiejar'][1][0]
|
||||||
|
@ -332,6 +332,12 @@ class BasicNewsRecipe(Recipe):
|
|||||||
#: ignore_duplicate_articles = {'title', 'url'}
|
#: ignore_duplicate_articles = {'title', 'url'}
|
||||||
ignore_duplicate_articles = None
|
ignore_duplicate_articles = None
|
||||||
|
|
||||||
|
#: If you set this to True, then calibre will use javascript to log in to the
|
||||||
|
#: website. This is needed for some websites that require the use of
|
||||||
|
#: javascript to log in. If you set this to True, you must implement the
|
||||||
|
#: :meth:`javascript_login` method, to do the actual logging in.
|
||||||
|
use_javascript_to_login = False
|
||||||
|
|
||||||
# See the built-in profiles for examples of these settings.
|
# See the built-in profiles for examples of these settings.
|
||||||
|
|
||||||
def short_title(self):
|
def short_title(self):
|
||||||
@ -404,8 +410,7 @@ class BasicNewsRecipe(Recipe):
|
|||||||
'''
|
'''
|
||||||
return url
|
return url
|
||||||
|
|
||||||
@classmethod
|
def get_browser(self, *args, **kwargs):
|
||||||
def get_browser(cls, *args, **kwargs):
|
|
||||||
'''
|
'''
|
||||||
Return a browser instance used to fetch documents from the web. By default
|
Return a browser instance used to fetch documents from the web. By default
|
||||||
it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_
|
it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_
|
||||||
@ -427,9 +432,47 @@ class BasicNewsRecipe(Recipe):
|
|||||||
return br
|
return br
|
||||||
|
|
||||||
'''
|
'''
|
||||||
br = browser(*args, **kwargs)
|
if self.use_javascript_to_login:
|
||||||
br.addheaders += [('Accept', '*/*')]
|
if getattr(self, 'browser', None) is not None:
|
||||||
return br
|
return self.clone_browser(self.browser)
|
||||||
|
from calibre.web.jsbrowser.browser import Browser
|
||||||
|
br = Browser()
|
||||||
|
with br:
|
||||||
|
self.javascript_login(br, self.username, self.password)
|
||||||
|
kwargs['user_agent'] = br.user_agent
|
||||||
|
ans = browser(*args, **kwargs)
|
||||||
|
ans.copy_cookies_from_jsbrowser(br)
|
||||||
|
return ans
|
||||||
|
else:
|
||||||
|
br = browser(*args, **kwargs)
|
||||||
|
br.addheaders += [('Accept', '*/*')]
|
||||||
|
return br
|
||||||
|
|
||||||
|
def javascript_login(self, browser, username, password):
|
||||||
|
'''
|
||||||
|
This method is used to log in to a website that uses javascript for its
|
||||||
|
login form. After the login is complete, the cookies returned from the
|
||||||
|
website are copied to a normal (non-javascript) browser and the
|
||||||
|
download proceeds using those cookies.
|
||||||
|
|
||||||
|
An example implementation::
|
||||||
|
|
||||||
|
def javascript_login(self, browser, username, password):
|
||||||
|
browser.visit('http://some-page-that-has-a-login')
|
||||||
|
form = browser.select_form(nr=0) # Select the first form on the page
|
||||||
|
form['username'] = username
|
||||||
|
form['password'] = password
|
||||||
|
browser.submit(timeout=120) # Submit the form and wait at most two minutes for loading to complete
|
||||||
|
|
||||||
|
Note that you can also select forms with CSS2 selectors, like this::
|
||||||
|
|
||||||
|
browser.select_form('form#login_form')
|
||||||
|
browser.select_form('form[name="someform"]')
|
||||||
|
|
||||||
|
'''
|
||||||
|
raise NotImplementedError('You must implement the javascript_login()'
|
||||||
|
' method if you set use_javascript_to_login'
|
||||||
|
' to True')
|
||||||
|
|
||||||
def clone_browser(self, br):
|
def clone_browser(self, br):
|
||||||
'''
|
'''
|
||||||
|
@ -303,6 +303,10 @@ class Browser(QObject, FormsMixin):
|
|||||||
self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self)
|
self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self)
|
||||||
self.page.setNetworkAccessManager(self.nam)
|
self.page.setNetworkAccessManager(self.nam)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def user_agent(self):
|
||||||
|
return self.page.user_agent
|
||||||
|
|
||||||
def _wait_for_load(self, timeout, url=None):
|
def _wait_for_load(self, timeout, url=None):
|
||||||
loop = QEventLoop(self)
|
loop = QEventLoop(self)
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
@ -422,3 +426,9 @@ class Browser(QObject, FormsMixin):
|
|||||||
pass
|
pass
|
||||||
self.nam = self.page = None
|
self.nam = self.page = None
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def __exit__(self, *args):
|
||||||
|
self.close()
|
||||||
|
|
||||||
|
@ -11,6 +11,7 @@ import unittest, pprint, threading, time
|
|||||||
|
|
||||||
import cherrypy
|
import cherrypy
|
||||||
|
|
||||||
|
from calibre import browser
|
||||||
from calibre.web.jsbrowser.browser import Browser
|
from calibre.web.jsbrowser.browser import Browser
|
||||||
from calibre.library.server.utils import (cookie_max_age_to_expires,
|
from calibre.library.server.utils import (cookie_max_age_to_expires,
|
||||||
cookie_time_fmt)
|
cookie_time_fmt)
|
||||||
@ -105,6 +106,12 @@ class Server(object):
|
|||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
@cherrypy.expose
|
||||||
|
def receive_cookies(self):
|
||||||
|
self.received_cookies = {n:(c.value, dict(c)) for n, c in
|
||||||
|
dict(cherrypy.request.cookie).iteritems()}
|
||||||
|
return pprint.pformat(self.received_cookies)
|
||||||
|
|
||||||
class Test(unittest.TestCase):
|
class Test(unittest.TestCase):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -202,6 +209,26 @@ class Test(unittest.TestCase):
|
|||||||
if fexp:
|
if fexp:
|
||||||
self.assertEqual(fexp, cexp)
|
self.assertEqual(fexp, cexp)
|
||||||
|
|
||||||
|
def test_cookie_copy(self):
|
||||||
|
'Test copying of cookies from jsbrowser to mechanize'
|
||||||
|
self.assertEqual(self.browser.visit('http://127.0.0.1:%d/cookies'%self.port),
|
||||||
|
True)
|
||||||
|
sent_cookies = self.server.sent_cookies.copy()
|
||||||
|
self.browser.visit('http://127.0.0.1:%d/receive_cookies'%self.port)
|
||||||
|
orig_rc = self.server.received_cookies.copy()
|
||||||
|
br = browser(user_agent=self.browser.user_agent)
|
||||||
|
br.copy_cookies_from_jsbrowser(self.browser)
|
||||||
|
br.open('http://127.0.0.1:%d/receive_cookies'%self.port)
|
||||||
|
for name, vals in sent_cookies.iteritems():
|
||||||
|
val = vals[0]
|
||||||
|
try:
|
||||||
|
rval = self.server.received_cookies[name][0]
|
||||||
|
except:
|
||||||
|
self.fail('The cookie: %s was not received by the server')
|
||||||
|
self.assertEqual(val, rval,
|
||||||
|
'The received value for the cookie: %s, %s != %s'%(
|
||||||
|
name, rval, val))
|
||||||
|
self.assertEqual(orig_rc, self.server.received_cookies)
|
||||||
|
|
||||||
def tests():
|
def tests():
|
||||||
return unittest.TestLoader().loadTestsFromTestCase(Test)
|
return unittest.TestLoader().loadTestsFromTestCase(Test)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user