Update Discover Magazine

Kovid Goyal 2016-01-31 00:45:22 +05:30
parent cf8fcfe82b
commit cb14e8f549
2 changed files with 83 additions and 34 deletions

View File

@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
 discovermagazine.com
 '''
-import re
+import re, mechanize, json, cookielib
 from calibre.web.feeds.news import BasicNewsRecipe
 class DiscoverMagazine(BasicNewsRecipe):
@@ -35,29 +35,53 @@ class DiscoverMagazine(BasicNewsRecipe):
     # Login stuff
     needs_subscription = True
-    use_javascript_to_login = True
-    requires_version = (0, 9, 20)
-    def javascript_login(self, br, username, password):
-        br.visit('http://discovermagazine.com', timeout=120)
-        f = br.select_form('div.login.section div.form')
-        f['username'] = username
-        f['password'] = password
-        br.submit('input[id="signInButton"]', timeout=120)
-        br.run_for_a_time(20)
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        # Log in by POSTing the credentials as JSON to the Kalmbach
+        # authentication service instead of driving the site's javascript
+        # login form
+        rq = mechanize.Request(
+            'https://secure.kalmbach.com/kserv/api/authentication/login', headers={
+                'Content-Type': 'application/json; charset=UTF-8',
+                'Referer': 'http://discovermagazine.com',
+                'Accept': 'application/json, text/javascript, */*; q=0.01',
+                'Accept-Language': 'en-US,en;q=0.5',
+                'Origin': 'http://discovermagazine.com',
+            }, data=json.dumps(
+                {'appId': '2', 'email': self.username, 'password': self.password}))
+        br.set_debug_http(True)
+        data = json.loads(br.open(rq).read())
+        if not data.get('success'):
+            raise ValueError('Failed to login')
+        session_id = data['sessionId']
+        # Attach the returned session id as the KSERV cookie, constructing it
+        # manually when the browser has no set_cookie() helper
+        if hasattr(br, 'set_cookie'):
+            br.set_cookie('KSERV', session_id, 'discovermagazine.com')
+        else:
+            c = cookielib.Cookie(
+                None, 'KSERV', session_id,
+                None, False,
+                'discovermagazine.com', True, False,
+                '/', True,
+                False, None, False, None, None, None)
+            br.cookiejar.set_cookie(c)
+        res = br.open('http://discovermagazine.com')
+        br.set_debug_http(False)
+        raw = res.read()
+        if '>Logout<' not in raw:
+            raise ValueError('Failed to login')
+        return br
     # End login stuff
     def append_page(self, soup, appendtag, position):
         pager = soup.find('span', attrs={'class':'next'})
         if pager:
             nexturl = pager.a['href']
             soup2 = self.index_to_soup(nexturl)
             texttag = soup2.find('div', attrs={'class':'articlebody'})
             newpos = len(texttag.contents)
             self.append_page(soup2, texttag, newpos)
             texttag.extract()
             appendtag.insert(position, texttag)
     def preprocess_html(self, soup):
         mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
@@ -65,7 +89,7 @@ class DiscoverMagazine(BasicNewsRecipe):
         self.append_page(soup, soup.body, 3)
         pager = soup.find('div', attrs={'class':'listingBar'})
         if pager:
             pager.extract()
         return soup
     def postprocess_html(self, soup, first_fetch):

View File

@@ -7,39 +7,64 @@ __copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
 '''
 discovermagazine.com
 '''
-import re
+import re, json, cookielib
+import mechanize
 from calibre.web.feeds.news import BasicNewsRecipe
 class DiscoverMagazine(BasicNewsRecipe):
     title = 'Discover Magazine Monthly'
-    __author__ = 'Michael Marotta'
+    __author__ = 'Kovid Goyal'
     description = 'Monthly magazine version of Discover Magazine (not rss feed).'
     language = 'en'
     encoding = 'utf-8'
     simultaneous_downloads = 20
     tags = 'news, technology, science'
-    INDEX = 'http://www.discovermagazine.com'
+    INDEX = 'http://discovermagazine.com'
     keep_only_tags = [
         {'attrs':{'class':['headline', 'deck', 'belowDeck', 'mediaContainer', 'segment', 'cover']}},
     ]
     remove_tags = [dict(name='div', attrs={'class': ['ladder', 'mobile', 'popular', 'open', 'scistarter']})]
-    # Login stuff
+    # Login {{{
     needs_subscription = True
-    use_javascript_to_login = True
-    requires_version = (0, 9, 20)
-    def javascript_login(self, br, username, password):
-        br.visit('http://discovermagazine.com', timeout=120)
-        f = br.select_form('div.login.section div.form')
-        f['username'] = username
-        f['password'] = password
-        br.submit('input[id="signInButton"]', timeout=120)
-        br.run_for_a_time(20)
-        # br.show_browser()
-    # End login stuff
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        # Log in by POSTing the credentials as JSON to the Kalmbach
+        # authentication service instead of driving the site's javascript
+        # login form
+        rq = mechanize.Request(
+            'https://secure.kalmbach.com/kserv/api/authentication/login', headers={
+                'Content-Type': 'application/json; charset=UTF-8',
+                'Referer': 'http://discovermagazine.com',
+                'Accept': 'application/json, text/javascript, */*; q=0.01',
+                'Accept-Language': 'en-US,en;q=0.5',
+                'Origin': 'http://discovermagazine.com',
+            }, data=json.dumps(
+                {'appId': '2', 'email': self.username, 'password': self.password}))
+        br.set_debug_http(True)
+        data = json.loads(br.open(rq).read())
+        if not data.get('success'):
+            raise ValueError('Failed to login')
+        session_id = data['sessionId']
+        # Attach the returned session id as the KSERV cookie, constructing it
+        # manually when the browser has no set_cookie() helper
+        if hasattr(br, 'set_cookie'):
+            br.set_cookie('KSERV', session_id, 'discovermagazine.com')
+        else:
+            c = cookielib.Cookie(
+                None, 'KSERV', session_id,
+                None, False,
+                'discovermagazine.com', True, False,
+                '/', True,
+                False, None, False, None, None, None)
+            br.cookiejar.set_cookie(c)
+        res = br.open('http://discovermagazine.com')
+        br.set_debug_http(False)
+        raw = res.read()
+        if '>Logout<' not in raw:
+            raise ValueError('Failed to login')
+        return br
+    # End login }}}
     no_stylesheets = True
     preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),