calibre/recipes/discover_magazine_monthly.recipe
2019-05-18 18:55:02 +05:30

129 lines
5.2 KiB
Python

#!/usr/bin/env python2
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
# Written April 2015
# Last edited 4/17/15
'''
discovermagazine.com
'''
import re
import json
try:
    from http.cookiejar import Cookie
except ImportError:
    # Fallback for interpreters where the module is still named cookielib
    # (the shebang above targets python2).
    from cookielib import Cookie
import mechanize
from calibre.web.feeds.news import BasicNewsRecipe
class DiscoverMagazine(BasicNewsRecipe):
    '''
    Recipe for the monthly issue of Discover Magazine.

    Logs in against the Kalmbach authentication endpoint, follows the
    "current issue" link on the home page and builds one feed per section
    heading found in the issue listing.
    '''

    title = 'Discover Magazine Monthly'

    __author__ = 'Kovid Goyal'

    description = 'Monthly magazine version of Discover Magazine (not rss feed).'
    language = 'en'
    encoding = 'utf-8'
    simultaneous_downloads = 20
    tags = 'news, technology, science'
    # Base URL of the site; parse_index() replaces it on the instance with
    # the URL of the current issue.
    INDEX = 'http://discovermagazine.com'

    keep_only_tags = [
        {'attrs': {'class': ['headline', 'deck', 'belowDeck',
                             'mediaContainer', 'segment', 'cover']}},
    ]
    remove_tags = [dict(name='div', attrs={
        'class': ['ladder', 'mobile', 'popular', 'open', 'scistarter']})]

    # Login {{{
    needs_subscription = True

    def get_browser(self):
        '''
        Return a browser that carries an authenticated session.

        Posts self.username / self.password as JSON to the login endpoint,
        stores the returned session id in a ``KSERV`` cookie for
        discovermagazine.com and then loads the home page to check that a
        logout link is present.

        Raises ValueError if the login response does not report success or
        the home page does not contain the logout link.
        '''
        br = BasicNewsRecipe.get_browser(self)
        rq = mechanize.Request(
            'https://secure.kalmbach.com/kserv/api/authentication/login',
            headers={
                'Content-Type': 'application/json; charset=UTF-8',
                'Referer': 'http://discovermagazine.com',
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Language': 'en-US,en;q=0.5',
                'Origin': 'http://discovermagazine.com',
            },
            data=json.dumps(
                {'appId': '2', 'email': self.username, 'password': self.password}))
        # NOTE(review): HTTP debugging is switched on for the whole login
        # exchange; this may write the submitted credentials to the log --
        # confirm that this is still wanted.
        br.set_debug_http(True)
        # Submit the login request exactly once and parse that response.
        data = json.loads(br.open(rq).read())
        if not data.get('success'):
            raise ValueError('Failed to login')
        session_id = data['sessionId']
        if hasattr(br, 'set_cookie'):
            br.set_cookie('KSERV', session_id, 'discovermagazine.com')
        else:
            # The browser has no set_cookie() helper, so build the cookie by
            # hand and put it into the cookie jar directly.
            c = Cookie(
                version=None, name='KSERV', value=session_id,
                port=None, port_specified=False,
                domain='discovermagazine.com', domain_specified=True,
                domain_initial_dot=False,
                path='/', path_specified=True,
                secure=False, expires=None, discard=False,
                comment=None, comment_url=None, rest=None)
            br.cookiejar.set_cookie(c)
        res = br.open('http://discovermagazine.com')
        br.set_debug_http(False)
        raw = res.read()
        if b'>Logout<' not in raw:
            raise ValueError('Failed to login')
        return br
    # End login }}}

    no_stylesheets = True
    # Drop <br/> tags and <br clear=.../> tags. The second pattern stops at the
    # end of the tag ([^>]*) so it cannot swallow later markup on the same line.
    preprocess_regexps = [
        (re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
        (re.compile(r'<br[ ]*clear[^>]*/>', re.IGNORECASE), lambda m: ''),
    ]
    # '.belowDeck' matches the class name used in keep_only_tags; the
    # all-lowercase spelling is kept alongside it.
    extra_css = (
        'body { font-family: helvetica, sans-serif; } '
        '.belowDeck, .belowdeck {font-style: italic; padding-bottom: 10px; max-width: none} '
        '.caption {font-style: italic; padding-bottom: 10px; max-width: none} '
        '.caption1 {font-style: italic; padding-bottom: 10px; max-width: none} '
        'h2 { text-align: left; font-size: 1em; font-weight: bold; }'
    )

    def parse_index(self):
        '''
        Build the feeds for the current issue.

        Returns a list of ``(section, articles)`` tuples where ``articles``
        is a list of dicts with the keys ``title``, ``url`` and
        ``description``. Sets self.cover_url when a cover image is found and
        sets self.INDEX to the URL of the current issue.

        Raises ValueError if the home page has no link to the current issue
        or the issue page has no issue listing.
        '''
        site = 'http://www.discovermagazine.com'
        # Always start from the class-level base URL so that a repeated call
        # does not append the issue path a second time.
        home = type(self).INDEX

        # The home page links to the current issue; the link's href is
        # appended to the base URL to get the issue index.
        soup = self.index_to_soup(home)
        link = soup.find(name=['a'], attrs={
            'title': ['See inside the current issue of Discover Magazine']})
        if link is None:
            raise ValueError('Could not find the link to the current issue')
        current_month = self.tag_to_string(link['href'])
        self.INDEX = home + current_month

        soup = self.index_to_soup(self.INDEX)
        col = soup.find(attrs={'class': 'issue'})
        if col is None:
            raise ValueError('Could not find the issue listing on ' + self.INDEX)

        # find cover
        cover = soup.find('div', attrs={'class': 'cover'})
        if cover is not None:
            img = cover.find('img', src=True)
            if img is not None:
                self.cover_url = site + img['src'].replace(' ', '%20')

        # parse articles: an <h3> starts a new section, every other matched
        # tag is treated as an article belonging to the section seen last.
        feeds = []
        current_section, current_articles = None, []
        for tag in col.findAll(name=['h3', 'div'],
                               attrs={'class': ['bottomBorder', 'headline']}):
            if tag.name == 'h3':
                if current_section and current_articles:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(tag).capitalize()
                current_articles = []
                self.log('Found section:', current_section)
                continue
            if not current_section:
                # Entries seen before the first section heading are skipped.
                continue
            a = tag.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            if not title:
                continue
            url = site + a['href']
            p = tag.find('div', attrs={'class': 'snippet'})
            desc = self.tag_to_string(p) if p is not None else ''
            current_articles.append(
                {'title': title, 'url': url, 'description': desc})
            self.log('\tArticle:', title, '[%s]' % url)
            self.log('\t\t', desc)

        # Flush the section that was still being collected when the loop ended.
        if current_section and current_articles:
            feeds.append((current_section, current_articles))
        return feeds