# calibre/recipes/newsweek.recipe

from calibre.web.feeds.jsnews import JavascriptRecipe
from cssselect import HTMLTranslator
from lxml.etree import XPath
import datetime


def CSSSelect(expr):
    return XPath(HTMLTranslator().css_to_xpath(expr))


BASE = 'http://www.newsweek.com'
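

# Convert a relative link from the site into an absolute URL. The optional
# piano_t=1 query parameter appears to relate to the Piano Media paywall
# this recipe logs in to (see LOGIN below); that is an inference from the
# login URL, not documented behaviour.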
def href_to_url(a, add_piano=False):
    return BASE + a.get('href') + ('?piano_t=1' if add_piano else '')


class Newsweek(JavascriptRecipe):

    title = 'Newsweek'
    __author__ = 'Kovid Goyal'
    description = 'Weekly news and current affairs in the US. Requires a subscription.'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    requires_version = (1, 40, 0)

    keep_only_tags = ['article.content-fullwidth']

    remove_tags = [
        'meta', '.block-openadstream', '.block-ibtmedia-social', '.issue-next',
        '.most-popular', '.ibt-media-stories', '.user-btn-group',
        '#taboola-below-main-column', '.trc_related_container',
        '#block-nw-magazine-magazine-more-from-issue', '.block-ibtmedia-top-stories',
    ]

    LOGIN = 'https://bar.piano-media.com/lite/authent/login//custom/newsweek/?service_id=25&loc=http%3A%2F%2Fwww.newsweek.com%2F'  # noqa

    needs_subscription = True
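    # (calibre prompts for the username/password and passes them to
    # do_login() below; note that the login form is hosted by Piano Media,
    # the paywall service, rather than by newsweek.com itself)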

    def do_login(self, br, username, password):
        br.visit(self.LOGIN)
        form = br.select_form('#pianomedia_login_form')
        form['login'] = username
        form['password'] = password
        br.submit()

    def get_publication_data(self, browser):
        # Find the link to this week's edition in the site's main menu
        browser.wait_for_element('nav.main-menu a[href]')
        root = self.index_to_soup(browser.html)
        for a in CSSSelect('nav.main-menu a[href]')(root):
            if a.text and a.text.strip() == 'This Week\'s Edition':
                return self.get_newsweek_publication_data(browser, href_to_url(a, True))

    def get_newsweek_publication_data(self, browser, url):
        root = self.index_to_soup(url)
        sel = lambda expr: CSSSelect(expr)(root)
        ans = {}

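        # ans will carry two keys: 'cover' (the raw cover image data) and
        # 'index' (a list of (section title, list of articles) tuples)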

        # The cover image is the one in the cover-story block whose title
        # contains '_Cover_'
        for img in sel('div.cover-story div.info img[src]'):
            if '_Cover_' in img.get('title', ''):
                ans['cover'] = browser.get_resource(img.get('src'))
                break

        # The page <title> holds the issue date as YYYY/MM/DD; use it for
        # the date displayed in the periodical's title
        for title in root.xpath('//title'):
            raw = title.text
            if raw:
                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')

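        # Build the index as a list of (section title, list of articles)
        # tuples; each article is a dict with title, url and description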
        sections = []
        for div in sel('div.cover-story div.info'):
            url = None
            for a in div.xpath('descendant::a[@href]'):
                url = href_to_url(a)
                break
            for s in div.xpath('descendant::div[@class="summary"]'):
                sections.append(('Cover Story', [{
                    'title': 'Cover Story', 'date': '', 'url': url,
                    'description': self.tag_to_string(s)}]))
                break

        features = []
        for li in sel('div.features li'):
            url = None
            for a in li.xpath('descendant::a[@class="article-link"]'):
                url = href_to_url(a)
                features.append({'title': self.tag_to_string(a), 'url': url})
                break
        if features:
            sections.append(('Features', features))

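        # The remaining sections are rendered as div.issue-list-block
        # elements, each containing a block title and a list of article links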
        for div in sel('div.issue-list-block'):
            for d in div.xpath('descendant::div[@class="block-title"]'):
                section_title = self.tag_to_string(d)
                articles = []
                break
            else:
                # for/else: no block-title div was found, so skip this block
                continue
            for li in div.xpath('descendant::li'):
                desc = ''
                for d in li.xpath('descendant::div[@class="summary"]'):
                    desc = self.tag_to_string(d)
                    break
                for a in li.xpath('descendant::a[@class="article-link"]'):
                    articles.append({
                        'title': self.tag_to_string(a), 'url': href_to_url(a),
                        'description': desc})
                    break
            if articles:
                sections.append((section_title, articles))

        ans['index'] = sections
        return ans

    def load_complete(self, browser, url, recursion_level):
        browser.wait_for_element('div.article-body')
        # Waiting for the full load is needed to allow the parallax images
        # to load
        return browser.load_completed

    def preprocess_stage1(self, article, browser, url, recursion_level):
        # Parallax images in the articles are loaded as background images
        # on <span> tags. Convert them to normal images.
        for span in browser.css_select('span.parallax-image', all=True):
            bg = unicode(span.styleProperty('background-image', span.InlineStyle))
            if bg:
                # Extract the URL from a CSS value of the form url(...)
                url = bg.strip().partition('(')[-1][:-1]
                span.appendInside('<img src="%s"></img>' % url)
                span.setAttribute('style', '')
        browser.run_for_a_time(0.1)  # This is needed to give the DOM time to update

    def postprocess_html(self, article, root, url, recursion_level):
        # The body has a fixed height, which causes problems with epub
        # viewers, so drop its inline style
        for x in root.xpath('//*[local-name()="body" and @style]'):
            del x.attrib['style']
        # Remove the element injected by the Piano paywall, if present
        for x in root.xpath('//*[@id="piano-root"]'):
            x.getparent().remove(x)
        return root