# calibre/recipes/newsweek.recipe

from calibre.web.feeds.jsnews import JavascriptRecipe
from cssselect import HTMLTranslator
from lxml.etree import XPath
import datetime


def CSSSelect(expr):
    return XPath(HTMLTranslator().css_to_xpath(expr))


BASE = 'http://www.newsweek.com'
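

# Convert a relative link from the site into an absolute URL. The optional
# piano_t=1 query parameter appears to relate to the Piano Media paywall
# this recipe logs in to (see LOGIN below); that is an inference from the
# login URL, not documented behaviour.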
def href_to_url(a, add_piano=False):
    return BASE + a.get('href') + ('?piano_t=1' if add_piano else '')


class Newsweek(JavascriptRecipe):

    title = 'Newsweek'
    __author__ = 'Kovid Goyal'
    description = 'Weekly news and current affairs in the US. Requires a subscription.'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    requires_version = (1, 40, 0)

    keep_only_tags = ['article.content-fullwidth']

    remove_tags = [
        'meta', '.block-openadstream', '.block-ibtmedia-social', '.issue-next',
        '.most-popular', '.ibt-media-stories', '.user-btn-group',
        '#taboola-below-main-column', '.trc_related_container',
        '#block-nw-magazine-magazine-more-from-issue', '.block-ibtmedia-top-stories',
    ]

    LOGIN = 'https://bar.piano-media.com/lite/authent/login//custom/newsweek/?service_id=25&loc=http%3A%2F%2Fwww.newsweek.com%2F'  # noqa

    needs_subscription = True
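    # (calibre prompts for the username/password and passes them to
    # do_login() below; note that the login form is hosted by Piano Media,
    # the paywall service, rather than by newsweek.com itself)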

    def do_login(self, br, username, password):
        br.visit(self.LOGIN)
        form = br.select_form('#pianomedia_login_form')
        form['login'] = username
        form['password'] = password
        br.submit()

    def get_publication_data(self, browser):
        # Find the link to this week's edition in the site's main menu
        browser.wait_for_element('nav.main-menu a[href]')
        root = self.index_to_soup(browser.html)
        for a in CSSSelect('nav.main-menu a[href]')(root):
            if a.text and a.text.strip() == 'This Week\'s Edition':
                return self.get_newsweek_publication_data(browser, href_to_url(a, True))

    def get_newsweek_publication_data(self, browser, url):
        root = self.index_to_soup(url)
        sel = lambda expr: CSSSelect(expr)(root)
        ans = {}

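        # ans will carry two keys: 'cover' (the raw cover image data) and
        # 'index' (a list of (section title, list of articles) tuples)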

        # The cover image is the one in the cover-story block whose title
        # contains '_Cover_'
        for img in sel('div.cover-story div.info img[src]'):
            if '_Cover_' in img.get('title', ''):
                ans['cover'] = browser.get_resource(img.get('src'))
                break

        # The page <title> holds the issue date as YYYY/MM/DD; use it for
        # the date displayed in the periodical's title
        for title in root.xpath('//title'):
            raw = title.text
            if raw:
                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')

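        # Build the index as a list of (section title, list of articles)
        # tuples; each article is a dict with title, url and description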
        sections = []
        for div in sel('div.cover-story div.info'):
            url = None
            for a in div.xpath('descendant::a[@href]'):
                url = href_to_url(a)
                break
            for s in div.xpath('descendant::div[@class="summary"]'):
                sections.append(('Cover Story', [{
                    'title': 'Cover Story', 'date': '', 'url': url,
                    'description': self.tag_to_string(s)}]))
                break

        features = []
        for li in sel('div.features li'):
            url = None
            for a in li.xpath('descendant::a[@class="article-link"]'):
                url = href_to_url(a)
                features.append({'title': self.tag_to_string(a), 'url': url})
                break
        if features:
            sections.append(('Features', features))

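        # The remaining sections are rendered as div.issue-list-block
        # elements, each containing a block title and a list of article links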
        for div in sel('div.issue-list-block'):
            for d in div.xpath('descendant::div[@class="block-title"]'):
                section_title = self.tag_to_string(d)
                articles = []
                break
            else:
                # for/else: no block-title div was found, so skip this block
                continue
            for li in div.xpath('descendant::li'):
                desc = ''
                for d in li.xpath('descendant::div[@class="summary"]'):
                    desc = self.tag_to_string(d)
                    break
                for a in li.xpath('descendant::a[@class="article-link"]'):
                    articles.append({
                        'title': self.tag_to_string(a), 'url': href_to_url(a),
                        'description': desc})
                    break
            if articles:
                sections.append((section_title, articles))

        ans['index'] = sections
        return ans

    def load_complete(self, browser, url, recursion_level):
        browser.wait_for_element('div.article-body')
        # Waiting for the full load is needed to allow the parallax images
        # to load
        return browser.load_completed

    def preprocess_stage1(self, article, browser, url, recursion_level):
        # Parallax images in the articles are loaded as background images
        # on <span> tags. Convert them to normal images.
        for span in browser.css_select('span.parallax-image', all=True):
            bg = unicode(span.styleProperty('background-image', span.InlineStyle))
            if bg:
                # Extract the URL from a CSS value of the form url(...)
                url = bg.strip().partition('(')[-1][:-1]
                span.appendInside('<img src="%s"></img>' % url)
                span.setAttribute('style', '')
        browser.run_for_a_time(0.1)  # This is needed to give the DOM time to update

    def postprocess_html(self, article, root, url, recursion_level):
        # The body has a fixed height, which causes problems with epub
        # viewers, so drop its inline style
        for x in root.xpath('//*[local-name()="body" and @style]'):
            del x.attrib['style']
        # Remove the element injected by the Piano paywall, if present
        for x in root.xpath('//*[@id="piano-root"]'):
            x.getparent().remove(x)
        return root