calibre/recipes/time_magazine.recipe

#!/usr/bin/env  python2
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'

'''
time.com
'''

from calibre.web.feeds.jsnews import JavascriptRecipe
from lxml import html

# Keep the login method as standalone, so it can be easily tested
def do_login(browser, username, password):
    '''
    Sign in to time.com using the sign-in form on the magazine page.

    The browser is left on http://time.com/magazine afterwards.

    :param browser: browser object used to load the page and submit the form
    :param username: value entered into the form's ``username`` field
    :param password: value entered into the form's ``password`` field
    :raises ValueError: if the signed-in marker (``body.is-signed-in``) has
        not appeared within 180 seconds of submitting the form
    '''
    from calibre.web.jsbrowser.browser import Timeout
    browser.visit('http://time.com/magazine')
    login_form = browser.select_form('#sign-in-form')
    for field, value in (('username', username), ('password', password)):
        login_form[field] = value
    browser.submit('#Sign_In')
    try:
        # The page marks a successful login by adding a class to <body>
        browser.wait_for_element('body.is-signed-in', timeout=180)
    except Timeout:
        raise ValueError('Failed to login to time.com, check your username and password and try again in a little while.')

def evaljs(elem, js):
    '''
    Evaluate the JavaScript snippet ``js`` on ``elem`` and return the result
    as a unicode string.

    Need this to work with both PyQt4 and PyQt5: the result is converted via
    its ``toString()`` method when it has one, and is otherwise passed to
    ``unicode()`` directly.
    '''
    result = elem.evaluateJavaScript(js)
    try:
        text = result.toString()
        text = unicode(text)
    except AttributeError:
        # No usable toString(), convert the raw result instead
        text = unicode(result)
    return text

class Time(JavascriptRecipe):
    '''
    Recipe that downloads the current issue of Time magazine from time.com.

    Logging in (see the module level do_login()) leaves the browser on the
    magazine page; the article list, the issue date and the cover image are
    all read from that page.
    '''
    title                 = u'Time'
    __author__            = 'Kovid Goyal'
    description           = 'Weekly US magazine.'
    language              = 'en'
    needs_subscription    = True
    requires_version      = (0, 9, 35)

    no_stylesheets        = True
    remove_javascript     = True
    # Selector for the article body, followed by selectors for the page
    # furniture that should be stripped from it
    keep_only_tags = ['.article-viewport .full-article']
    remove_tags = ['.read-more-list', '.read-more-inline', '.article-footer', '.subscribe', '.tooltip', '#first-visit']

    def do_login(self, browser, username, password):
        ''' Log in by delegating to the standalone, module level do_login() '''
        do_login(browser, username, password)

    def get_time_cover(self, browser):
        '''
        Fetch the cover of the current issue.

        Reads the ``src`` of the magazine thumbnail on the current page,
        replaces its query string with ``?w=814`` and returns whatever
        ``browser.get_resource()`` gives for that URL.
        '''
        selector = '#rail-articles img.magazine-thumb'
        cover = browser.css_select(selector)
        # URL for large cover
        cover_url = evaljs(cover, 'this.src')
        cover_url = cover_url.partition('?')[0] + '?w=814'
        return browser.get_resource(cover_url)

    def get_publication_data(self, browser):
        '''
        Build the issue index and fetch the cover from the magazine page.

        Returns a dict with two keys: ``index``, a list containing the single
        section ``('Articles', articles)`` where every article is a dict with
        ``title``, ``url``, ``date`` and ``description`` keys, and ``cover``,
        the value returned by get_time_cover(). Also sets ``self.timefmt``
        when the page shows an issue date.

        Headings that cannot be turned into an article (no child element to
        read the title from, or no link) are logged and skipped, so a single
        malformed entry does not abort the whole download.
        '''
        # We are already at the magazine page thanks to the do_login() method
        ans = {}
        root = html.fromstring(browser.html)

        dates = ''.join(root.xpath('//*[@class="rail-article-magazine-issue"]/date/text()'))
        if dates:
            self.timefmt = ' [%s]' % dates

        # An IndexError here means the page has no article rail at all
        parent = root.xpath('//section[@id="rail-articles"]')[0]
        articles = []
        for h3 in parent.xpath(
                'descendant::h3[contains(@class, "rail-article-title")]'):
            # len() of an lxml element is its number of child elements
            if not len(h3):
                self.log('\nSkipping heading without a title element')
                continue
            title = html.tostring(h3[0], encoding=unicode, method='text').strip()
            # Check for the section heading before asking it for a link, it
            # is not an article
            if title == 'In the Latest Issue':
                continue
            links = h3.xpath('descendant::a[@href]')
            if not links:
                self.log('\nSkipping heading without a link:', title)
                continue
            url = links[0].get('href')
            excerpts = h3.xpath('following-sibling::h2[@class="rail-article-excerpt"]')
            desc = ''
            if excerpts:
                desc = html.tostring(excerpts[0], encoding=unicode, method='text').strip()
            self.log('\nFound article:', title)
            self.log('\t' + desc)
            articles.append({'title':title, 'url':url, 'date':'', 'description':desc})

        ans['index'] = [('Articles', articles)]
        ans['cover'] = self.get_time_cover(browser)
        return ans

    def load_complete(self, browser, url, rl):
        '''
        Wait for the ``footer.article-footer`` element to appear on the page
        and then return True.
        '''
        browser.wait_for_element('footer.article-footer')
        return True

    def postprocess_html(self, article, root, url, recursion_level):
        ''' Remove every element with id ``first-visit`` and return root '''
        # get rid of the first visit div which for some reason remove_tags is
        # not removing
        for div in root.xpath('//*[@id="first-visit"]'):
            div.getparent().remove(div)
        return root

if __name__ == '__main__':
    # Manual check of the login: the last two command line arguments are
    # used as the username and the password, and the browser window is shown
    # once do_login() returns
    import sys
    from calibre import jsbrowser
    test_browser = jsbrowser(default_timeout=120)
    username, password = sys.argv[-2], sys.argv[-1]
    do_login(test_browser, username, password)
    test_browser.show_browser()