# calibre/recipes/foreignaffairs.recipe

#!/usr/bin/env python2
from calibre.web.feeds.news import BasicNewsRecipe
import re
import html5lib
from lxml import html


def select_form(form):
    '''Form-selection predicate: match the form whose id is "user-login".

    ``form`` must expose an ``attrs`` mapping of its HTML attributes.
    Returns True only when the ``id`` attribute equals 'user-login'.
    '''
    form_id = form.attrs.get('id')
    return form_id == 'user-login'


class ForeignAffairsRecipe(BasicNewsRecipe):

    ''' Recipe for Foreign Affairs magazine (logs in with subscriber
    credentials and builds the issue from the magazine front page).

    There are three modifications:
    1) fetch issue cover
    2) toggle ignore premium articles
    3) extract proper section names, i.e. "Comments", "Essay"

    by Chen Wei, 2012-02-05

    Additional modifications to support rebranded website

    by anisotrope, 27 June 2015
    '''

    __license__  = 'GPL v3'
    __author__ = 'Rick Shang, kwetal, anisotrope'
    language = 'en'
    version = 1.02

    title = u'Foreign Affairs (Subcription)'
    publisher = u'Council on Foreign Relations'
    category = u'USA, Foreign Affairs'
    description = u'The leading forum for serious discussion of American foreign policy and international affairs.'

    no_stylesheets = True
    remove_javascript = True
    needs_subscription = True

    INDEX = 'http://www.foreignaffairs.com'
    FRONTPAGE = INDEX + '/magazine'

    # Keep only elements carrying at least one of these CSS classes.
    keep_only_tags = [
        dict(attrs={'class':lambda x: x and set(x.split()).intersection(set('article-header l-article-column'.split()))}),
    ]

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}

    def _absolute_url(self, url):
        '''Return ``url`` resolved against ``INDEX``.

        Empty values and URLs that already start with "http" are returned
        unchanged. Everything else is resolved with ``urljoin`` so that
        site-relative paths, protocol-relative URLs and non-http schemes
        (e.g. ``data:``) are all handled correctly instead of being blindly
        concatenated onto the site root.
        '''
        if not url or url.startswith('http'):
            return url
        try:
            from urllib.parse import urljoin  # Python 3
        except ImportError:
            from urlparse import urljoin  # Python 2
        return urljoin(self.INDEX + '/', url)

    @staticmethod
    def _remove_preserving_tail(elem):
        '''Detach an lxml element from its parent without losing its tail.

        lxml's ``parent.remove(elem)`` also discards ``elem.tail`` (the text
        that follows the element), so the tail is first moved onto the
        previous sibling or, failing that, onto the parent's text.
        '''
        parent = elem.getparent()
        if parent is None:
            # Root element: there is nothing to detach it from.
            return
        if elem.tail:
            previous = elem.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + elem.tail
            else:
                parent.text = (parent.text or '') + elem.tail
        parent.remove(elem)

    def parse_index(self):
        '''Build the issue index from the magazine front page.

        Side effects: sets ``self.cover_url`` (only when a cover image is
        found), ``self.title`` and ``self.timefmt``.

        Returns a list of ``(section_title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``date``, ``url``,
        ``description`` and ``author``. Sections without articles are
        omitted.
        '''
        answer = []
        soup = self.index_to_soup(self.FRONTPAGE)

        # The cover is best-effort: a missing hero image must not abort the
        # whole download.
        div = soup.find('div', attrs={'class':'magazine-hero__image image_auto_width'})
        cover_img = div.find('img') if div is not None else None
        if cover_img is not None and cover_img.get('src'):
            self.cover_url = self._absolute_url(cover_img['src'])

        # The issue date is the part of the page title before " | ".
        date = re.split(r'\s\|\s', self.tag_to_string(soup.head.title.string))[0]
        self.title = "Foreign Affairs ({})".format(date)
        self.timefmt =  u' [%s]'%date

        sec_start = soup.findAll('section', attrs={'class':re.compile(r'\bmagazine-list\b')})
        for sec in sec_start:
            articles = []
            section = self.tag_to_string(sec.find('h1'))
            for article_block in sec.findAll('article'):
                # The article link lives in the first <div> of the block;
                # skip blocks that do not have that shape instead of crashing.
                container = article_block.find('div')
                link = container.find('a') if container is not None else None
                if link is None or not link.get('href'):
                    continue

                title = self.tag_to_string(link.find('h2'))
                url = self._absolute_url(link['href'])

                # NOTE(review): findNext searches forward through the rest of
                # the document, not only inside this <article>; presumably the
                # author and deck always accompany each article -- confirm
                # against the live markup.
                atr = article_block.findNext('p', attrs={'class': 'author'})
                author = self.tag_to_string(atr) if atr is not None else ''
                desc = article_block.findNext('div', attrs={'class': 'deck'})
                summary = self.tag_to_string(desc) if desc is not None else ''

                articles.append({'title':title, 'date':None, 'url':url, 'description':summary, 'author':author})
            if articles:
                answer.append((section, articles))
        return answer

    def preprocess_raw_html(self, raw_html, url):
        '''Re-parse the raw page with html5lib and strip <svg> and <meta>.

        Returns the cleaned document serialised by ``lxml.html.tostring``.
        Text that follows a removed element is preserved.
        '''
        root = html5lib.parse(raw_html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
        # Materialise the iterators first: the tree is mutated while looping.
        for svg in tuple(root.iter('{*}svg')):
            self._remove_preserving_tail(svg)
        for meta in tuple(root.iter('{*}meta')):
            self._remove_preserving_tail(meta)
        return html.tostring(root)

    def preprocess_html(self, soup):
        '''Rewrite image ``src`` attributes to absolute URLs; return soup.'''
        for img in soup.findAll('img', attrs={'src': True}):
            img['src'] = self._absolute_url(img['src'])

        return soup

    def get_browser(self):
        '''Return a browser, logged in when credentials are configured.

        When both ``self.username`` and ``self.password`` are set, the login
        page is fetched, the "user-login" form is filled in and submitted.
        '''
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            # mechanize fails to parse the html correctly, so use html5lib to
            # sanitize the html first
            response = br.open('https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
            root = html5lib.parse(response.get_data(), treebuilder='lxml', namespaceHTMLElements=False)
            response.set_data(html.tostring(root))
            br.set_response(response)
            br.select_form(predicate=select_form)
            br.form['name'] = self.username
            br.form['pass'] = self.password
            br.submit()
        return br