# calibre/recipes/foreignaffairs.recipe

#!/usr/bin/env python2
from calibre.web.feeds.news import BasicNewsRecipe
import re
import html5lib
from lxml import html


def select_form(form):
    '''Predicate for ``br.select_form``: true only for the login form.

    ``form`` is expected to expose an ``attrs`` mapping of its HTML
    attributes; the form matches when its ``id`` is ``user-login``.
    '''
    form_id = form.attrs.get('id')
    return form_id == 'user-login'


class ForeignAffairsRecipe(BasicNewsRecipe):
    '''Recipe for the current issue of Foreign Affairs (subscriber login).

    The index is built from the magazine front page: the cover comes from
    the hero image, the issue date from the page title, and every
    ``magazine-list`` block becomes one section (e.g. "Comments", "Essay").

    History:
      * Chen Wei, 2012-02-05 -- three modifications:
          1) fetch issue cover
          2) toggle ignore premium articles
          3) extract proper section names, ie. "Comments", "Essay"
      * anisotrope, 27 June 2015 -- additional modifications to support
        the rebranded website.
    '''

    __license__ = 'GPL v3'
    __author__ = 'Rick Shang, kwetal, anisotrope'
    language = 'en'
    version = 1.02

    # NOTE(review): "Subcription" looks like a typo for "Subscription"; it
    # is left untouched here because the title is user-visible recipe
    # metadata. parse_index() replaces it with a dated title.
    title = u'Foreign Affairs (Subcription)'
    publisher = u'Council on Foreign Relations'
    category = u'USA, Foreign Affairs'
    description = u'The leading forum for serious discussion of American foreign policy and international affairs.'

    no_stylesheets = True
    remove_javascript = True
    needs_subscription = True

    # Site root, also used to make site-relative URLs absolute.
    INDEX = 'http://www.foreignaffairs.com'
    FRONTPAGE = INDEX + '/magazine'

    # Keep any element whose class list contains 'article-header' or
    # 'l-article-column'.
    keep_only_tags = [
        dict(attrs={'class': lambda x: x and set(x.split()).intersection(
            set('article-header l-article-column'.split()))}),
    ]

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}

    def parse_index(self):
        '''Build the list of sections and articles for the current issue.

        Side effects: sets ``self.title`` and ``self.timefmt`` from the
        front page title, and ``self.cover_url`` when a hero image with a
        ``src`` is present.

        Returns a list of ``(section_title, articles)`` tuples; each
        article is a dict with ``title``, ``url``, ``description`` and
        ``author`` keys. Sections without any linked article are omitted.
        '''
        answer = []
        # Fetch the front page as a tree so it can be cleaned, then
        # re-parse the cleaned markup into a soup.
        tree = self.index_to_soup(self.FRONTPAGE, as_tree=True)
        soup = self.index_to_soup(html.tostring(self.clean_fa_html(tree)))

        # The cover is optional: a missing hero block or image must not
        # abort the whole download.
        hero = soup.find(
            'div', attrs={'class': 'magazine-hero__image image_auto_width'})
        cover_img = hero.find('img') if hero is not None else None
        cover_src = cover_img.get('src') if cover_img is not None else None
        if cover_src:
            self.cover_url = cover_src
        else:
            self.log('No issue cover found on ' + self.FRONTPAGE)

        # The issue date is whatever precedes the first " | " in <title>.
        date = re.split(r'\s\|\s', self.tag_to_string(
            soup.head.title.string))[0]
        self.title = "Foreign Affairs ({})".format(date)
        self.timefmt = u' [%s]' % date

        for section in soup.findAll(attrs={'class': lambda x: x and 'magazine-list' in x.split()}):
            articles = []
            section_title = self.tag_to_string(section.find('h1'))
            for h2 in section.findAll('h2'):
                # Each headline is wrapped by the link to its article.
                link = h2.parent
                if not link.get('href'):
                    continue
                title = self.tag_to_string(h2)
                url = link['href']
                if url.startswith('/') and not url.startswith('//'):
                    # Site-relative link: make it absolute.
                    url = self.INDEX + url
                author_tag = link.findNextSibling(attrs={'class': 'author'})
                author = self.tag_to_string(author_tag) if author_tag else ''
                deck = link.findNextSibling(attrs={'class': 'deck'})
                summary = self.tag_to_string(deck) if deck is not None else ''
                articles.append({'title': title, 'url': url,
                                 'description': summary, 'author': author})
                self.log(title)
                self.log('\t' + url)
            if articles:
                answer.append((section_title, articles))
        return answer

    def clean_fa_html(self, root):
        '''Strip every ``<svg>`` and ``<meta>`` element from ``root``.

        The tree is modified in place and also returned. Text that follows
        a removed element (its lxml ``tail``) is kept by moving it onto the
        previous sibling or, failing that, onto the parent.
        '''
        for tag in ('{*}svg', '{*}meta'):
            # Snapshot the matches: the tree is mutated while we walk it.
            for elem in tuple(root.iter(tag)):
                parent = elem.getparent()
                if parent is None:
                    # The root itself matched; there is nothing to detach
                    # it from.
                    continue
                tail = elem.tail
                if tail:
                    # remove() would discard the tail along with the
                    # element, silently dropping document text.
                    previous = elem.getprevious()
                    if previous is not None:
                        previous.tail = (previous.tail or '') + tail
                    else:
                        parent.text = (parent.text or '') + tail
                parent.remove(elem)
        return root

    def preprocess_raw_html(self, raw_html, url):
        '''Re-parse ``raw_html`` with html5lib, clean it and serialize it.

        ``url`` is accepted for interface compatibility and is not used.
        '''
        root = html5lib.parse(raw_html, treebuilder='lxml',
                              namespaceHTMLElements=False).getroot()
        self.clean_fa_html(root)
        return html.tostring(root)

    def preprocess_html(self, soup):
        '''Make image sources absolute and return the same ``soup``.

        Sources that already start with ``http`` and inline ``data:`` URIs
        are left untouched.
        '''
        for img in soup.findAll('img', attrs={'src': True}):
            src = img['src']
            if src.startswith(('http', 'data:')):
                continue
            if src.startswith('//'):
                # Protocol-relative URL: only the scheme is missing, so
                # borrow it from INDEX.
                img['src'] = self.INDEX.partition('//')[0] + src
            elif src.startswith('/'):
                img['src'] = self.INDEX + src
            else:
                # No leading slash; the page URL is not available here, so
                # resolve against the site root.
                img['src'] = self.INDEX + '/' + src

        return soup

    def get_browser(self):
        '''Return a browser, logged in when username and password are set.'''
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            # mechanize fails to parse the html correctly, so use html5lib to
            # sanitize the html first
            response = br.open(
                'https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
            root = html5lib.parse(
                response.get_data(), treebuilder='lxml', namespaceHTMLElements=False)
            response.set_data(html.tostring(root))
            br.set_response(response)
            br.select_form(predicate=select_form)
            br.form['name'] = self.username
            br.form['pass'] = self.password
            br.submit()
        return br