#!/usr/bin/env python2
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

'''
economist.com
'''

import cookielib
from collections import OrderedDict

from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
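

# Build a BeautifulSoup attrs matcher that hits tags carrying any of the
# given space-separated CSS classes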
def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class NoArticles(Exception):
    pass


def process_url(url):
    if url.startswith('/'):
        url = 'https://www.economist.com' + url
    return url


class Economist(BasicNewsRecipe):

    title = 'The Economist'
    language = 'en'

    __author__ = "Kovid Goyal"
    INDEX = 'https://www.economist.com/printedition'
    description = (
        'Global news and current affairs from a European'
        ' perspective. Best downloaded on Friday mornings (GMT)'
    )
    extra_css = '''
        .headline {font-size: x-large;}
        h2 { font-size: small; }
        h1 { font-size: medium; }
        em.Bold {font-weight:bold;font-style:normal;}
        em.Italic {font-style:italic;}
        p.xhead {font-weight:bold;}
        .pullquote {
            float: right;
            font-size: larger;
            font-weight: bold;
            font-style: italic;
            page-break-inside:avoid;
            border-bottom: 3px solid black;
            border-top: 3px solid black;
            width: 228px;
            margin: 0px 0px 10px 15px;
            padding: 7px 0px 9px;
        }
        .flytitle-and-title__flytitle {
            display: block;
            font-size: smaller;
            color: red;
        }
    '''
    oldest_article = 7.0
    resolve_internal_links = True
    remove_tags = [
        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
        dict(attrs={
            'class': [
                'dblClkTrk', 'ec-article-info', 'share_inline_header',
                'related-items', 'main-content-container', 'ec-topic-widget',
                'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label',
                'blog-post__foot-note', 'blog-post__sharebar', 'blog-post__bottom-panel',
                'newsletter-form', 'share-links-header', 'teaser--wrapped',
                'latest-updates-panel__container',
                'latest-updates-panel__article-link', 'blog-post__section'
            ]
        }),
        dict(attrs={
            'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
        classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section newsletter-form')
    ]
    keep_only_tags = [dict(name='article', id=lambda x: not x)]
    no_stylesheets = True
    remove_attributes = ['data-reactid']
    # economist.com has started throttling requests: after about 60% of the
    # total has been downloaded, further connections fail with "connection
    # reset by peer" (104) errors, so pause between fetches.
    delay = 1
    needs_subscription = False

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        if self.output_profile.short_name.startswith('kindle'):
            # Reduce image sizes to get the file size below amazon's email
            # sending threshold
            self.web2disk_options.compress_news_images = True
            self.web2disk_options.compress_news_images_auto_size = 5
            self.log.warn(
                'Kindle Output profile being used, reducing image quality '
                'to keep file size below amazon email threshold')

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # Add a cookie indicating we have accepted Economist's cookie
        # policy (needed when running from some European countries)
        ck = cookielib.Cookie(
            version=0,
            name='notice_preferences',
            value='2:',
            port=None,
            port_specified=False,
            domain='.economist.com',
            domain_specified=False,
            domain_initial_dot=True,
            path='/',
            path_specified=False,
            secure=False,
            expires=None,
            discard=False,
            comment=None,
            comment_url=None,
            rest={'HttpOnly': None},
            rfc2109=False
        )
        br.cookiejar.set_cookie(ck)
        br.set_handle_gzip(True)
        return br
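
    # The site lazy-loads images: each one is a <div class="lazy-image">
    # whose <noscript> child holds the real <img> markup. Parse that markup
    # and swap it in for the placeholder so the image survives conversion,
    # then drop <script>/<style>/<source>/<meta> tags entirely.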
    def preprocess_raw_html(self, raw, url):
        import html5lib
        from lxml import etree
        root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
        for div in root.xpath('//div[@class="lazy-image"]'):
            noscript = list(div.iter('noscript'))
            if noscript and noscript[0].text:
                img = list(html5lib.parse(
                    noscript[0].text, namespaceHTMLElements=False,
                    treebuilder='lxml').iter('img'))
                if img:
                    p = noscript[0].getparent()
                    idx = p.index(noscript[0])
                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
                    p.remove(noscript[0])
        for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
            x.getparent().remove(x)
        return etree.tostring(root, encoding=unicode)
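
    # Build each article's summary from the text of the first two matching
    # elements: the flytitle (kicker) and/or the blog rubric.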
    def populate_article_metadata(self, article, soup, first):
        els = soup.findAll(
            name=['span', 'p'],
            attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
        result = []
        for el in els[0:2]:
            if el is not None and el.contents:
                for descendant in el.contents:
                    if isinstance(descendant, NavigableString):
                        result.append(unicode(descendant))
        article.summary = u'. '.join(result) + u'.'
        article.text_summary = clean_ascii_chars(article.summary)
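
    # Fetch the print edition index and delegate to economist_parse_index();
    # an empty result means the server is misbehaving or the markup has
    # changed, so fail loudly rather than produce an empty book.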
    def parse_index(self):
        # return [('Articles', [{
        #     'title': 'test',
        #     'url': 'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
        # }])]
        raw = self.index_to_soup(self.INDEX, raw=True)
        # with open('/t/raw.html', 'wb') as f:
        #     f.write(raw)
        soup = self.index_to_soup(raw)
        ans = self.economist_parse_index(soup)
        if not ans:
            raise NoArticles(
                'Could not find any articles, either the '
                'economist.com server is having trouble and you should '
                'try later or the website format has changed and the '
                'recipe needs to be updated.'
            )
        return ans
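
    # Current print-edition layout: the cover is an <img srcset=...> in the
    # cover widget (swap the 200px variant for the 640px one), and each
    # section is a div.list__title followed by sibling <a> tags, one per
    # article. If no such sections exist, fall back to the old-layout parser.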
    def economist_parse_index(self, soup):
        img = soup.find(attrs={'srcset': True, 'class': lambda x: x and 'print-edition__cover-widget__image' in x.split()})
        if img is not None:
            for part in img['srcset'].split():
                if part.startswith('/'):
                    part = part.replace('200-width', '640-width')
                    self.cover_url = 'https://www.economist.com' + part
                    self.log('Got cover:', self.cover_url)
                    break
        sections = soup.findAll('div', attrs={'class': 'list__title'})
        if sections:
            feeds = []
            for section in sections:
                articles = []
                secname = self.tag_to_string(section)
                self.log(secname)
                for a in section.findNextSiblings('a', href=True):
                    spans = a.findAll('span')
                    if len(spans) == 2:
                        title = u'{}: {}'.format(*map(self.tag_to_string, spans))
                    else:
                        title = self.tag_to_string(a)
                    articles.append({'title': title, 'url': process_url(a['href'])})
                    self.log(' ', title, articles[-1]['url'])
                if articles:
                    feeds.append((secname, articles))
            return feeds
        return self.economist_parse_old_index(soup)
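
    # Old print-edition layout: sections are elements with a 'section' class
    # headed by an <h4>, optional <h5> subsections prefix the article titles,
    # and '/print' is appended to each URL to fetch the printer-friendly page.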
    def economist_parse_old_index(self, soup):
        feeds = OrderedDict()
        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in x}):
            h4 = section.find('h4')
            if h4 is None:
                continue
            section_title = self.tag_to_string(h4).strip()
            if not section_title:
                continue
            self.log('Found section: %s' % section_title)
            articles = []
            subsection = ''
            for node in section.findAll(attrs={'class': 'article'}):
                subsec = node.findPreviousSibling('h5')
                if subsec is not None:
                    subsection = self.tag_to_string(subsec)
                prefix = (subsection + ': ') if subsection else ''
                a = node.find('a', href=True)
                if a is not None:
                    url = a['href']
                    if url.startswith('/'):
                        url = 'https://www.economist.com' + url
                    url += '/print'
                    title = self.tag_to_string(a)
                    if title:
                        title = prefix + title
                        self.log('\tFound article:', title)
                        articles.append({
                            'title': title,
                            'url': url,
                            'description': '',
                            'date': ''
                        })
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles
        return [(key, val) for key, val in feeds.iteritems()]
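
    # The old markup wraps captioned images in small <table>s: one <img>
    # plus one or two <font> caption cells, aligned right or center.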
    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x
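
    # Drop srcset attributes (only the plain src image is downloaded) and
    # replace each caption table found above with a simple left-aligned div:
    # caption text, a <br>, then the image with its fixed width and height
    # removed.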
    def postprocess_html(self, soup, first):
        for img in soup.findAll('img', srcset=True):
            del img['srcset']
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup
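
    # Downloaded articles may use the /print variant of a URL while links
    # between articles use the plain one; strip the suffix so
    # resolve_internal_links can match them up.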
    def canonicalize_internal_url(self, url, is_link=True):
        if url.endswith('/print'):
            url = url.rpartition('/')[0]
        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)