# calibre/recipes/wsj_free.recipe

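# Scrapes the freely available articles from the WSJ "In Today's Paper" index
# at http://online.wsj.com/itp/today, using calibre's JavaScript-capable
# browser-based news framework (JavascriptRecipe).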
from calibre.web.feeds.jsnews import JavascriptRecipe
from calibre.web.jsbrowser.browser import NotAFile


def CSSSelect(expr):
    # Translate the small, fixed set of CSS selectors used by this recipe
    # into pre-compiled XPath expressions (avoiding a dependency on
    # cssselect). The returned XPath object is applied to a parsed lxml
    # tree by calling it, e.g. CSSSelect('a[href]')(root).
    expr = {
        'div.whatsNews-simple': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ')]''',
        'a.mjLinkItem[href]': '''descendant-or-self::a[@class and contains(concat(' ', normalize-space(@class), ' '), ' mjLinkItem ') and (@href)]''',
        '.meta_sectionName': '''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' meta_sectionName ')]''',
        'p': 'descendant-or-self::p',
        'div.whatsNews-simple.whatsNews-itp': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-itp '))]''',
        'a[href]': 'descendant-or-self::a[@href]',
        'span.date-date': "descendant-or-self::span[@class and contains(concat(' ', normalize-space(@class), ' '), ' date-date ')]",
        'div.itpSectionHeaderPdf a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpSectionHeaderPdf ')]/descendant-or-self::*/a[@href]",
        'div.itpHeader ul.tab a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpHeader ')]/descendant-or-self::*/ul[@class and contains(concat(' ', normalize-space(@class), ' '), ' tab ')]/descendant-or-self::*/a[@href]",
    }[expr]
    from lxml.etree import XPath
    return XPath(expr)


class WSJ(JavascriptRecipe):

    title = 'Wall Street Journal (free)'
    __author__ = 'Kovid Goyal'
    description = '''News and current affairs. This recipe only fetches complete
    versions of the articles that are available free on the wsj.com website.
    To get the rest of the articles, subscribe to the WSJ and use the other WSJ
    recipe.'''
    language = 'en'

    compress_news_images = True
    compress_news_images_auto_size = 5
    max_articles_per_feed = 1000
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'data-scrim']

    keep_only_tags = (
        'h1',  # 'h2.subhead', 'h2.subHed.deck',
        'span[itemprop=author][rel=author]',
        'article#article-contents', 'article#articleBody',
        'div#article_story_body', 'header.article_header',
        # Parallax formatting
        'div#ncTitleArea', 'section.nc-exp-artbody',
        # Error conditions, login required and page not found
        'div#snippet-ad-login', 'div.wsj-snippet-body', 'div.wsj-snippet-login', 'div.errorNotFound',
    )

    remove_tags = (
        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
        '.article_tools', 'span[data-country-code][data-ticker-code]',
        'div.nc-exp-artmeta',
    )

    def preprocess_stage2(self, article, browser, url, recursion_level):
        # Slideshow and expandable images need to be processed here to
        # set the src attribute correctly
        found = 0
        for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
            img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
            found += 1
        for img in browser.css_select('img[data-enlarge]', all=True):
            img.setAttribute('src', img.attribute('data-enlarge'))
            found += 1
        if found:
            self.log.debug('Found %d dynamic images in:' % found, url)

    def get_publication_data(self, browser):
        return self.get_wsj_index(browser)

    def abs_wsj_url(self, href):
        if not href.startswith('http'):
            href = 'http://online.wsj.com' + href
        return href

    def wsj_find_articles(self, url, ahed=False):
        root = self.index_to_soup(url)

        # Remove the "What's News" summary boxes before scanning the page
        # for article links; their contents are collected separately by
        # wsj_find_wn_articles()
        for x in CSSSelect('div.whatsNews-simple')(root):
            x.getparent().remove(x)

        articles = []

        for a in CSSSelect('a.mjLinkItem[href]')(root):
            container = a.xpath('ancestor::li')
            meta = CSSSelect('.meta_sectionName')(a)
            if meta:
                meta = meta[0]
                meta.getparent().remove(meta)
                meta = self.tag_to_string(meta)
            title = self.tag_to_string(a)
            if meta:
                title += ' [%s]' % meta
            url = self.abs_wsj_url(a.get('href'))
            desc = ''
            if container:
                for p in CSSSelect('p')(container[0]):
                    desc = self.tag_to_string(p)
                    if 'Subscriber Content' not in desc:
                        break

            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})

            self.log('\tFound article:', title)
            self.log('\t\t', desc)

        if ahed:
            # On the front section, also pick up articles from the ahed
            # (front-page feature) list items
            for h2 in root.xpath('//li[@class="ahed_listitem"]/h2'):
                a = h2.xpath('descendant::a')[0]
                title = self.tag_to_string(a)
                url = self.abs_wsj_url(a.get('href'))
                desc = ''
                p = h2.xpath('following-sibling::p')
                if p:
                    desc = self.tag_to_string(p[0])
                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': ''})
                self.log('Found article:', title)
                self.log('\t\t', desc)

        return articles
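
    # Parse the "What's News" roundup column, whose markup
    # (div.whatsNews-simple.whatsNews-itp) differs from the regular section
    # pages handled by wsj_find_articles() above.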
    def wsj_find_wn_articles(self, url):
        root = self.index_to_soup(url)
        articles = []

        whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
        if whats_news:
            for a in CSSSelect('a[href]')(whats_news[-1]):
                if '/articles/' not in a.get('href', ''):
                    continue
                container = a.xpath('ancestor::p')
                for meta in CSSSelect('.meta_sectionName')(a):
                    meta.getparent().remove(meta)
                title = self.tag_to_string(a).strip()
                url = self.abs_wsj_url(a.get('href'))
                desc = ''
                if container:
                    desc = self.tag_to_string(container[0])

                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': ''})

                self.log('\tFound WN article:', title)
                self.log('\t\t', desc)

        return articles

    def wsj_add_feed(self, feeds, title, url):
        self.log('Found section:', title)
        try:
            if url.endswith('whatsnews'):
                articles = self.wsj_find_wn_articles(url)
            else:
                articles = self.wsj_find_articles(url, ahed=title == 'Front Section')
        except Exception:
            # A failure in one section should not abort the whole download
            self.log.exception('Failed to parse section:', title)
            articles = []
        if articles:
            feeds.append((title, articles))
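
    # Build the publication data calibre needs: the issue date shown on the
    # index page (used as timefmt), the cover (downloaded from the
    # section-header PDF link, if it resolves to a file) and one feed per
    # section tab of the "In Today's Paper" page.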
    def get_wsj_index(self, browser):
        # return self.test_wsj_index()
        ans = {}
        root = self.index_to_soup('http://online.wsj.com/itp/today')

        for span in CSSSelect('span.date-date')(root):
            if span.text:
                self.timefmt = span.text
                break

        for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
            href = a.get('href')
            try:
                ans['cover'] = browser.download_file(href)
            except NotAFile:
                break
            break

        feeds = ans['index'] = []
        for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
            if '/itp/' not in a.get('href', ''):
                continue
            pageone = a.get('href').endswith('pageone')
            if pageone:
                title = 'Front Section'
                url = self.abs_wsj_url(a.get('href'))
                self.wsj_add_feed(feeds, title, url)
                title = "What's News"
                url = url.replace('pageone', 'whatsnews')
                self.wsj_add_feed(feeds, title, url)
            else:
                title = self.tag_to_string(a)
                url = self.abs_wsj_url(a.get('href'))
                self.wsj_add_feed(feeds, title, url)
        return ans
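
    # A canned index that is handy when debugging article processing: have
    # get_wsj_index() return this (see the commented-out call above) to skip
    # fetching and parsing the live index page.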
    def test_wsj_index(self):
        return {'index': [
            ('Testing', [
                {'title': 'Article One',
                 'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
                {'title': 'Article Two',
                 'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'},  # noqa
                {'title': 'Article Three',
                 'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
            ]),
        ]}
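
# To try this recipe from the command line (assuming the calibre command-line
# tools are installed), run:
#   ebook-convert wsj_free.recipe output.epub --test
# where --test limits the download to a couple of articles from a couple of
# feeds, for quick iteration during development.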