calibre/recipes/wsj_free.recipe

#!/usr/bin/env  python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe
import copy

class WallStreetJournal(BasicNewsRecipe):
    """Recipe that downloads the freely readable part of the Wall Street Journal.

    Sections are discovered from the index page at http://online.wsj.com/itp
    (see parse_index); only articles whose full text is available without a
    subscription are fetched completely, as stated in ``description``.
    """

    title = 'Wall Street Journal (free)'
    __author__ = 'Kovid Goyal, Sujata Raman, Joshua Oster-Morris, Starson17'
    description = '''News and current affairs. This recipe only fetches complete
    versions of the articles that are available free on the wsj.com website.
    To get the rest of the articles, subscribe to the WSJ and use the other WSJ
    recipe.'''
    language = 'en'
    # Fixed cover image, hosted on a third-party site (dealbreaker.com).
    cover_url           = 'http://dealbreaker.com/images/thumbs/Wall%20Street%20Journal%20A1.JPG'
    # NOTE(review): presumably high enough that no section is ever truncated.
    max_articles_per_feed = 1000
    # Default date suffix; parse_index() replaces it when the index page
    # carries a 'date-date' span.
    timefmt  = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    # NOTE(review): presumably drops articles whose URL was already seen in
    # another section -- confirm against the calibre recipe API docs.
    ignore_duplicate_articles = {'url'}

    # Tag specifications for the parts of an article page to keep: <h1>,
    # sub-headline <h2>, the author <span> and the two known article body
    # containers.
    keep_only_tags = [
        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
        dict(name='span', itemprop='author', rel='author'),
        dict(name='article', id='articleBody'),
        dict(name='div', id='article_story_body'),
    ]
    # Tag specifications for elements to strip: inset buttons / tip boxes and
    # <span>s carrying both a data-country-code and a data-ticker-code
    # attribute.
    remove_tags = [
        dict(attrs={'class':['insetButton', 'insettipBox']}),
        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
    ]

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def preprocess_html(self, soup):
        # Remove thumbnail for zoomable images
        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
            img = div.find('img')
            if img is not None:
                img.extract()

        return soup

    def abs_wsj_url(self, href):
        if not href.startswith('http'):
            href = 'http://online.wsj.com' + href
        return href

    def wsj_get_index(self):
        return self.index_to_soup('http://online.wsj.com/itp')

    def wsj_add_feed(self,feeds,title,url):
        self.log('Found section:', title)
        try:
            if url.endswith('whatsnews'):
                articles = self.wsj_find_wn_articles(url)
            else:
                articles = self.wsj_find_articles(url)
        except:
            articles = []
        if articles:
            feeds.append((title, articles))
        return feeds

    def parse_index(self):
        soup = self.wsj_get_index()

        date = soup.find('span', attrs={'class':'date-date'})
        if date is not None:
            self.timefmt = ' [%s]'%self.tag_to_string(date)

        feeds = []
        div = soup.find('div', attrs={'class':'itpHeader'})
        div = div.find('ul', attrs={'class':'tab'})
        for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
            pageone = a['href'].endswith('pageone')
            if pageone:
                title = 'Front Section'
                url = self.abs_wsj_url(a['href'])
                feeds = self.wsj_add_feed(feeds,title,url)
                title = 'What''s News'
                url = url.replace('pageone','whatsnews')
                feeds = self.wsj_add_feed(feeds,title,url)
            else:
                title = self.tag_to_string(a)
                url = self.abs_wsj_url(a['href'])
                feeds = self.wsj_add_feed(feeds,title,url)

        for li in soup.findAll('li', attrs={'class':'ahed_listitem'}):
            h2 = li.find('h2')
            if h2 is None:
                continue
            a = h2.find('a', href=True)
            if a is None:
                continue
            url = a['href']
            title = self.tag_to_string(a)
            p = h2.findNextSibling('p')
            if p is not None:
                desc = self.tag_to_string(p)
            else:
                desc = ''
            if feeds:
                feeds[0][1].append({'title':title, 'url':url, 'description':desc, 'date':''})
        return feeds

    def wsj_find_wn_articles(self, url):
        soup = self.index_to_soup(url)
        articles = []

        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
        if whats_news is not None:
            for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
                container = a.findParent(['p'])
                meta = a.find(attrs={'class':'meta_sectionName'})
                if meta is not None:
                    meta.extract()
                title = self.tag_to_string(a).strip()
                url = a['href']
                desc = ''
                if container is not None:
                    desc = self.tag_to_string(container)

                articles.append({'title':title, 'url':url,
                    'description':desc, 'date':''})

                self.log('\tFound WN article:', title)

        return articles

    def wsj_find_articles(self, url):
        soup = self.index_to_soup(url)

        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
        if whats_news is not None:
            whats_news.extract()

        articles = []

        flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
        if flavorarea is not None:
            flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
            if flavorstory is not None:
                flavorstory['class'] = 'mjLinkItem'
                metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
                if metapage is not None:
                    flavorstory.append(copy.copy(metapage))  # metapage should always be A1 because that should be first on the page

        for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
            container = a.findParent(['li', 'div'])
            meta = a.find(attrs={'class':'meta_sectionName'})
            if meta is not None:
                meta.extract()
                meta = self.tag_to_string(meta).strip()
            if meta:
                title = self.tag_to_string(a).strip() + ' [%s]'%meta
            else:
                title = self.tag_to_string(a).strip()
            url = self.abs_wsj_url(a['href'])
            desc = ''
            for p in container.findAll('p'):
                desc = self.tag_to_string(p)
                if 'Subscriber Content' not in desc:
                    break

            articles.append({'title':title, 'url':url,
                'description':desc, 'date':''})

            self.log('\tFound article:', title)

        return articles