# calibre/recipes/haaretz_en.recipe

__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.haaretz.com
'''

import re
from calibre import strftime
from time import gmtime
from calibre.web.feeds.news import BasicNewsRecipe

class HaaretzPrint_en(BasicNewsRecipe):
    """Calibre news recipe for the English print edition of Haaretz.

    The section listings under ``/print-edition/`` on www.haaretz.com are
    HTML pages rather than syndication feeds, so :meth:`parse_index` scrapes
    them for headline links. Every article URL is rewritten by
    :meth:`print_version` to point at the site's print-page endpoint.
    """

    title                 = 'Haaretz - print edition'
    __author__            = 'Darko Miletic'
    description           = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
    publisher             = 'Haaretz'
    category              = "news, Haaretz, Israel news, Israel newspapers, Israel business news, Israel financial news, Israeli news,Israeli newspaper, Israeli newspapers, news from Israel, news in Israel, news Israel, news on Israel, newspaper Israel, Israel sports news, Israel diplomacy news"
    oldest_article        = 2
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf8'
    use_embedded_content  = False
    language              = 'en_IL'
    publication_type      = 'newspaper'
    # Site root: section pages, relative article links, the masthead image
    # and the print-page endpoint are all built from it.
    PREFIX                = 'http://www.haaretz.com'
    masthead_url          = PREFIX + '/images/logos/logoGrey.gif'
    extra_css             = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '

    # Replace whatever sits between </body> and </html> with nothing, so
    # trailing markup after the body is discarded.
    preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]

    # Metadata handed to the conversion step; reuses the values above.
    conversion_options = {
                          'comment'  : description
                        , 'tags'     : category
                        , 'publisher': publisher
                        , 'language' : language
                        }

    keep_only_tags    = [dict(attrs={'id':'threecolumns'})]
    remove_attributes = ['width','height']
    remove_tags       = [
                           dict(name=['iframe','link','object','embed'])
                          ,dict(name='div',attrs={'class':'rightcol'})
                        ]


    # (section title, section index page) pairs consumed by parse_index().
    feeds = [
              (u'News'          , PREFIX + u'/print-edition/news'         )
             ,(u'Opinion'       , PREFIX + u'/print-edition/opinion'      )
             ,(u'Business'      , PREFIX + u'/print-edition/business'     )
             ,(u'Real estate'   , PREFIX + u'/print-edition/real-estate'  )
             ,(u'Sports'        , PREFIX + u'/print-edition/sports'       )
             ,(u'Travel'        , PREFIX + u'/print-edition/travel'       )
             ,(u'Books'         , PREFIX + u'/print-edition/books'        )
             ,(u'Food & Wine'   , PREFIX + u'/print-edition/food-wine'    )
             ,(u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure' )
             ,(u'Features'      , PREFIX + u'/print-edition/features'     )
            ]


    def print_version(self, url):
        """Return the print-page URL for the article at ``url``.

        The last path segment of ``url`` is appended to the site's
        ``/misc/article-print-page/`` endpoint. A trailing slash is ignored
        so that it cannot produce an empty article identifier.
        """
        article = url.rstrip('/').rpartition('/')[2]
        return self.PREFIX + '/misc/article-print-page/' + article

    def parse_index(self):
        """Scrape every section page listed in ``feeds`` for its articles.

        Returns a list of ``(section title, articles)`` tuples. Each article
        is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``. Listing entries without a usable headline link are
        skipped instead of aborting the whole download.
        """
        # The listing pages are not read for a publication date, so every
        # article is stamped with the (UTC) time of this run; compute it once.
        stamp = strftime('%a, %d %b %Y %H:%M:%S +0000', gmtime())
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll(attrs={'class':'text'}):
                headline = item.find('span',attrs={'class':'h3 font-weight-normal'})
                if headline is None:
                    continue
                link = headline.a
                href = link.get('href') if link is not None else None
                if not href:
                    # Headline span without an anchor/href: nothing to fetch.
                    continue
                # Only site-relative links need the prefix; prepending it to
                # an absolute link would produce a malformed URL.
                if href.startswith(('http://', 'https://')):
                    url = href
                else:
                    url = self.PREFIX + href
                teaser = item.find('p')
                summary = self.tag_to_string(teaser) if teaser is not None else ''
                articles.append({
                                      'title'      :self.tag_to_string(link)
                                     ,'date'       :stamp
                                     ,'url'        :url
                                     ,'description':summary
                                    })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds


    def preprocess_html(self, soup):
        """Remove the inline ``style`` attribute from every tag in ``soup``.

        The same (mutated) ``soup`` object is returned.
        """
        for item in soup.findAll(style=True):
            del item['style']
        return soup