Fix #1008757 (Updated recipe for Haaretz)

2025-07-09 03:04:10 -04:00 · 2012-06-05 09:25:59 +05:30 · 2012-06-05 09:25:59 +05:30 · 7e4efc5e41
commit 7e4efc5e41
parent 8d48adcd49
1 changed files with 44 additions and 50 deletions
--- a/recipes/haaretz_en.recipe
+++ b/recipes/haaretz_en.recipe
@@ -1,16 +1,15 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.haaretz.com
 '''
 import re
-from calibre import strftime
+import urllib
 from time import gmtime
 from calibre.web.feeds.news import BasicNewsRecipe
-class HaaretzPrint_en(BasicNewsRecipe):
+class Haaretz_en(BasicNewsRecipe):
-    title                 = 'Haaretz - print edition'
+    title                 = 'Haaretz'
    __author__            = 'Darko Miletic'
    description           = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
    publisher             = 'Haaretz'
@@ -21,10 +20,16 @@ class HaaretzPrint_en(BasicNewsRecipe):
    encoding              = 'utf8'
    use_embedded_content  = False
    language              = 'en_IL'
    needs_subscription    = True
    remove_empty_feeds    = True
    publication_type      = 'newspaper'
    PREFIX                = 'http://www.haaretz.com'
-    masthead_url          = PREFIX + '/images/logos/logoGrey.gif'
+    masthead_url          = PREFIX + '/images/logos/HaaretzLogo.gif'
-    extra_css             = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
+    extra_css             = """
                                body{font-family: Verdana,Arial,Helvetica,sans-serif }
                                h1, .articleBody {font-family: Georgia, serif}
                                .authorBar {font-size: small}
                            """
    preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]
@@ -44,53 +49,42 @@ class HaaretzPrint_en(BasicNewsRecipe):
    feeds = [
-              (u'News'          , PREFIX + u'/print-edition/news'         )
+              (u'Headlines'             , 'http://feeds.feedburner.com/haaretz/LBao'        )
-             ,(u'Opinion'       , PREFIX + u'/print-edition/opinion'      )
+             ,(u'Opinion'               , 'http://feeds.feedburner.com/haaretz/opinions'    )
-             ,(u'Business'      , PREFIX + u'/print-edition/business'     )
+             ,(u'Defence and diplomacy' , 'http://feeds.feedburner.com/DefenseAndDiplomacy' )
-             ,(u'Real estate'   , PREFIX + u'/print-edition/real-estate'  )
+             ,(u'National'              , 'http://feeds.feedburner.com/haaretz/National'    )
-             ,(u'Sports'        , PREFIX + u'/print-edition/sports'       )
+             ,(u'International'         , 'http://feeds.feedburner.com/InternationalRss'    )
-             ,(u'Travel'        , PREFIX + u'/print-edition/travel'       )
+             ,(u'Jewish World'          , 'http://feeds.feedburner.com/JewishWorldRss'      )
-             ,(u'Books'         , PREFIX + u'/print-edition/books'        )
+             ,(u'Business'              , 'http://feeds.feedburner.com/BusinessPrintRss'    )
-             ,(u'Food & Wine'   , PREFIX + u'/print-edition/food-wine'    )
+             ,(u'Real Estate'           , 'http://feeds.feedburner.com/RealEstatePrintRss'  )
-             ,(u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure' )
+             ,(u'Features'              , 'http://feeds.feedburner.com/FeaturesPrintRss'    )
-             ,(u'Features'      , PREFIX + u'/print-edition/features'     )
+             ,(u'Arts & Leisure'        , 'http://feeds.feedburner.com/ArtsAndLeisureRss'   )
             ,(u'Books'                 , 'http://www.haaretz.com/cmlink/books-rss-1.264947?localLinksEnabled=false')
             ,(u'Food & Wine'           , 'http://feeds.feedburner.com/FoodAndWinePrintRss' )
             ,(u'Sports'                , 'http://feeds.feedburner.com/haaretz/Sport'       )
            ]
    def get_browser(self):
        '''
        Return the browser used for downloads, signing in when credentials are set.

        The site front page (``PREFIX``) is always opened first. When both
        ``self.username`` and ``self.password`` are not None, the sign-in
        fields are URL-encoded and sent to the Haaretz SSO sign-in address.
        The browser is returned in either case.
        '''
        br = BasicNewsRecipe.get_browser()
        # Front page is visited before any sign-in attempt
        # (presumably to obtain session cookies -- TODO confirm).
        br.open(self.PREFIX)
        if self.username is not None and self.password is not None:
            # Form fields sent to the SSO endpoint; apart from the user's
            # credentials the values are fixed strings.
            data = urllib.urlencode({ 'cb':'parseEngReply'
                                     ,'newsso':'true'
                                     ,'fromlogin':'true'
                                     ,'layer':'eng_login'
                                     ,'userName':self.username
                                     ,'password':self.password
                                   })
            # NOTE(review): passing ``data`` presumably makes this a POST
            # request -- confirm against the browser class in use.
            br.open('https://sso.haaretz.com/sso/sso/signIn',data)
        return br
    def get_article_url(self, article):
        '''
        Return the address reported by the server for ``article``.

        The URL chosen by the base class is opened with ``self.browser`` and
        the ``geturl()`` value of the response is returned.
        '''
        url = BasicNewsRecipe.get_article_url(self, article)
        # NOTE(review): presumably this resolves the feedburner redirect to the
        # real haaretz.com article address, which print_version() then needs
        # -- confirm.
        return self.browser.open_novisit(url).geturl()
    def print_version(self, url):
        '''
        Return the print-page URL for the article at ``url``.

        Everything after the last ``/`` in ``url`` is appended to the fixed
        ``/misc/article-print-page/`` address. If ``url`` contains no ``/``,
        the whole string is appended.
        '''
        # rpartition yields (head, sep, tail); index 2 is the last path segment.
        article = url.rpartition('/')[2]
        return 'http://www.haaretz.com/misc/article-print-page/' + article
-    def parse_index(self):
+    def preprocess_raw_html(self, raw, url):
-        totalfeeds = []
+       return '<html><head>'+raw[raw.find('</head>'):]
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll(attrs={'class':'text'}):
                sp = item.find('span',attrs={'class':'h3 font-weight-normal'})
                desc = item.find('p')
                description = ''
                if sp:
                    if desc:
                       description = self.tag_to_string(desc)
                    link        = sp.a
                    url         = self.PREFIX + link['href']
                    title       = self.tag_to_string(link)
                    times        = strftime('%a, %d %b %Y %H:%M:%S +0000',gmtime())
                    articles.append({
                                          'title'      :title
                                         ,'date'       :times
                                         ,'url'        :url
                                         ,'description':description
                                        })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds
    def preprocess_html(self, soup):
        '''
        Strip the ``style`` attribute from every element that has one.

        ``soup`` is modified in place and returned.
        '''
        # findAll(style=True) selects elements carrying a style attribute.
        for item in soup.findAll(style=True):
            del item['style']
        return soup