Fix #1008757 (Updated recipe for Haaretz)

2025-08-30 23:00:21 -04:00 · 2012-06-05 09:25:59 +05:30 · 2012-06-05 09:25:59 +05:30 · 7e4efc5e41
commit 7e4efc5e41
parent 8d48adcd49
1 changed files with 44 additions and 50 deletions
--- a/recipes/haaretz_en.recipe
+++ b/recipes/haaretz_en.recipe
@@ -1,16 +1,15 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.haaretz.com
 '''

 import re
-from calibre import strftime
-from time import gmtime
+import urllib
 from calibre.web.feeds.news import BasicNewsRecipe

-class HaaretzPrint_en(BasicNewsRecipe):
-    title                 = 'Haaretz - print edition'
+class Haaretz_en(BasicNewsRecipe):
+    title                 = 'Haaretz'
    __author__            = 'Darko Miletic'
    description           = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
    publisher             = 'Haaretz'
@@ -21,10 +20,16 @@ class HaaretzPrint_en(BasicNewsRecipe):
    encoding              = 'utf8'
    use_embedded_content  = False
    language              = 'en_IL'
+    needs_subscription    = True  # get_browser() signs in with self.username/self.password when both are set
+    remove_empty_feeds    = True  # presumably drops feeds that yield no articles -- confirm against the BasicNewsRecipe docs
    publication_type      = 'newspaper'
    PREFIX                = 'http://www.haaretz.com'
-    masthead_url          = PREFIX + '/images/logos/logoGrey.gif'
-    extra_css             = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
+    masthead_url          = PREFIX + '/images/logos/HaaretzLogo.gif'
+    extra_css             = """
+                                body{font-family: Verdana,Arial,Helvetica,sans-serif }
+                                h1, .articleBody {font-family: Georgia, serif}
+                                .authorBar {font-size: small}
+                            """

    preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]

@@ -44,53 +49,42 @@ class HaaretzPrint_en(BasicNewsRecipe):


    feeds = [
-              (u'News'          , PREFIX + u'/print-edition/news'         )
-             ,(u'Opinion'       , PREFIX + u'/print-edition/opinion'      )
-             ,(u'Business'      , PREFIX + u'/print-edition/business'     )
-             ,(u'Real estate'   , PREFIX + u'/print-edition/real-estate'  )
-             ,(u'Sports'        , PREFIX + u'/print-edition/sports'       )
-             ,(u'Travel'        , PREFIX + u'/print-edition/travel'       )
-             ,(u'Books'         , PREFIX + u'/print-edition/books'        )
-             ,(u'Food & Wine'   , PREFIX + u'/print-edition/food-wine'    )
-             ,(u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure' )
-             ,(u'Features'      , PREFIX + u'/print-edition/features'     )
+              (u'Headlines'             , 'http://feeds.feedburner.com/haaretz/LBao'        )
+             ,(u'Opinion'               , 'http://feeds.feedburner.com/haaretz/opinions'    )
+             ,(u'Defence and diplomacy' , 'http://feeds.feedburner.com/DefenseAndDiplomacy' )
+             ,(u'National'              , 'http://feeds.feedburner.com/haaretz/National'    )
+             ,(u'International'         , 'http://feeds.feedburner.com/InternationalRss'    )
+             ,(u'Jewish World'          , 'http://feeds.feedburner.com/JewishWorldRss'      )
+             ,(u'Business'              , 'http://feeds.feedburner.com/BusinessPrintRss'    )
+             ,(u'Real Estate'           , 'http://feeds.feedburner.com/RealEstatePrintRss'  )
+             ,(u'Features'              , 'http://feeds.feedburner.com/FeaturesPrintRss'    )
+             ,(u'Arts & Leisure'        , 'http://feeds.feedburner.com/ArtsAndLeisureRss'   )
+             ,(u'Books'                 , 'http://www.haaretz.com/cmlink/books-rss-1.264947?localLinksEnabled=false')
+             ,(u'Food & Wine'           , 'http://feeds.feedburner.com/FoodAndWinePrintRss' )
+             ,(u'Sports'                , 'http://feeds.feedburner.com/haaretz/Sport'       )
            ]

+    def get_browser(self):  # build the browser and, when credentials are set, sign in to the Haaretz SSO endpoint
+        br = BasicNewsRecipe.get_browser()  # base-class browser, requested without passing this instance
+        br.open(self.PREFIX)  # load the site front page before any sign-in (presumably to pick up session cookies -- TODO confirm)
+        if self.username is not None and self.password is not None:  # sign in only when both credentials were supplied
+            data = urllib.urlencode({ 'cb':'parseEngReply'
+                                     ,'newsso':'true'
+                                     ,'fromlogin':'true'
+                                     ,'layer':'eng_login'
+                                     ,'userName':self.username
+                                     ,'password':self.password
+                                   })
+            br.open('https://sso.haaretz.com/sso/sso/signIn',data)  # submit the URL-encoded sign-in fields; the response is not inspected
+        return br  # returned whether or not a sign-in was attempted
+
+    def get_article_url(self, article):  # return the final URL reached when the feed's article link is opened
+        url = BasicNewsRecipe.get_article_url(self, article)  # link as reported by the base class for this feed entry
+        return self.browser.open_novisit(url).geturl()  # one extra request per article; NOTE(review): no handling here if url is missing or the request fails

    def print_version(self, url):
        article = url.rpartition('/')[2]
        return 'http://www.haaretz.com/misc/article-print-page/' + article

-    def parse_index(self):
-        totalfeeds = []
-        lfeeds = self.get_feeds()
-        for feedobj in lfeeds:
-            feedtitle, feedurl = feedobj
-            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
-            articles = []
-            soup = self.index_to_soup(feedurl)
-            for item in soup.findAll(attrs={'class':'text'}):
-                sp = item.find('span',attrs={'class':'h3 font-weight-normal'})
-                desc = item.find('p')
-                description = ''
-                if sp:
-                    if desc:
-                       description = self.tag_to_string(desc)
-                    link        = sp.a
-                    url         = self.PREFIX + link['href']
-                    title       = self.tag_to_string(link)
-                    times        = strftime('%a, %d %b %Y %H:%M:%S +0000',gmtime())
-                    articles.append({
-                                          'title'      :title
-                                         ,'date'       :times
-                                         ,'url'        :url
-                                         ,'description':description
-                                        })
-            totalfeeds.append((feedtitle, articles))
-        return totalfeeds
-
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+    def preprocess_raw_html(self, raw, url):  # replace everything before '</head>' with a bare '<html><head>' prefix; url is unused
+       return '<html><head>'+raw[raw.find('</head>'):]  # NOTE(review): find() returns -1 when '</head>' is absent, so only the last character of raw is kept