From 75d1de87b51518bbb2242b417fbd6bd636770dc1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 5 Jul 2010 08:06:17 -0600
Subject: [PATCH] Fix #6095 (Haaretz recipe broken)

---
 resources/images/news/haaretz.png           | Bin 0 -> 1184 bytes
 resources/recipes/haaretz_en.recipe         |  97 ++++++++++++++------
 src/calibre/web/feeds/recipes/collection.py |   2 +-
 3 files changed, 69 insertions(+), 30 deletions(-)
 create mode 100644 resources/images/news/haaretz.png
diff --git a/resources/images/news/haaretz.png b/resources/images/news/haaretz.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6cb02c3b64dcd0a73b81fbb52c076b5d5e740af
GIT binary patch
literal 1184
zcmV;R1Yi4!P)<h;3K|Lk000e1NJLTq001Wd001Kh1^@s67cvtg00006VoOIv0RI60
z0RN!9r;`8x010qNS#tmY07w7;07w8v$!k6U000Sga6xAP001Wd001KZvC{wM000BY
zNkl<ZSi|j?O=w(I7>1v7f9EGRGo7iKiH)X>)`S*+P^h?3kkn08>cWMqE(A9!iio%o
zL@9^}LZISCap9t(;Lc5LKwGJlnkq^igQlHmnwT_yCCSX(pL1M<OlC53Co>b<rs%u6
zbMJibdC&dM_kQ;i2cQ2?u}z`fXaP5^IAh}tr9jborZpkT*HX0}FA!8v)EB1-sQFhD
zsMMott;cWVs1fISJ}*D>F!${3Mm4#6fuQ6Ey#3J$CjMR|;Y4`rxyR`2h&In51ea$P
zIr_;ko)?s}P$f=EMQ=7q->#i4?>cH3NTm>(pnrFUsBN_rzr%^v+|%zk@c5f+KQv8{
z%TM#?)I8a=L$Xf5M$9fO@!d}sIrZa3JYS-uVk9@g?BX)L*(4F$+>|3fImht1-#P!=
z1gQ11CpP+==LZyBpL8O|yDuJMUtf19>1QWK`1Gq07M6<GmQhYe(e-gXpZ$Fq-h1f@
zc67$;#xEC23?2WL;UC5*x;{~>x&&&h#58oAn9Y@i<<*^p;KJA>AAE6!rGiV;Hmj;N
zVi|PCBjoZkjE+x-a(sPulrO)zfRd_4SY2^zFm#R1xE)G7F>{q5keG&E_qkGF+a^vt
z7RvG0^dhFAp@md;LlbUf37RI*w8kE;3lX|7fUfH_veawZA)2+48zOG2Ft!q`2+xXY
zs|o%~;-r*#o{ui-OHgW6+*VTCwz&HTdN}slQ|Ozt(c3bK>n|>y=pdcwXnWynDy~p+
znZLS%5RGdpm55=R?Jd)`xbJ_-bM)hrM6CK0Qm&r0_g{IE{@%>>6(^;lQ1S>n@2dr&
zP_*}Lw<U1^x~7G85v9bI&174f@c-ieJL05NEEQZlUxo$-fu!jA1hW32_k)0yqFc6<
zdhk4tqUVQJe3du>l!5@u)H(dfeg^i0`&0MMP9D1ZcA~aH!Sh*j450~1u1_i%<LSYB
z$)w_;9R0mr+_$$IsT8gk)J&vRGeju}xHaYQ#<PPBXNOAUC+9f!`DxDlJWe!XQgVGB
z80g{kBahORayE@uN^$PjafXhcX6*7j@yPl>RGz-1QkaI$J1-n!-=1t!!T^#^l!FiU
zGd?v(E`Nodt^`A`9wzC;nu-@fu<Mp2g9q;7)cHRsl)Un@vI#4tNX8@NE=`fUG({`^
zY3Lf+w1Z)2B%BB*&yM2xLGuIxEW_Z=o;0(IE467AfE0?AqK9b1*iA#nvW(S2sn~rh
zur0Hy<E+0qKqx`H#Yw#qN-8uVh(+4xsKm|ZbK3>~08Tk#BIa`f(EtDdC3HntbYx+4
zWjbSWWnpw>05UK!FfA}QEig4yFgQ9eGCDOkEig7ZFfeMJhUowR03~!qSaf7zbY(hi
yZ)9m^c>ppnF)%GKG%YeVR536*GB`RhHZ3qVIxsNr9fFhq0000<MNUMnLSTYFRwl>*

literal 0
HcmV?d00001

diff --git a/resources/recipes/haaretz_en.recipe b/resources/recipes/haaretz_en.recipe
index 4df6b45a3e..4404624aff 100644
--- a/resources/recipes/haaretz_en.recipe
+++ b/resources/recipes/haaretz_en.recipe
@@ -1,56 +1,95 @@
 __license__   = 'GPL v3'
 __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
-haaretz.com
+www.haaretz.com
 '''
 
+import re
+from calibre import strftime
+from time import gmtime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-class Haaretz_en(BasicNewsRecipe):
-    title                 = 'Haaretz in English'
+class HaaretzPrint_en(BasicNewsRecipe):
+    title                 = 'Haaretz - print edition'
     __author__            = 'Darko Miletic'
-    description           = 'Haaretz.com, the online edition of Haaretz Newspaper in Israel, and analysis from Israel and the Middle East. Haaretz.com provides extensive and in-depth coverage of Israel, the Jewish World and the Middle East, including defense, diplomacy, the Arab-Israeli conflict, the peace process, Israeli politics, Jerusalem affairs, international relations, Iran, Iraq, Syria, Lebanon, the Palestinian Authority, the West Bank and the Gaza Strip, the Israeli business world and Jewish life in Israel and the Diaspora. '
-    publisher             = 'haaretz.com'
-    category              = 'news, politics, Israel'
+    description           = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
+    publisher             = 'Haaretz'
+    category              = "news, Haaretz, Israel news, Israel newspapers, Israel business news, Israel financial news, Israeli news,Israeli newspaper, Israeli newspapers, news from Israel, news in Israel, news Israel, news on Israel, newspaper Israel, Israel sports news, Israel diplomacy news"
     oldest_article        = 2
     max_articles_per_feed = 200
     no_stylesheets        = True
-    encoding              = 'cp1252'
+    encoding              = 'utf8'
     use_embedded_content  = False
     language              = 'en_IL'
     publication_type      = 'newspaper'
-    remove_empty_feeds    = True
-    masthead_url          = 'http://www.haaretz.com/images/logos/logoGrey.gif'
+    PREFIX                = 'http://www.haaretz.com'
+    masthead_url          = PREFIX + '/images/logos/logoGrey.gif'
     extra_css             = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
 
+    preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]
+
     conversion_options = {
-                          'comment'   : description
-                        , 'tags'      : category
-                        , 'publisher' : publisher
-                        , 'language'  : language
+                          'comment'  : description
+                        , 'tags'     : category
+                        , 'publisher': publisher
+                        , 'language' : language
                         }
 
-    remove_tags = [dict(name='div', attrs={'class':['rightcol']}),dict(name='table')]
-    remove_tags_before = dict(name='h1')
-    remove_tags_after  = dict(attrs={'id':'innerArticle'})
-    keep_only_tags     = [dict(attrs={'id':'content'})]
+    keep_only_tags    = [dict(attrs={'id':'threecolumns'})]
+    remove_attributes = ['width','height']
+    remove_tags       = [
+                           dict(name=['iframe','link','object','embed'])
+                          ,dict(name='div',attrs={'class':'rightcol'})
+                        ]
 
 
     feeds = [
-              (u'Opinion'               , u'http://www.haaretz.com/cmlink/opinion-rss-1.209234?localLinksEnabled=false'   )
-             ,(u'Defense and diplomacy' , u'http://www.haaretz.com/cmlink/defense-and-diplomacy-rss-1.208894?localLinksEnabled=false')
-             ,(u'National'              , u'http://www.haaretz.com/cmlink/national-rss-1.208896?localLinksEnabled=false'       )
-             ,(u'International'         , u'http://www.haaretz.com/cmlink/international-rss-1.208898?localLinksEnabled=false'      )
-             ,(u'Jewish World'          , u'http://www.haaretz.com/cmlink/jewish-world-rss-1.209085?localLinksEnabled=false'        )
-             ,(u'Business'              , u'http://www.haaretz.com/cmlink/business-print-rss-1.264904?localLinksEnabled=false'     )
-             ,(u'Real Estate'           , u'http://www.haaretz.com/cmlink/real-estate-print-rss-1.264977?localLinksEnabled=false'      )
-             ,(u'Features'              , u'http://www.haaretz.com/cmlink/features-print-rss-1.264912?localLinksEnabled=false'          )
-             ,(u'Arts and leisure'      , u'http://www.haaretz.com/cmlink/arts-and-leisure-rss-1.286090?localLinksEnabled=false'       )
-             ,(u'Books'                 , u'http://www.haaretz.com/cmlink/books-rss-1.264947?localLinksEnabled=false'         )
-             ,(u'Food and Wine'         , u'http://www.haaretz.com/cmlink/food-and-wine-print-rss-1.265034?localLinksEnabled=false'      )
-             ,(u'Sports'                , u'http://www.haaretz.com/cmlink/sports-rss-1.286092?localLinksEnabled=false'          )
+              (u'News'          , PREFIX + u'/print-edition/news'         )
+             ,(u'Opinion'       , PREFIX + u'/print-edition/opinion'      )
+             ,(u'Business'      , PREFIX + u'/print-edition/business'     )
+             ,(u'Real estate'   , PREFIX + u'/print-edition/real-estate'  )
+             ,(u'Sports'        , PREFIX + u'/print-edition/sports'       )
+             ,(u'Travel'        , PREFIX + u'/print-edition/travel'       )
+             ,(u'Books'         , PREFIX + u'/print-edition/books'        )
+             ,(u'Food & Wine'   , PREFIX + u'/print-edition/food-wine'    )
+             ,(u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure' )
+             ,(u'Features'      , PREFIX + u'/print-edition/features'     )
             ]
 
+
+    def print_version(self, url):  # swap an article URL for its printer-friendly page
+        article = url.rpartition('/')[2]  # text after the last '/' identifies the article
+        return self.PREFIX + '/misc/article-print-page/' + article  # reuse PREFIX, not a second hard-coded host
+
+    def parse_index(self):
+        """Build the index by scraping each print-edition section page.
+
+        Returns a list of (feed title, list of article dicts) tuples.
+        """
+        totalfeeds = []
+        # Every article is stamped with the fetch time (no date is parsed from the page), so compute it once.
+        times = strftime('%a, %d %b %Y %H:%M:%S +0000', gmtime())
+        for feedtitle, feedurl in self.get_feeds():
+            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            articles = []
+            soup = self.index_to_soup(feedurl)
+            for item in soup.findAll(attrs={'class':'text'}):
+                sp = item.find('span',attrs={'class':'h3 font-weight-normal'})
+                link = sp.a if sp is not None else None
+                # Skip teasers without a usable headline link instead of raising TypeError/KeyError.
+                if link is None or not link.get('href'):
+                    continue
+                desc = item.find('p')
+                articles.append({
+                                  'title'      :self.tag_to_string(link)
+                                 ,'date'       :times
+                                 ,'url'        :self.PREFIX + link['href']
+                                 ,'description':self.tag_to_string(desc) if desc is not None else ''
+                                })
+            totalfeeds.append((feedtitle, articles))
+        return totalfeeds
+
+
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
diff --git a/src/calibre/web/feeds/recipes/collection.py b/src/calibre/web/feeds/recipes/collection.py
index 9baebf9900..1dd19dc524 100644
--- a/src/calibre/web/feeds/recipes/collection.py
+++ b/src/calibre/web/feeds/recipes/collection.py
@@ -22,7 +22,7 @@ E = ElementMaker(namespace=NS, nsmap={None:NS})
 
 def iterate_over_builtin_recipe_files():
     exclude = ['craigslist', 'iht', 'outlook_india', 'toronto_sun',
-            'indian_express', 'india_today']
+            'indian_express', 'india_today', 'livemint']
     d = os.path.dirname
     base = os.path.join(d(d(d(d(d(d(os.path.abspath(__file__))))))), 'resources', 'recipes')
     for x in os.walk(base):