Improve The Marker

2025-08-30 23:00:21 -04:00 · 2011-04-27 13:00:49 -06:00 · 2011-04-27 13:00:49 -06:00 · 631afb7eac
commit 631afb7eac
parent 92de7e1807
1 changed file with 31 additions and 35 deletions
--- a/recipes/the_marker.recipe
+++ b/recipes/the_marker.recipe
@@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    description   = 'TheMarker Financial News in Hebrew'
-    __author__            = 'TonyTheBookworm, Marbs'
+    __author__            = 'Marbs'
    cover_url      = 'http://static.ispot.co.il/wp-content/upload/2009/09/themarker.jpg'
    title          = u'TheMarker'
    language              = 'he'
@@ -11,42 +11,38 @@ class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    remove_javascript     = True
    timefmt        = '[%a, %d %b, %Y]'
    oldest_article = 1
-    remove_tags = [dict(name='tr', attrs={'bgcolor':['#738A94']})          ]
-    max_articles_per_feed = 10
+    keep_only_tags =dict(name='div', attrs={'id':'content'})
+    remove_attributes = ['width','float','margin-left']
+    no_stylesheets        = True
+    remove_tags = [dict(name='div', attrs={'class':['social-nav article-social-nav','prsnlArticleEnvelope','cb']}) ,
+                            dict(name='a', attrs={'href':['/misc/mobile']})  ,
+                            dict(name='span', attrs={'class':['post-summ']}) ]
+    max_articles_per_feed = 100
    extra_css='body{direction: rtl;} .article_description{direction: rtl; } a.article{direction: rtl; } .calibre_feed_description{direction: rtl; }'
-    feeds          = [(u'Head Lines', u'http://www.themarker.com/tmc/content/xml/rss/hpfeed.xml'),
-                      (u'TA Market', u'http://www.themarker.com/tmc/content/xml/rss/sections/marketfeed.xml'),
-                      (u'Real Estate', u'http://www.themarker.com/tmc/content/xml/rss/sections/realEstaterfeed.xml'),
-                      (u'Wall Street & Global', u'http://www.themarker.com/tmc/content/xml/rss/sections/wallsfeed.xml'),
-                      (u'Law', u'http://www.themarker.com/tmc/content/xml/rss/sections/lawfeed.xml'),
-                      (u'Media', u'http://www.themarker.com/tmc/content/xml/rss/sections/mediafeed.xml'),
-                      (u'Consumer', u'http://www.themarker.com/tmc/content/xml/rss/sections/consumerfeed.xml'),
-                      (u'Career', u'http://www.themarker.com/tmc/content/xml/rss/sections/careerfeed.xml'),
-                      (u'Car', u'http://www.themarker.com/tmc/content/xml/rss/sections/carfeed.xml'),
-                      (u'High Tech', u'http://www.themarker.com/tmc/content/xml/rss/sections/hightechfeed.xml'),
-                      (u'Investor Guide', u'http://www.themarker.com/tmc/content/xml/rss/sections/investorGuidefeed.xml')]
+    feeds          = [(u'Head Lines', u'http://www.themarker.com/cmlink/1.144'),
+                      (u'TA Market', u'http://www.themarker.com/cmlink/1.243'),
+                      (u'Real Estate', u'http://www.themarker.com/cmlink/1.605656'),
+                      (u'Global', u'http://www.themarker.com/cmlink/1.605658'),
+                      (u'Wall Street', u'http://www.themarker.com/cmlink/1.613713'),
+                      (u'SmartPhone', u'http://www.themarker.com/cmlink/1.605661'),
+                      (u'Law', u'http://www.themarker.com/cmlink/1.605664'),
+                      (u'Media', u'http://www.themarker.com/cmlink/1.605660'),
+                      (u'Consumer', u'http://www.themarker.com/cmlink/1.605662'),
+                      (u'Career', u'http://www.themarker.com/cmlink/1.605665'),
+                      (u'Car', u'http://www.themarker.com/cmlink/1.605663'),
+                      (u'High Tech', u'http://www.themarker.com/cmlink/1.605659'),
+                      (u'Small Business', u'http://www.themarker.com/cmlink/1.605666')]

    def print_version(self, url):
-        split1 = url.split("=")
-        weblinks = url
+        #split1 = url.split("/")
+        #print_url='http://www.themarker.com/misc/article-print-page/'+split1[-1]
+        txt=url

-        if weblinks is not None:
-            for link in weblinks:
-                #---------------------------------------------------------
-                #here we need some help with some regexpressions
-                #we are trying to find it.themarker.com in a url
-                #-----------------------------------------------------------
-                re1='.*?'   # Non-greedy match on filler
-                re2='(it\\.themarker\\.com)'    # Fully Qualified Domain Name 1
-                rg = re.compile(re1+re2,re.IGNORECASE|re.DOTALL)
-                m = rg.search(url)
+        re1='.*?'	# Non-greedy match on filler
+        re2='(tv)'	# Word 1

-
-                if m:
-                 split2 = url.split("article/")
-                 print_url = 'http://it.themarker.com/tmit/PrintArticle/' + split2[1]
-
-                else:
-                    print_url = 'http://www.themarker.com/ibo/misc/printFriendly.jhtml?ElementId=%2Fibo%2Frepositories%2Fstories%2Fm1_2000%2F' + split1[1]+'.xml'
-
-        return print_url
+        rg = re.compile(re1+re2,re.IGNORECASE|re.DOTALL)
+        m = rg.search(txt)
+        if m:
+            #print 'bad link'
+            return 1