Hindu Business Line by Dhiru

2026-06-06 14:05:21 -04:00 · 2010-03-31 08:10:26 +05:30
parent 9b8edf21a2
commit b3007d8410
2 changed files with 54 additions and 0 deletions
@@ -0,0 +1,53 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheHindu(BasicNewsRecipe):  # calibre news recipe for 'The Business Line'; NOTE(review): class name says TheHindu but the title is The Business Line
+    title                 = u'The Business Line'
+    language = 'en_IN'
+
+    oldest_article        = 7  # presumably the maximum article age in days -- confirm against BasicNewsRecipe
+    __author__            = 'Dhiru'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+
+    remove_tags_before = {'name':'font', 'class':'storyhead'}  # tag spec for <font class="storyhead">, the same class extra_css styles below
+    preprocess_regexps = [  # (compiled pattern, replacement callable) pairs
+                (re.compile(r'<!-- story ends -->.*', re.DOTALL),  # DOTALL lets '.*' span newlines: matches from the marker to the end of the text
+                 lambda match: '</body></html>'),  # replacement ignores the match and returns only the closing tags
+                          ]
+    extra_css = '''
+                .storyhead{font-family:Arial,Helvetica,sans-serif; font-size:large; color:#000099;}
+                body{font-family:Verdana,Arial,Helvetica,sans-serif; font-size:x-small; text-align:left;}
+                '''  # CSS rules for the .storyhead class and for the body text
+    feeds          = [  # (section title, RSS feed URL) pairs
+     (u'Main - Latest News', u'http://www.thehindubusinessline.com/rss/blnus.xml'),
+       (u'Main - Front Page', u'http://www.thehindubusinessline.com/rss/14hdline.xml'),
+      (u'Main - Corporate', u'http://www.thehindubusinessline.com/rss/02hdline.xml'),
+      (u'Main - Market', u'http://www.thehindubusinessline.com/rss/05hdline.xml'),
+      (u'Main - Opinion', u'http://www.thehindubusinessline.com/rss/04hdline.xml'),
+      (u'Main - Infotech', u'http://www.thehindubusinessline.com/rss/15hdline.xml'),
+      (u'Main - Marketing', u'http://www.thehindubusinessline.com/rss/19hdline.xml'),
+      (u'Main - Money & banking',
+       u'http://www.thehindubusinessline.com/rss/06hdline.xml'),
+      (u'Main - Agri & Commodities', u'http://www.thehindubusinessline.com/rss/07hdline.xml'),
+      (u'Industry',
+       u'http://www.thehindubusinessline.com/rss/03hdline.xml'),
+      (u'Logistic',
+       u'http://www.thehindubusinessline.com/rss/09hdline.xml'),
+      (u'Result', u'http://www.thehindubusinessline.com/rss/26hdline.xml'),
+      (u'Government',
+       u'http://www.thehindubusinessline.com/rss/27hdline.xml'),
+      (u'Investment World',
+       u'http://www.thehindubusinessline.com/rss/iw20hdline.xml'),
+      (u'Supplement - Life',
+       u'http://www.thehindubusinessline.com/rss/lf10hdline.xml')
+      ]
+
+    def postprocess_html(self, soup, first_fetch):  # renames layout tags in soup to <div>; first_fetch is not used here
+        for t in soup.findAll(['table', 'tr', 'td','center']):  # every table, tr, td and center element found in soup
+            t.name = 'div'  # only the tag name is reassigned; nothing is removed from the tree here
+        return soup  # the same soup object that was passed in, modified in place
@@ -320,6 +320,7 @@ class MobiReader(object):
            from lxml.html import soupparser
            self.log.warning('Malformed markup, parsing using BeautifulSoup')
            try:
+                self.processed_html = self.processed_html.replace('</</', '</')
                root = soupparser.fromstring(self.processed_html)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')