diff --git a/resources/recipes/hindu_business_line.recipe b/resources/recipes/hindu_business_line.recipe
new file mode 100644
index 0000000000..74cff3f068
--- /dev/null
+++ b/resources/recipes/hindu_business_line.recipe
@@ -0,0 +1,63 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal '
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheHindu(BasicNewsRecipe):
+    '''Recipe that fetches The Business Line from its RSS feeds.'''
+
+    title = u'The Business Line'
+    language = 'en_IN'
+
+    oldest_article = 7
+    __author__ = 'Dhiru'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+
+    # Drop everything before the <font class="storyhead"> element.
+    remove_tags_before = {'name':'font', 'class':'storyhead'}
+
+    # Cut the page at the end-of-story marker and close the document.
+    # An unanchored r'.*' with re.DOTALL replaced by '' would erase the
+    # entire page, leaving every article empty.
+    # NOTE(review): marker comment and closing tags look like they were
+    # stripped from this patch as markup; confirm against the live pages.
+    preprocess_regexps = [
+        (re.compile(r'<!-- story ends -->.*', re.DOTALL),
+         lambda match: '</body></html>'),
+    ]
+
+    extra_css = '''
+        .storyhead{font-family:Arial,Helvetica,sans-serif; font-size:large; color:#000099;}
+        body{font-family:Verdana,Arial,Helvetica,sans-serif; font-size:x-small; text-align:left;}
+    '''
+
+    # (section title, RSS URL) pairs, one per feed.
+    feeds = [
+        (u'Main - Latest News', u'http://www.thehindubusinessline.com/rss/blnus.xml'),
+        (u'Main - Front Page', u'http://www.thehindubusinessline.com/rss/14hdline.xml'),
+        (u'Main - Corporate', u'http://www.thehindubusinessline.com/rss/02hdline.xml'),
+        (u'Main - Market', u'http://www.thehindubusinessline.com/rss/05hdline.xml'),
+        (u'Main - Opinion', u'http://www.thehindubusinessline.com/rss/04hdline.xml'),
+        (u'Main - Infotech', u'http://www.thehindubusinessline.com/rss/15hdline.xml'),
+        (u'Main - Marketing', u'http://www.thehindubusinessline.com/rss/19hdline.xml'),
+        (u'Main - Money & banking', u'http://www.thehindubusinessline.com/rss/06hdline.xml'),
+        (u'Main - Agri & Commodities', u'http://www.thehindubusinessline.com/rss/07hdline.xml'),
+        (u'Industry', u'http://www.thehindubusinessline.com/rss/03hdline.xml'),
+        (u'Logistic', u'http://www.thehindubusinessline.com/rss/09hdline.xml'),
+        (u'Result', u'http://www.thehindubusinessline.com/rss/26hdline.xml'),
+        (u'Government', u'http://www.thehindubusinessline.com/rss/27hdline.xml'),
+        (u'Investment World', u'http://www.thehindubusinessline.com/rss/iw20hdline.xml'),
+        (u'Supplement - Life', u'http://www.thehindubusinessline.com/rss/lf10hdline.xml'),
+    ]
+
+    def postprocess_html(self, soup, first_fetch):
+        '''Rename table layout tags to div so the article flows as blocks.
+
+        soup is modified in place and returned; first_fetch is unused.
+        '''
+        for t in soup.findAll(['table', 'tr', 'td','center']):
+            t.name = 'div'
+        return soup
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index c4829ec22f..e84ae2547c 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -320,6 +320,7 @@ class MobiReader(object): from lxml.html import soupparser self.log.warning('Malformed markup, parsing using BeautifulSoup') try: + self.processed_html = self.processed_html.replace('