From a70957c75e5abcb8355a5632f8e6d3e68d5f9728 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 1 Dec 2009 23:51:05 +0000 Subject: [PATCH] New recipe for Money Control by kwetal --- resources/recipes/moneycontrol.recipe | 57 +++++++++++++++++++++++++++ src/calibre/ebooks/oeb/base.py | 5 +++ 2 files changed, 62 insertions(+) create mode 100644 resources/recipes/moneycontrol.recipe diff --git a/resources/recipes/moneycontrol.recipe b/resources/recipes/moneycontrol.recipe new file mode 100644 index 0000000000..8be0b3a1a6 --- /dev/null +++ b/resources/recipes/moneycontrol.recipe @@ -0,0 +1,57 @@ +from calibre.web.feeds.news import BasicNewsRecipe +#from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag + +class MoneyControlRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'en_IN' + locale = 'en_IN' + encoding = 'iso-8859-1' + version = 1 + + title = u'Money Control' + publisher = u'moneycontrol.com' + category = u'News, Financial, India' + description = u'Financial news from India' + + oldest_article = 7 + max_articles_per_feed = 100 + use_embedded_content = False + + no_stylesheets = True + remove_javascript = True + + feeds = [] + feeds.append((u'Latest News', u'http://www.moneycontrol.com/rss/latestnews.xml')) + feeds.append((u'All Stories', u'http://www.moneycontrol.com/rss/allstories.xml')) + + def print_version(self, url): + return url.replace('/stocksnews.php?', '/news_print.php?') + '&sr_no=0' + + # The articles contain really horrible html. More than one <html> and <body> section, not properly closed tags, lots and lots of + # <font> tags and some weird markup that crashes the conversion to ebook. Needs some drastic sanitizing + '''def preprocess_html(self, soup): + freshSoup = BeautifulSoup('') + + headline = soup.find('td', attrs = {'class': 'heading'}) + if headline: + h1 = Tag(freshSoup, 'h1') + # Convert to string before adding it to the document! 
+ h1.append(self.tag_to_string(headline)) + freshSoup.body.append(h1) + + for p in soup.findAll('p'): + if p.has_key('class'): + if p['class'] == 'MsoNormal': + # We have some weird pagebreak marker here; this check will not find all of them, however + continue + + para = Tag(freshSoup, 'p') + # Convert to string; this will lose all formatting but also all illegal markup + para.append(self.tag_to_string(p)) + + freshSoup.body.append(para) + + return freshSoup + ''' + diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 56fa48d32f..a16f5f6139 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -861,6 +861,11 @@ class Manifest(object): data = re.compile(r'', re.DOTALL).sub('', data) data = re.sub(r']+?>', '', data) + open('/tmp/t.html', 'wb').write(data.encode('utf-8')) + data = data.replace( + "", + '') + data = data.replace("", '') data = etree.fromstring(data) elif namespace(data.tag) != XHTML_NS: # OEB_DOC_NS, but possibly others