From c4fa7317be81a7eabdf5198efea84b29a2ab0b92 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Dec 2009 11:30:26 -0700 Subject: [PATCH] Conversion pipeline: Decode XML files in a more fault tolerant manner --- resources/recipes/guardian.recipe | 19 +++++++++---------- src/calibre/ebooks/oeb/base.py | 1 + 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/resources/recipes/guardian.recipe b/resources/recipes/guardian.recipe index 1aee9bdf07..f74414a569 100644 --- a/resources/recipes/guardian.recipe +++ b/resources/recipes/guardian.recipe @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' ''' www.guardian.co.uk ''' -#from calibre import strftime +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe class Guardian(BasicNewsRecipe): @@ -16,7 +16,7 @@ class Guardian(BasicNewsRecipe): language = 'en_GB' oldest_article = 7 - max_articles_per_feed = 50 + max_articles_per_feed = 100 remove_javascript = True timefmt = ' [%a, %d %b %Y]' @@ -75,9 +75,6 @@ class Guardian(BasicNewsRecipe): return soup - - -''' def find_sections(self): soup = self.index_to_soup('http://www.guardian.co.uk/theguardian') # find cover pic @@ -113,11 +110,13 @@ class Guardian(BasicNewsRecipe): } def parse_index(self): - feeds = [] - for title, href in self.find_sections(): - feeds.append((title, list(self.find_articles(href)))) - return feeds -''' + try: + feeds = [] + for title, href in self.find_sections(): + feeds.append((title, list(self.find_articles(href)))) + return feeds + except: + raise NotImplementedError diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index b37f51c6b5..dff2dc724b 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -759,6 +759,7 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _parse_xml(self, data): + data = xml_to_unicode(data, strip_encoding_pats=True)[0] parser = etree.XMLParser(recover=True) try: return etree.fromstring(data, parser=parser)