Conversion pipeline: Decode XML files in a more fault-tolerant manner

This commit is contained in:
Kovid Goyal 2009-12-14 11:30:26 -07:00
parent 974b33ca7a
commit c4fa7317be
2 changed files with 10 additions and 10 deletions

View File

@@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
''' '''
www.guardian.co.uk www.guardian.co.uk
''' '''
#from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Guardian(BasicNewsRecipe): class Guardian(BasicNewsRecipe):
@@ -16,7 +16,7 @@ class Guardian(BasicNewsRecipe):
language = 'en_GB' language = 'en_GB'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 50 max_articles_per_feed = 100
remove_javascript = True remove_javascript = True
timefmt = ' [%a, %d %b %Y]' timefmt = ' [%a, %d %b %Y]'
@@ -75,9 +75,6 @@ class Guardian(BasicNewsRecipe):
return soup return soup
'''
def find_sections(self): def find_sections(self):
soup = self.index_to_soup('http://www.guardian.co.uk/theguardian') soup = self.index_to_soup('http://www.guardian.co.uk/theguardian')
# find cover pic # find cover pic
@@ -113,11 +110,13 @@ class Guardian(BasicNewsRecipe):
} }
def parse_index(self): def parse_index(self):
feeds = [] try:
for title, href in self.find_sections(): feeds = []
feeds.append((title, list(self.find_articles(href)))) for title, href in self.find_sections():
return feeds feeds.append((title, list(self.find_articles(href))))
''' return feeds
except:
raise NotImplementedError

View File

@@ -759,6 +759,7 @@ class Manifest(object):
% (self.id, self.href, self.media_type) % (self.id, self.href, self.media_type)
def _parse_xml(self, data): def _parse_xml(self, data):
data = xml_to_unicode(data, strip_encoding_pats=True)[0]
parser = etree.XMLParser(recover=True) parser = etree.XMLParser(recover=True)
try: try:
return etree.fromstring(data, parser=parser) return etree.fromstring(data, parser=parser)