mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: Decode XML files in a more fault tolerant manner
This commit is contained in:
parent
974b33ca7a
commit
c4fa7317be
@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
'''
|
'''
|
||||||
www.guardian.co.uk
|
www.guardian.co.uk
|
||||||
'''
|
'''
|
||||||
#from calibre import strftime
|
from calibre import strftime
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class Guardian(BasicNewsRecipe):
|
class Guardian(BasicNewsRecipe):
|
||||||
@ -16,7 +16,7 @@ class Guardian(BasicNewsRecipe):
|
|||||||
language = 'en_GB'
|
language = 'en_GB'
|
||||||
|
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 50
|
max_articles_per_feed = 100
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
|
|
||||||
timefmt = ' [%a, %d %b %Y]'
|
timefmt = ' [%a, %d %b %Y]'
|
||||||
@ -75,9 +75,6 @@ class Guardian(BasicNewsRecipe):
|
|||||||
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
'''
|
|
||||||
def find_sections(self):
|
def find_sections(self):
|
||||||
soup = self.index_to_soup('http://www.guardian.co.uk/theguardian')
|
soup = self.index_to_soup('http://www.guardian.co.uk/theguardian')
|
||||||
# find cover pic
|
# find cover pic
|
||||||
@ -113,11 +110,13 @@ class Guardian(BasicNewsRecipe):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
feeds = []
|
try:
|
||||||
for title, href in self.find_sections():
|
feeds = []
|
||||||
feeds.append((title, list(self.find_articles(href))))
|
for title, href in self.find_sections():
|
||||||
return feeds
|
feeds.append((title, list(self.find_articles(href))))
|
||||||
'''
|
return feeds
|
||||||
|
except:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -759,6 +759,7 @@ class Manifest(object):
|
|||||||
% (self.id, self.href, self.media_type)
|
% (self.id, self.href, self.media_type)
|
||||||
|
|
||||||
def _parse_xml(self, data):
|
def _parse_xml(self, data):
|
||||||
|
data = xml_to_unicode(data, strip_encoding_pats=True)[0]
|
||||||
parser = etree.XMLParser(recover=True)
|
parser = etree.XMLParser(recover=True)
|
||||||
try:
|
try:
|
||||||
return etree.fromstring(data, parser=parser)
|
return etree.fromstring(data, parser=parser)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user