mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Fix parsing of HTML that has a DOCTYPE declaring it as XHTML but no xmlns attribute. Also coerce downloaded article titles to unicode
This commit is contained in:
parent
14171d419c
commit
cc7b7ebff1
@ -764,7 +764,14 @@ class Manifest(object):
|
||||
# Convert to Unicode and normalize line endings
|
||||
data = self.oeb.decode(data)
|
||||
data = self.oeb.html_preprocessor(data)
|
||||
orig_data = data
|
||||
|
||||
# Remove DOCTYPE declaration as it messes up parsing
|
||||
# In particular, it causes a tostring to insert xmlns
|
||||
# declarations, which messes up the coercing logic
|
||||
idx = data.find('<html')
|
||||
if idx > -1:
|
||||
data = data[idx:]
|
||||
|
||||
# Try with more & more drastic measures to parse
|
||||
def first_pass(data):
|
||||
try:
|
||||
|
@ -25,6 +25,8 @@ class Article(object):
|
||||
entity_to_unicode, self.title)
|
||||
except:
|
||||
pass
|
||||
if not isinstance(self.title, unicode):
|
||||
self.title = self.title.decode('utf-8', 'replace')
|
||||
self.url = url
|
||||
self.author = author
|
||||
if author and not isinstance(author, unicode):
|
||||
|
@ -980,7 +980,7 @@ class BasicNewsRecipe(Recipe):
|
||||
|
||||
def error_in_article_download(self, request, traceback):
|
||||
self.jobs_done += 1
|
||||
self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
|
||||
self.log.error(_(u'Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
|
||||
self.log.debug(traceback)
|
||||
self.log.debug('\n')
|
||||
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
|
||||
|
Loading…
x
Reference in New Issue
Block a user