mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Fix parsing of HTML that has a DOCTYPE declaring it as XHTML but no xmlns attribute. Also coerce downloaded article titles to unicode
This commit is contained in:
parent
14171d419c
commit
cc7b7ebff1
@ -764,7 +764,14 @@ class Manifest(object):
|
|||||||
# Convert to Unicode and normalize line endings
|
# Convert to Unicode and normalize line endings
|
||||||
data = self.oeb.decode(data)
|
data = self.oeb.decode(data)
|
||||||
data = self.oeb.html_preprocessor(data)
|
data = self.oeb.html_preprocessor(data)
|
||||||
orig_data = data
|
|
||||||
|
# Remove DOCTYPE declaration as it messes up parsing
|
||||||
|
# In particular, it causes a tostring to insert xmlns
|
||||||
|
# declarations, which messes up the coercing logic
|
||||||
|
idx = data.find('<html')
|
||||||
|
if idx > -1:
|
||||||
|
data = data[idx:]
|
||||||
|
|
||||||
# Try with more & more drastic measures to parse
|
# Try with more & more drastic measures to parse
|
||||||
def first_pass(data):
|
def first_pass(data):
|
||||||
try:
|
try:
|
||||||
|
@ -25,6 +25,8 @@ class Article(object):
|
|||||||
entity_to_unicode, self.title)
|
entity_to_unicode, self.title)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
if not isinstance(self.title, unicode):
|
||||||
|
self.title = self.title.decode('utf-8', 'replace')
|
||||||
self.url = url
|
self.url = url
|
||||||
self.author = author
|
self.author = author
|
||||||
if author and not isinstance(author, unicode):
|
if author and not isinstance(author, unicode):
|
||||||
|
@ -980,7 +980,7 @@ class BasicNewsRecipe(Recipe):
|
|||||||
|
|
||||||
def error_in_article_download(self, request, traceback):
|
def error_in_article_download(self, request, traceback):
|
||||||
self.jobs_done += 1
|
self.jobs_done += 1
|
||||||
self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
|
self.log.error(_(u'Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
|
||||||
self.log.debug(traceback)
|
self.log.debug(traceback)
|
||||||
self.log.debug('\n')
|
self.log.debug('\n')
|
||||||
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
|
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user