Fix parsing of HTML that has a DOCTYPE declaring it as XHTML but no xmlns attribute. Also coerce downloaded article titles to Unicode.

This commit is contained in:
Kovid Goyal 2009-07-11 14:33:51 -06:00
parent 14171d419c
commit cc7b7ebff1
3 changed files with 11 additions and 2 deletions

View File

@ -764,7 +764,14 @@ class Manifest(object):
# Convert to Unicode and normalize line endings # Convert to Unicode and normalize line endings
data = self.oeb.decode(data) data = self.oeb.decode(data)
data = self.oeb.html_preprocessor(data) data = self.oeb.html_preprocessor(data)
orig_data = data
# Remove DOCTYPE declaration as it messes up parsing
# In particular it causes a tostring to insert xmlns
# declarations, which messes up the coercing logic
idx = data.find('<html')
if idx > -1:
data = data[idx:]
# Try with more & more drastic measures to parse # Try with more & more drastic measures to parse
def first_pass(data): def first_pass(data):
try: try:

View File

@ -25,6 +25,8 @@ class Article(object):
entity_to_unicode, self.title) entity_to_unicode, self.title)
except: except:
pass pass
if not isinstance(self.title, unicode):
self.title = self.title.decode('utf-8', 'replace')
self.url = url self.url = url
self.author = author self.author = author
if author and not isinstance(author, unicode): if author and not isinstance(author, unicode):

View File

@ -980,7 +980,7 @@ class BasicNewsRecipe(Recipe):
def error_in_article_download(self, request, traceback): def error_in_article_download(self, request, traceback):
self.jobs_done += 1 self.jobs_done += 1
self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url)) self.log.error(_(u'Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
self.log.debug(traceback) self.log.debug(traceback)
self.log.debug('\n') self.log.debug('\n')
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title) self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)