From cc7b7ebff165d71039789a525b79b44e8c139e14 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 11 Jul 2009 14:33:51 -0600 Subject: [PATCH] Fix parsing of HTML that has a DOCTYPE declaring it as XHTML but no xmlns attribute. Also coerce downloaded article titles to unicode --- src/calibre/ebooks/oeb/base.py | 9 ++++++++- src/calibre/web/feeds/__init__.py | 2 ++ src/calibre/web/feeds/news.py | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 215e5a65ce..618e28daeb 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -764,7 +764,14 @@ class Manifest(object): # Convert to Unicode and normalize line endings data = self.oeb.decode(data) data = self.oeb.html_preprocessor(data) - orig_data = data + + # Remove DOCTYPE declaration as it messes up parsing + # In particular it causes a tostring to insert xmlns + # declarations, which messes up the coercing logic + idx = data.find('<html') + if idx > -1: + data = data[idx:] + # Try with more & more drastic measures to parse def first_pass(data): try: diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index 14ca98f534..5bf1260df4 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -25,6 +25,8 @@ class Article(object): entity_to_unicode, self.title) except: pass + if not isinstance(self.title, unicode): + self.title = self.title.decode('utf-8', 'replace') self.url = url self.author = author if author and not isinstance(author, unicode): diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 6ca0f8318f..88367ac63e 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -980,7 +980,7 @@ class BasicNewsRecipe(Recipe): def error_in_article_download(self, request, traceback): self.jobs_done += 1 - self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url)) + 
self.log.error(_(u'Failed to download article: %s from %s\n')%(request.article.title, request.article.url)) self.log.debug(traceback) self.log.debug('\n') self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)