From ffd3ac2b21fede695365e3914cd7e92f58444545 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 23 Sep 2007 06:36:37 +0000 Subject: [PATCH] Fix #213 --- src/libprs500/ebooks/lrf/html/convert_from.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 68e2ce03cc..6e41b78285 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -196,10 +196,18 @@ class HTMLConverter(object): raw = open(self.file_name, 'rb').read() if self.pdftohtml: nmassage.extend(HTMLConverter.PDFTOHTML) - raw = unicode(raw, 'utf8', 'replace') - soup = BeautifulSoup(raw, + #raw = unicode(raw, 'utf8', 'replace') + try: + soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=nmassage) + except ConversionError, err: + if 'Failed to coerce to unicode' in str(err): + raw = unicode(raw, 'utf8', 'replace') + soup = BeautifulSoup(raw, + convertEntities=BeautifulSoup.HTML_ENTITIES, + markupMassage=nmassage) + if not self.baen and self.is_baen(soup): self.baen = True self.logger.info('Baen file detected. Re-parsing...')