From 9a79f6f4a3360bdd3c1ab0898ca2275237171227 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 17 Jan 2008 16:11:52 +0000 Subject: [PATCH] Improved character encoding detection for html2lrf --- src/libprs500/ebooks/lrf/html/convert_from.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 5959b752bc..e1908737d8 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -33,7 +33,8 @@ except ImportError: import Image as PILImage from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \ - NavigableString, Declaration, ProcessingInstruction + NavigableString, Declaration, ProcessingInstruction, \ + UnicodeDammit from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ @@ -348,7 +349,7 @@ class HTMLConverter(object): upath = path.encode('utf-8') if isinstance(path, unicode) else path if not os.path.exists(upath): upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names - raw = open(upath, 'rb').read() + raw = UnicodeDammit(open(upath, 'rb').read()).unicode soup = self.preprocess(raw) self.logger.info('\tConverting to BBeB...') self.current_page = None