Improved character encoding detection for html2lrf

2025-09-13 23:58:05 -04:00 · 2008-01-17 16:11:52 +00:00 · 2008-01-17 16:11:52 +00:00 · 9a79f6f4a3
commit 9a79f6f4a3
parent 5a36f6a755
1 changed file with 3 additions and 2 deletions
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -33,7 +33,8 @@ except ImportError:
    import Image as PILImage

 from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
-                            NavigableString, Declaration, ProcessingInstruction
+                            NavigableString, Declaration, ProcessingInstruction, \
+                            UnicodeDammit
 from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
                TextBlock, ImageBlock, JumpButton, CharButton, \
                Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
@@ -348,7 +349,7 @@ class HTMLConverter(object):
        upath = path.encode('utf-8') if isinstance(path, unicode) else path
        if not os.path.exists(upath):
            upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names 
-        raw = open(upath, 'rb').read()
+        raw = UnicodeDammit(open(upath, 'rb').read()).unicode
        soup = self.preprocess(raw)
        self.logger.info('\tConverting to BBeB...')
        self.current_page = None