Work around a bug in pdftohtml that causes it to output invalid UTF-8 encoded documents

2025-07-09 03:04:10 -04:00 · 2008-02-02 23:44:46 +00:00 · 2008-02-02 23:44:46 +00:00 · 50498fcc0e
commit 50498fcc0e
parent 6612be8017
2 changed files with 9 additions and 2 deletions
--- a/src/libprs500/ebooks/BeautifulSoup.py
+++ b/src/libprs500/ebooks/BeautifulSoup.py
@ -1676,7 +1676,7 @@ class UnicodeDammit:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break
-
+        
        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
@ -1804,6 +1804,8 @@ class UnicodeDammit:
            xml_encoding_match = re.compile \
                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
                                 .match(xml_data)
            if xml_encoding_match is None: # By Kovid to use the content-type header in HTML files
                xml_encoding_match = re.compile(r'<meta.*?http-equiv=[\'"]Content-type[\'"].*?content=[\'"].*?charset=(\S+).*?[\'"]', re.IGNORECASE).search(xml_data)
        except:
            xml_encoding_match = None
        if xml_encoding_match:
@ -1814,6 +1816,7 @@ class UnicodeDammit:
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@ -350,7 +350,11 @@ class HTMLConverter(object):
        if not os.path.exists(upath):
            upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names 
        f = open(upath, 'rb')
-        raw = UnicodeDammit(f.read()).unicode
+        raw = f.read()
        if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
            raw = raw.decode('utf-8', 'ignore')
        else:
            raw = UnicodeDammit(raw).unicode
        f.close()
        soup = self.preprocess(raw)
        self.logger.info('\tConverting to BBeB...')