Work around a bug in pdftohtml that causes it to output invalid UTF-8 encoded documents

2025-12-20 03:55:25 -05:00 · 2008-02-02 23:44:46 +00:00 · 2008-02-02 23:44:46 +00:00 · 50498fcc0e
commit 50498fcc0e
parent 6612be8017
2 changed files with 9 additions and 2 deletions
--- a/src/libprs500/ebooks/BeautifulSoup.py
+++ b/src/libprs500/ebooks/BeautifulSoup.py
@@ -1804,6 +1804,8 @@ class UnicodeDammit:
            xml_encoding_match = re.compile \
                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
                                 .match(xml_data)
+            if xml_encoding_match is None: # By Kovid to use the content-type header in HTML files
+                xml_encoding_match = re.compile(r'<meta.*?http-equiv=[\'"]Content-type[\'"].*?content=[\'"].*?charset=(\S+).*?[\'"]', re.IGNORECASE).search(xml_data)
        except:
            xml_encoding_match = None
        if xml_encoding_match:
@@ -1814,6 +1816,7 @@ class UnicodeDammit:
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
+        
        return xml_data, xml_encoding, sniffed_xml_encoding


--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -350,7 +350,11 @@ class HTMLConverter(object):
        if not os.path.exists(upath):
            upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names 
        f = open(upath, 'rb')
-        raw = UnicodeDammit(f.read()).unicode
+        raw = f.read()
+        if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
+            raw = raw.decode('utf-8', 'ignore')
+        else:
+            raw = UnicodeDammit(raw).unicode
        f.close()
        soup = self.preprocess(raw)
        self.logger.info('\tConverting to BBeB...')