From 50498fcc0e89d12bf0f5d9a0ca6291ab0cbf41cb Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 2 Feb 2008 23:44:46 +0000
Subject: [PATCH] Work around bug in pdftohtml that causes it to output invalid
 UTF-8 encoded documents

---
 src/libprs500/ebooks/BeautifulSoup.py         | 5 ++++-
 src/libprs500/ebooks/lrf/html/convert_from.py | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/libprs500/ebooks/BeautifulSoup.py b/src/libprs500/ebooks/BeautifulSoup.py
index 5f23ae5935..628e4f148b 100644
--- a/src/libprs500/ebooks/BeautifulSoup.py
+++ b/src/libprs500/ebooks/BeautifulSoup.py
@@ -1676,7 +1676,7 @@ class UnicodeDammit:
             for proposedEncoding in (documentEncoding, sniffedEncoding):
                 u = self._convertFrom(proposedEncoding)
                 if u: break
-
+        
         # If no luck and we have auto-detection library, try that:
         if not u and chardet and not isinstance(self.markup, unicode):
             u = self._convertFrom(chardet.detect(self.markup)['encoding'])
@@ -1804,6 +1804,8 @@ class UnicodeDammit:
             xml_encoding_match = re.compile \
                                  ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
                                  .match(xml_data)
+            if xml_encoding_match is None: # By Kovid to use the content-type header in HTML files
+                xml_encoding_match = re.compile(r'<meta.*?http-equiv=[\'"]Content-type[\'"].*?content=[\'"].*?charset=(\S+).*?[\'"]', re.IGNORECASE).search(xml_data)
         except:
             xml_encoding_match = None
         if xml_encoding_match:
@@ -1814,6 +1816,7 @@ class UnicodeDammit:
                                  'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                  'utf16', 'u16')):
                 xml_encoding = sniffed_xml_encoding
+        
         return xml_data, xml_encoding, sniffed_xml_encoding
 
 
diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index a058195fdf..4d697001e7 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -350,7 +350,11 @@ class HTMLConverter(object):
         if not os.path.exists(upath):
             upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names 
         f = open(upath, 'rb')
-        raw = UnicodeDammit(f.read()).unicode
+        raw = f.read()
+        if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
+            raw = raw.decode('utf-8', 'ignore')
+        else:
+            raw = UnicodeDammit(raw).unicode
         f.close()
         soup = self.preprocess(raw)
         self.logger.info('\tConverting to BBeB...')