From 50498fcc0e89d12bf0f5d9a0ca6291ab0cbf41cb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 2 Feb 2008 23:44:46 +0000 Subject: [PATCH] Work around a bug in pdftohtml that causes it to output invalid UTF-8 encoded documents --- src/libprs500/ebooks/BeautifulSoup.py | 5 ++++- src/libprs500/ebooks/lrf/html/convert_from.py | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/libprs500/ebooks/BeautifulSoup.py b/src/libprs500/ebooks/BeautifulSoup.py index 5f23ae5935..628e4f148b 100644 --- a/src/libprs500/ebooks/BeautifulSoup.py +++ b/src/libprs500/ebooks/BeautifulSoup.py @@ -1676,7 +1676,7 @@ class UnicodeDammit: for proposedEncoding in (documentEncoding, sniffedEncoding): u = self._convertFrom(proposedEncoding) if u: break - + # If no luck and we have auto-detection library, try that: if not u and chardet and not isinstance(self.markup, unicode): u = self._convertFrom(chardet.detect(self.markup)['encoding']) @@ -1804,6 +1804,8 @@ class UnicodeDammit: xml_encoding_match = re.compile \ ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\ .match(xml_data) + if xml_encoding_match is None: # By Kovid to use the content-type header in HTML files + xml_encoding_match = re.compile(r'