Work around a bug in pdftohtml that causes it to output invalid UTF-8 encoded documents

This commit is contained in:
Kovid Goyal 2008-02-02 23:44:46 +00:00
parent 6612be8017
commit 50498fcc0e
2 changed files with 9 additions and 2 deletions

View File

@ -1676,7 +1676,7 @@ class UnicodeDammit:
for proposedEncoding in (documentEncoding, sniffedEncoding): for proposedEncoding in (documentEncoding, sniffedEncoding):
u = self._convertFrom(proposedEncoding) u = self._convertFrom(proposedEncoding)
if u: break if u: break
# If no luck and we have auto-detection library, try that: # If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode): if not u and chardet and not isinstance(self.markup, unicode):
u = self._convertFrom(chardet.detect(self.markup)['encoding']) u = self._convertFrom(chardet.detect(self.markup)['encoding'])
@ -1804,6 +1804,8 @@ class UnicodeDammit:
xml_encoding_match = re.compile \ xml_encoding_match = re.compile \
('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\ ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
.match(xml_data) .match(xml_data)
if xml_encoding_match is None: # By Kovid to use the content-type header in HTML files
xml_encoding_match = re.compile(r'<meta.*?http-equiv=[\'"]Content-type[\'"].*?content=[\'"].*?charset=(\S+).*?[\'"]', re.IGNORECASE).search(xml_data)
except: except:
xml_encoding_match = None xml_encoding_match = None
if xml_encoding_match: if xml_encoding_match:
@ -1814,6 +1816,7 @@ class UnicodeDammit:
'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')): 'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding return xml_data, xml_encoding, sniffed_xml_encoding

View File

@ -350,7 +350,11 @@ class HTMLConverter(object):
if not os.path.exists(upath): if not os.path.exists(upath):
upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names
f = open(upath, 'rb') f = open(upath, 'rb')
raw = UnicodeDammit(f.read()).unicode raw = f.read()
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
raw = raw.decode('utf-8', 'ignore')
else:
raw = UnicodeDammit(raw).unicode
f.close() f.close()
soup = self.preprocess(raw) soup = self.preprocess(raw)
self.logger.info('\tConverting to BBeB...') self.logger.info('\tConverting to BBeB...')