mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Work around a bug in pdftohtml that causes it to output invalid UTF-8 encoded documents
This commit is contained in:
parent
6612be8017
commit
50498fcc0e
@ -1804,6 +1804,8 @@ class UnicodeDammit:
|
||||
xml_encoding_match = re.compile \
|
||||
('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
|
||||
.match(xml_data)
|
||||
if xml_encoding_match is None: # By Kovid to use the content-type header in HTML files
|
||||
xml_encoding_match = re.compile(r'<meta.*?http-equiv=[\'"]Content-type[\'"].*?content=[\'"].*?charset=(\S+).*?[\'"]', re.IGNORECASE).search(xml_data)
|
||||
except:
|
||||
xml_encoding_match = None
|
||||
if xml_encoding_match:
|
||||
@ -1814,6 +1816,7 @@ class UnicodeDammit:
|
||||
'utf-16', 'utf-32', 'utf_16', 'utf_32',
|
||||
'utf16', 'u16')):
|
||||
xml_encoding = sniffed_xml_encoding
|
||||
|
||||
return xml_data, xml_encoding, sniffed_xml_encoding
|
||||
|
||||
|
||||
|
@ -350,7 +350,11 @@ class HTMLConverter(object):
|
||||
if not os.path.exists(upath):
|
||||
upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names
|
||||
f = open(upath, 'rb')
|
||||
raw = UnicodeDammit(f.read()).unicode
|
||||
raw = f.read()
|
||||
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
|
||||
raw = raw.decode('utf-8', 'ignore')
|
||||
else:
|
||||
raw = UnicodeDammit(raw).unicode
|
||||
f.close()
|
||||
soup = self.preprocess(raw)
|
||||
self.logger.info('\tConverting to BBeB...')
|
||||
|
Loading…
x
Reference in New Issue
Block a user