mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Work around a bug in pdftohtml that causes it to output invalid UTF-8 encoded documents
This commit is contained in:
parent
6612be8017
commit
50498fcc0e
@ -1676,7 +1676,7 @@ class UnicodeDammit:
|
|||||||
for proposedEncoding in (documentEncoding, sniffedEncoding):
|
for proposedEncoding in (documentEncoding, sniffedEncoding):
|
||||||
u = self._convertFrom(proposedEncoding)
|
u = self._convertFrom(proposedEncoding)
|
||||||
if u: break
|
if u: break
|
||||||
|
|
||||||
# If no luck and we have auto-detection library, try that:
|
# If no luck and we have auto-detection library, try that:
|
||||||
if not u and chardet and not isinstance(self.markup, unicode):
|
if not u and chardet and not isinstance(self.markup, unicode):
|
||||||
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
|
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
|
||||||
@ -1804,6 +1804,8 @@ class UnicodeDammit:
|
|||||||
xml_encoding_match = re.compile \
|
xml_encoding_match = re.compile \
|
||||||
('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
|
('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
|
||||||
.match(xml_data)
|
.match(xml_data)
|
||||||
|
if xml_encoding_match is None: # By Kovid to use the content-type header in HTML files
|
||||||
|
xml_encoding_match = re.compile(r'<meta.*?http-equiv=[\'"]Content-type[\'"].*?content=[\'"].*?charset=(\S+).*?[\'"]', re.IGNORECASE).search(xml_data)
|
||||||
except:
|
except:
|
||||||
xml_encoding_match = None
|
xml_encoding_match = None
|
||||||
if xml_encoding_match:
|
if xml_encoding_match:
|
||||||
@ -1814,6 +1816,7 @@ class UnicodeDammit:
|
|||||||
'utf-16', 'utf-32', 'utf_16', 'utf_32',
|
'utf-16', 'utf-32', 'utf_16', 'utf_32',
|
||||||
'utf16', 'u16')):
|
'utf16', 'u16')):
|
||||||
xml_encoding = sniffed_xml_encoding
|
xml_encoding = sniffed_xml_encoding
|
||||||
|
|
||||||
return xml_data, xml_encoding, sniffed_xml_encoding
|
return xml_data, xml_encoding, sniffed_xml_encoding
|
||||||
|
|
||||||
|
|
||||||
|
@ -350,7 +350,11 @@ class HTMLConverter(object):
|
|||||||
if not os.path.exists(upath):
|
if not os.path.exists(upath):
|
||||||
upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names
|
upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names
|
||||||
f = open(upath, 'rb')
|
f = open(upath, 'rb')
|
||||||
raw = UnicodeDammit(f.read()).unicode
|
raw = f.read()
|
||||||
|
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
|
||||||
|
raw = raw.decode('utf-8', 'ignore')
|
||||||
|
else:
|
||||||
|
raw = UnicodeDammit(raw).unicode
|
||||||
f.close()
|
f.close()
|
||||||
soup = self.preprocess(raw)
|
soup = self.preprocess(raw)
|
||||||
self.logger.info('\tConverting to BBeB...')
|
self.logger.info('\tConverting to BBeB...')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user