mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #1140 (text has random underlining)
This commit is contained in:
parent
2b0cf3fed5
commit
c769c35b1d
@ -436,11 +436,13 @@ class Parser(PreProcessor, LoggingInterface):
|
|||||||
''' Create lxml ElementTree from HTML '''
|
''' Create lxml ElementTree from HTML '''
|
||||||
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
||||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
|
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
|
||||||
src = src[src.find('<'):]
|
|
||||||
src = self.preprocess(src)
|
src = self.preprocess(src)
|
||||||
# lxml chokes on unicode input when it contains encoding declarations
|
# lxml chokes on unicode input when it contains encoding declarations
|
||||||
for pat in ENCODING_PATS:
|
for pat in ENCODING_PATS:
|
||||||
src = pat.sub('', src)
|
src = pat.sub('', src)
|
||||||
|
src = src[src.find('<'):]
|
||||||
|
# Remove unclosed <style> tag as that messes up lxml's parsing
|
||||||
|
src = re.sub(r'<style>\s*</head>', '', src)
|
||||||
try:
|
try:
|
||||||
self.root = fromstring(src)
|
self.root = fromstring(src)
|
||||||
except:
|
except:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user