mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
AZW3 Input: Work around broken AZW3 files produced by kindlegen that use uppercase attribute names. Fixes #1341306 [Some links fail to convert from kf8 to epub](https://bugs.launchpad.net/calibre/+bug/1341306)
This commit is contained in:
parent
e1a1eb41f2
commit
c83a51377d
@ -311,8 +311,8 @@ class Mobi8Reader(object):
|
||||
if plt == npos or pgt < plt:
|
||||
npos = pgt + 1
|
||||
textblock = textblock[0:npos]
|
||||
id_re = re.compile(br'''<[^>]+\sid\s*=\s*['"]([^'"]+)['"]''')
|
||||
name_re = re.compile(br'''<\s*a\s*\sname\s*=\s*['"]([^'"]+)['"]''')
|
||||
id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
|
||||
name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
|
||||
for tag in reverse_tag_iter(textblock):
|
||||
m = id_re.match(tag) or name_re.match(tag)
|
||||
if m is not None:
|
||||
|
@ -252,10 +252,15 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
||||
idx = data.find('<html')
|
||||
if idx == -1:
|
||||
idx = data.find('<HTML')
|
||||
has_html4_doctype = False
|
||||
if idx > -1:
|
||||
pre = data[:idx]
|
||||
data = data[idx:]
|
||||
if '<!DOCTYPE' in pre: # Handle user defined entities
|
||||
has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
|
||||
# kindlegen produces invalid xhtml with uppercase attribute names
|
||||
# if fed HTML 4 with uppercase attribute names, so try to detect
|
||||
# and compensate for that.
|
||||
user_entities = {}
|
||||
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
|
||||
val = match.group(2)
|
||||
@ -292,7 +297,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
||||
'HTML 5 parsing failed, falling back to older parsers')
|
||||
data = _html4_parse(data)
|
||||
|
||||
if data.tag == 'HTML':
|
||||
if has_html4_doctype or data.tag == 'HTML':
|
||||
# Lower case all tag and attribute names
|
||||
data.tag = data.tag.lower()
|
||||
for x in data.iterdescendants():
|
||||
|
Loading…
x
Reference in New Issue
Block a user