AZW3 Input: Work around broken AZW3 files produced by kindlegen that use uppercase attribute names. Fixes #1341306 [Some links fail to convert from kf8 to epub](https://bugs.launchpad.net/calibre/+bug/1341306)

This commit is contained in:
Kovid Goyal 2014-07-15 23:41:19 +05:30
parent e1a1eb41f2
commit c83a51377d
2 changed files with 8 additions and 3 deletions

View File

@ -311,8 +311,8 @@ class Mobi8Reader(object):
if plt == npos or pgt < plt:
npos = pgt + 1
textblock = textblock[0:npos]
id_re = re.compile(br'''<[^>]+\sid\s*=\s*['"]([^'"]+)['"]''')
name_re = re.compile(br'''<\s*a\s*\sname\s*=\s*['"]([^'"]+)['"]''')
id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
for tag in reverse_tag_iter(textblock):
m = id_re.match(tag) or name_re.match(tag)
if m is not None:

View File

@ -252,10 +252,15 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
idx = data.find('<html')
if idx == -1:
idx = data.find('<HTML')
has_html4_doctype = False
if idx > -1:
pre = data[:idx]
data = data[idx:]
if '<!DOCTYPE' in pre: # Handle user defined entities
has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
# kindlegen produces invalid XHTML with uppercase attribute names
# when fed HTML 4 that uses uppercase attribute names, so try to
# detect that case and compensate for it.
user_entities = {}
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
val = match.group(2)
@ -292,7 +297,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
'HTML 5 parsing failed, falling back to older parsers')
data = _html4_parse(data)
if data.tag == 'HTML':
if has_html4_doctype or data.tag == 'HTML':
# Lower case all tag and attribute names
data.tag = data.tag.lower()
for x in data.iterdescendants():