AZW3 Input: Work around broken AZW3 files produced by kindlegen that use uppercase attribute names. Fixes #1341306 [Some links fail to convert from kf8 to epub](https://bugs.launchpad.net/calibre/+bug/1341306)

2025-06-23 15:30:45 -04:00 · 2014-07-15 23:41:19 +05:30 · 2014-07-15 23:41:19 +05:30 · c83a51377d
commit c83a51377d
parent e1a1eb41f2
2 changed files with 8 additions and 3 deletions
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@ -311,8 +311,8 @@ class Mobi8Reader(object):
        if plt == npos or pgt < plt:
            npos = pgt + 1
        textblock = textblock[0:npos]
-        id_re = re.compile(br'''<[^>]+\sid\s*=\s*['"]([^'"]+)['"]''')
-        name_re = re.compile(br'''<\s*a\s*\sname\s*=\s*['"]([^'"]+)['"]''')
+        id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
+        name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
        for tag in reverse_tag_iter(textblock):
            m = id_re.match(tag) or name_re.match(tag)
            if m is not None:
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@ -252,10 +252,15 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
+    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
+            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
+            # kindlegen produces invalid xhtml with uppercase attribute names
+            # if fed HTML 4 with uppercase attribute names, so try to detect
+            # and compensate for that.
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
@ -292,7 +297,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
                    'HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)

-    if data.tag == 'HTML':
+    if has_html4_doctype or data.tag == 'HTML':
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():