diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index 7c5fbbe4f9..835a24b299 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -311,8 +311,8 @@ class Mobi8Reader(object): if plt == npos or pgt < plt: npos = pgt + 1 textblock = textblock[0:npos] - id_re = re.compile(br'''<[^>]+\sid\s*=\s*['"]([^'"]+)['"]''') - name_re = re.compile(br'''<\s*a\s*\sname\s*=\s*['"]([^'"]+)['"]''') + id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''') + name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''') for tag in reverse_tag_iter(textblock): m = id_re.match(tag) or name_re.match(tag) if m is not None: diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index 998ad70337..1f87bf5672 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -252,10 +252,15 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, idx = data.find(' -1: pre = data[:idx] data = data[idx:] if ']+HTML\s+4.0[^.]+>', pre) is not None + # kindlegen produces invalid xhtml with uppercase attribute names + # if fed HTML 4 with uppercase attribute names, so try to detect + # and compensate for that. user_entities = {} for match in re.finditer(r']+)', pre): val = match.group(2) @@ -292,7 +297,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, 'HTML 5 parsing failed, falling back to older parsers') data = _html4_parse(data) - if data.tag == 'HTML': + if has_html4_doctype or data.tag == 'HTML': # Lower case all tag and attribute names data.tag = data.tag.lower() for x in data.iterdescendants():