From d36a23d795d66dc37598d0fe72abba8df3973d68 Mon Sep 17 00:00:00 2001 From: Christopher Szucko Date: Sat, 10 Aug 2019 09:56:31 -0500 Subject: [PATCH] Refactor HTML metadata parsing Use an HTMLParser rather than regex, only parse the document once, and add handling for multiple values for authors, tags, and languages --- src/calibre/ebooks/metadata/html.py | 178 ++++++++++++++++------------ 1 file changed, 102 insertions(+), 76 deletions(-) diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index 3f0aefaadf..b8b86d4c19 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -10,9 +10,10 @@ Try to read metadata from an HTML file. import re +from collections import defaultdict from HTMLParser import HTMLParser -from calibre.ebooks.metadata import string_to_authors +from calibre.ebooks.metadata import string_to_authors, authors_to_string from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode from calibre import replace_entities, isbytestring @@ -30,7 +31,7 @@ COMMENT_NAMES = { 'authors': 'AUTHOR', 'publisher': 'PUBLISHER', 'isbn': 'ISBN', - 'language': 'LANGUAGE', + 'languages': 'LANGUAGE', 'pubdate': 'PUBDATE', 'timestamp': 'TIMESTAMP', 'series': 'SERIES', @@ -44,8 +45,8 @@ META_NAMES = { 'title' : ('dc.title', 'dcterms.title', 'title'), 'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'), 'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'), - 'isbn': ('isbn', 'dc.identifier.isbn', 'dcterms.identifier.isbn'), - 'language': ('dc.language', 'dcterms.language'), + 'isbn': ('isbn',), + 'languages': ('dc.language', 'dcterms.language'), 'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'), 'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'), 'series': ('series',), @@ -59,69 +60,85 @@ 
META_NAMES = { # single quotes inside double quotes and vice versa. attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")''' - -def parse_meta_tags(src): - rmap = {} - for field, names in iteritems(META_NAMES): - for name in names: - rmap[name.lower()] = field - all_names = '|'.join(rmap) - ans = {} - npat = r'''name\s*=\s*['"]{0,1}(?P<name>%s)['"]{0,1}''' % all_names - cpat = r'content\s*=\s*%s' % attr_pat - for pat in ( - r'%s)\s*=\s*%s''' % (all_names, attr_pat), src): - field = rmap[match.group('name')] - if field not in ans: - ans[field] = replace_entities(match.group('content')) - if len(ans) == len(COMMENT_NAMES): - break - return ans + def handle_charref(self, ref): + if self.recording: + self.recorded.append(replace_entities("&#%s;" % ref)) + def handle_entityref(self, ref): + if self.recording: + self.recorded.append(replace_entities("&%s;" % ref)) + + def handle_endtag(self, tag): + if tag == 'title': + self.recording = False + self.title_tag = ''.join(self.recorded) + + def handle_comment(self, data): + for match in re.finditer(r'''(?P<name>\S+)\s*=\s*%s''' % (attr_pat), data): + x = match.group('name') + field = None + try: + field = self.rmap_comment[x] + except KeyError: + pass + if field: + self.comment_tags[field].append(replace_entities(match.group('content'))) + + parser = MetadataParser() + parser.feed(src) + + return (parser.comment_tags, parser.meta_tags, parser.meta_tag_ids, parser.title_tag) def get_metadata_(src, encoding=None): # Meta data definitions as in @@ -133,37 +150,44 @@ def get_metadata_(src, encoding=None): else: src = src.decode(encoding, 'replace') src = src[:150000] # Searching shouldn't take too long - comment_tags = parse_comment_tags(src) - meta_tags = parse_meta_tags(src) - meta_tag_ids = parse_meta_tag_identifiers(src) + (comment_tags, meta_tags, meta_tag_ids, title_tag) = parse_metadata(src) - def get(field): + def get_all(field): ans = comment_tags.get(field, meta_tags.get(field, None)) if ans: - ans = ans.strip() + ans = 
[x.strip() for x in ans if x.strip()] if not ans: ans = None return ans + def get(field): + ans = get_all(field) + if ans: + ans = ans[0] + return ans + # Title - title = get('title') - if not title: - pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE) - match = pat.search(src) - if match: - title = replace_entities(match.group(1)) + title = get('title') or title_tag.strip() or _('Unknown') # Author - authors = get('authors') or _('Unknown') + authors = authors_to_string(get_all('authors')) or _('Unknown') # Create MetaInformation with Title and Author - mi = Metadata(title or _('Unknown'), string_to_authors(authors)) + mi = Metadata(title, string_to_authors(authors)) - for field in ('publisher', 'isbn', 'language', 'comments'): + # Single-value text fields + for field in ('publisher', 'isbn', 'comments'): val = get(field) if val: setattr(mi, field, val) + # Multi-value text fields + for field in ('languages',): + val = get_all(field) + if val: + setattr(mi, field, val) + + # Date fields for field in ('pubdate', 'timestamp'): try: val = parse_date(get(field)) @@ -210,14 +234,16 @@ def get_metadata_(src, encoding=None): pass # TAGS - tags = get('tags') + tags = get_all('tags') if tags: - tags = [x.strip() for x in tags.split(',') if x.strip()] + tags = [x.strip() for s in tags for x in s.split(',') if x.strip()] if tags: mi.tags = tags # IDENTIFIERS - for (k,v) in meta_tag_ids.iteritems(): - mi.set_identifier(k, v) + for (k,v) in iteritems(meta_tag_ids): + v = [x.strip() for x in v if x.strip()] + if v: + mi.set_identifier(k, v[0]) return mi