From 9c846f3b51ccdc13054e5f91821abc195e822107 Mon Sep 17 00:00:00 2001 From: Christopher Szucko Date: Fri, 9 Aug 2019 08:54:26 -0500 Subject: [PATCH 1/5] Support Dublin Core id tags when importing HTML All of the below formats are supported and would be interpreted as "foo:bar" --- src/calibre/ebooks/metadata/html.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index 49ce83df63..3f0aefaadf 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -10,6 +10,8 @@ Try to read metadata from an HTML file. import re +from HTMLParser import HTMLParser + from calibre.ebooks.metadata import string_to_authors from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode @@ -87,6 +89,26 @@ def parse_meta_tags(src): return ans return ans +def parse_meta_tag_identifiers(src): + meta_identifiers = {} + + class MetadataParser(HTMLParser): + def handle_starttag(self, tag, attrs): + attr_dict = dict(attrs) + + if tag == 'meta' and re.match(r'(?:dc|dcterms)[\.:]identifier', attr_dict.get('name', ''), flags=re.IGNORECASE): + content = attr_dict.get('content', '').strip() + scheme = attr_dict.get('scheme', '').strip() + if not scheme: + elements = re.split(r'[\.:]', attr_dict['name']) + if len(elements) == 3: + scheme = elements[2] + if content and scheme: + meta_identifiers[scheme.lower()] = replace_entities(content) + + MetadataParser().feed(src) + + return meta_identifiers def parse_comment_tags(src): all_names = '|'.join(itervalues(COMMENT_NAMES)) @@ -113,6 +135,7 @@ def get_metadata_(src, encoding=None): src = src[:150000] # Searching shouldn't take too long comment_tags = parse_comment_tags(src) meta_tags = parse_meta_tags(src) + meta_tag_ids = parse_meta_tag_identifiers(src) def get(field): ans = comment_tags.get(field, meta_tags.get(field, None)) @@ -193,4 +216,8 @@ def get_metadata_(src, 
encoding=None): if tags: mi.tags = tags + # IDENTIFIERS + for (k,v) in meta_tag_ids.iteritems(): + mi.set_identifier(k, v) + return mi From d36a23d795d66dc37598d0fe72abba8df3973d68 Mon Sep 17 00:00:00 2001 From: Christopher Szucko Date: Sat, 10 Aug 2019 09:56:31 -0500 Subject: [PATCH 2/5] Refactor HTML metadata parsing Use an HTMLParser rather than regex, only parse the document once, and add handling for multiple values for authors, tags, and languages --- src/calibre/ebooks/metadata/html.py | 178 ++++++++++++++++------------ 1 file changed, 102 insertions(+), 76 deletions(-) diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index 3f0aefaadf..b8b86d4c19 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -10,9 +10,10 @@ Try to read metadata from an HTML file. import re +from collections import defaultdict from HTMLParser import HTMLParser -from calibre.ebooks.metadata import string_to_authors +from calibre.ebooks.metadata import string_to_authors, authors_to_string from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode from calibre import replace_entities, isbytestring @@ -30,7 +31,7 @@ COMMENT_NAMES = { 'authors': 'AUTHOR', 'publisher': 'PUBLISHER', 'isbn': 'ISBN', - 'language': 'LANGUAGE', + 'languages': 'LANGUAGE', 'pubdate': 'PUBDATE', 'timestamp': 'TIMESTAMP', 'series': 'SERIES', @@ -44,8 +45,8 @@ META_NAMES = { 'title' : ('dc.title', 'dcterms.title', 'title'), 'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'), 'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'), - 'isbn': ('isbn', 'dc.identifier.isbn', 'dcterms.identifier.isbn'), - 'language': ('dc.language', 'dcterms.language'), + 'isbn': ('isbn',), + 'languages': ('dc.language', 'dcterms.language'), 'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'), 'timestamp': 
('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'), 'series': ('series',), @@ -59,69 +60,85 @@ META_NAMES = { # single quotes inside double quotes and vice versa. attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")''' - -def parse_meta_tags(src): - rmap = {} - for field, names in iteritems(META_NAMES): - for name in names: - rmap[name.lower()] = field - all_names = '|'.join(rmap) - ans = {} - npat = r'''name\s*=\s*['"]{0,1}(?P<name>%s)['"]{0,1}''' % all_names - cpat = r'content\s*=\s*%s' % attr_pat - for pat in ( - r'%s)\s*=\s*%s''' % (all_names, attr_pat), src): - field = rmap[match.group('name')] - if field not in ans: - ans[field] = replace_entities(match.group('content')) - if len(ans) == len(COMMENT_NAMES): - break - return ans + def handle_charref(self, ref): + if self.recording: + self.recorded.append(replace_entities("&#%s;" % ref)) + def handle_entityref(self, ref): + if self.recording: + self.recorded.append(replace_entities("&%s;" % ref)) + + def handle_endtag(self, tag): + if tag == 'title': + self.recording = False + self.title_tag = ''.join(self.recorded) + + def handle_comment(self, data): + for match in re.finditer(r'''(?P<name>\S+)\s*=\s*%s''' % (attr_pat), data): + x = match.group('name') + field = None + try: + field = self.rmap_comment[x] + except KeyError: + pass + if field: + self.comment_tags[field].append(replace_entities(match.group('content'))) + + parser = MetadataParser() + parser.feed(src) + + return (parser.comment_tags, parser.meta_tags, parser.meta_tag_ids, parser.title_tag) def get_metadata_(src, encoding=None): # Meta data definitions as in @@ -133,37 +150,44 @@ def get_metadata_(src, encoding=None): else: src = src.decode(encoding, 'replace') src = src[:150000] # Searching shouldn't take too long - comment_tags = parse_comment_tags(src) - meta_tags = parse_meta_tags(src) - meta_tag_ids = parse_meta_tag_identifiers(src) + (comment_tags, meta_tags, meta_tag_ids, title_tag) = 
parse_metadata(src) - def get(field): + def get_all(field): ans = comment_tags.get(field, meta_tags.get(field, None)) if ans: - ans = ans.strip() + ans = [x.strip() for x in ans if x.strip()] if not ans: ans = None return ans + def get(field): + ans = get_all(field) + if ans: + ans = ans[0] + return ans + # Title - title = get('title') - if not title: - pat = re.compile('([^<>]+?)', re.IGNORECASE) - match = pat.search(src) - if match: - title = replace_entities(match.group(1)) + title = get('title') or title_tag.strip() or _('Unknown') # Author - authors = get('authors') or _('Unknown') + authors = authors_to_string(get_all('authors')) or _('Unknown') # Create MetaInformation with Title and Author - mi = Metadata(title or _('Unknown'), string_to_authors(authors)) + mi = Metadata(title, string_to_authors(authors)) - for field in ('publisher', 'isbn', 'language', 'comments'): + # Single-value text fields + for field in ('publisher', 'isbn', 'comments'): val = get(field) if val: setattr(mi, field, val) + # Multi-value text fields + for field in ('languages',): + val = get_all(field) + if val: + setattr(mi, field, val) + + # Date fields for field in ('pubdate', 'timestamp'): try: val = parse_date(get(field)) @@ -210,14 +234,16 @@ def get_metadata_(src, encoding=None): pass # TAGS - tags = get('tags') + tags = get_all('tags') if tags: - tags = [x.strip() for x in tags.split(',') if x.strip()] + tags = [x.strip() for s in tags for x in s.split(',') if x.strip()] if tags: mi.tags = tags # IDENTIFIERS - for (k,v) in meta_tag_ids.iteritems(): - mi.set_identifier(k, v) + for (k,v) in iteritems(meta_tag_ids): + v = [x.strip() for x in v if x.strip()] + if v: + mi.set_identifier(k, v[0]) return mi From 8cbaa3a9e26a906faed14fa169151a458d9a3f17 Mon Sep 17 00:00:00 2001 From: Christopher Szucko Date: Sat, 10 Aug 2019 10:03:40 -0500 Subject: [PATCH 3/5] Fix importing ratings from HTML metadata The HTML import was assuming ratings were out of 5 but the internal representation is 
out of 10 --- src/calibre/ebooks/metadata/html.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index b8b86d4c19..222afa83ea 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -226,9 +226,7 @@ def get_metadata_(src, encoding=None): mi.rating = float(rating) if mi.rating < 0: mi.rating = 0 - if mi.rating > 5: - mi.rating /= 2. - if mi.rating > 5: + if mi.rating > 10: mi.rating = 0 except: pass From a81ff78c0a7500ef30792d217d2c9e32e8402d56 Mon Sep 17 00:00:00 2001 From: Christopher Szucko Date: Sat, 10 Aug 2019 21:09:03 -0500 Subject: [PATCH 4/5] Escape HTML entities in comments --- src/calibre/ebooks/metadata/html.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index 222afa83ea..dad218ae19 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -176,7 +176,7 @@ def get_metadata_(src, encoding=None): mi = Metadata(title, string_to_authors(authors)) # Single-value text fields - for field in ('publisher', 'isbn', 'comments'): + for field in ('publisher', 'isbn'): val = get(field) if val: setattr(mi, field, val) @@ -187,6 +187,12 @@ def get_metadata_(src, encoding=None): if val: setattr(mi, field, val) + # HTML fields + for field in ('comments',): + val = get(field) + if val: + setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;')) + # Date fields for field in ('pubdate', 'timestamp'): try: From b12c75c904be0b54c5cf2b32b5a16c7a1b39309e Mon Sep 17 00:00:00 2001 From: Christopher Szucko Date: Sun, 11 Aug 2019 09:30:35 -0500 Subject: [PATCH 5/5] Add unit tests for HTML metadata imports --- src/calibre/ebooks/metadata/html.py | 182 ++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) diff --git a/src/calibre/ebooks/metadata/html.py 
b/src/calibre/ebooks/metadata/html.py index dad218ae19..c6d4693baf 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -9,6 +9,7 @@ Try to read metadata from an HTML file. ''' import re +import unittest from collections import defaultdict from HTMLParser import HTMLParser @@ -251,3 +252,184 @@ def get_metadata_(src, encoding=None): mi.set_identifier(k, v[0]) return mi + + +class MetadataHtmlTest(unittest.TestCase): + + def compare_metadata(self, meta_a, meta_b): + for attr in ('title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series', 'series_index', 'rating', 'comments', 'tags', 'identifiers'): + self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr)) + + def get_stream(self, test): + from io import BytesIO + + raw = b'''\ + + +''' + + if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}: + raw += b'''\ + } + A Title Tag &amp; Title Ⓒ +''' + + if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}: + raw += b'''\ + + + + + + + + + + + + + + + + + +''' + + if test in {'meta_multi', 'comment_single', 'comment_multi'}: + raw += b'''\ + + + + + + + + + + + + + +''' + + if test in {'comment_single', 'comment_multi'}: + raw += b'''\ + + + + + + + + + + + + +''' + + if test in {'comment_multi'}: + raw += b'''\ + + + + + + + + + + + + +''' + + raw += b'''\ + + + + +''' + return BytesIO(raw) + + + def test_input_title(self): + stream_meta = get_metadata(self.get_stream('title')) + canon_meta = Metadata('A Title Tag & Title Ⓒ', [_('Unknown')]) + self.compare_metadata(stream_meta, canon_meta) + + + def test_input_meta_single(self): + stream_meta = get_metadata(self.get_stream('meta_single')) + canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington']) + canon_meta.publisher = 'Publisher A' + canon_meta.languages = ['English'] + canon_meta.pubdate = parse_date('2019-01-01') + canon_meta.timestamp = parse_date('2018-01-01') + 
canon_meta.series = 'Meta Series' + canon_meta.series_index = float(1) + # canon_meta.rating = float(0) + # canon_meta.comments = '' + canon_meta.tags = ['tag a', 'tag b'] + canon_meta.set_identifiers({'isbn': '1234567890'}) + self.compare_metadata(stream_meta, canon_meta) + + + def test_input_meta_multi(self): + stream_meta = get_metadata(self.get_stream('meta_multi')) + canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson']) + canon_meta.publisher = 'Publisher A' + canon_meta.languages = ['English', 'Spanish'] + canon_meta.pubdate = parse_date('2019-01-01') + canon_meta.timestamp = parse_date('2018-01-01') + canon_meta.series = 'Meta Series' + canon_meta.series_index = float(1) + canon_meta.rating = float(8) + canon_meta.comments = 'meta "comments" ♥ HTML &amp;' + canon_meta.tags = ['tag a', 'tag b', 'tag c'] + canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'}) + self.compare_metadata(stream_meta, canon_meta) + + + def test_input_comment_single(self): + stream_meta = get_metadata(self.get_stream('comment_single')) + canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe']) + canon_meta.publisher = 'Publisher C' + canon_meta.languages = ['French'] + canon_meta.pubdate = parse_date('2015-01-01') + canon_meta.timestamp = parse_date('2014-01-01') + canon_meta.series = 'Comment Series' + canon_meta.series_index = float(3) + canon_meta.rating = float(0) + canon_meta.comments = 'comment "comments" ♥ HTML too &amp;' + canon_meta.tags = ['tag d'] + canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'}) + self.compare_metadata(stream_meta, canon_meta) + + + def test_input_comment_multi(self): + stream_meta = get_metadata(self.get_stream('comment_multi')) + canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams']) + canon_meta.publisher = 'Publisher C' + 
canon_meta.languages = ['French', 'Japanese'] + canon_meta.pubdate = parse_date('2015-01-01') + canon_meta.timestamp = parse_date('2014-01-01') + canon_meta.series = 'Comment Series' + canon_meta.series_index = float(3) + canon_meta.rating = float(0) + canon_meta.comments = 'comment "comments" ♥ HTML too &amp;' + canon_meta.tags = ['tag d', 'tag e', 'tag f'] + canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'}) + self.compare_metadata(stream_meta, canon_meta) + + +def suite(): + return unittest.TestLoader().loadTestsFromTestCase(MetadataHtmlTest) + + +def test(): + unittest.TextTestRunner(verbosity=2).run(suite()) + + +if __name__ == '__main__': + test()