diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index 49ce83df63..8fa7bd3e25 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -9,13 +9,17 @@ Try to read metadata from an HTML file. ''' import re +import unittest -from calibre.ebooks.metadata import string_to_authors +from collections import defaultdict +from HTMLParser import HTMLParser + +from calibre.ebooks.metadata import string_to_authors, authors_to_string from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode from calibre import replace_entities, isbytestring from calibre.utils.date import parse_date, is_date_undefined -from polyglot.builtins import iteritems, itervalues +from polyglot.builtins import iteritems def get_metadata(stream): @@ -28,7 +32,7 @@ COMMENT_NAMES = { 'authors': 'AUTHOR', 'publisher': 'PUBLISHER', 'isbn': 'ISBN', - 'language': 'LANGUAGE', + 'languages': 'LANGUAGE', 'pubdate': 'PUBDATE', 'timestamp': 'TIMESTAMP', 'series': 'SERIES', @@ -42,8 +46,8 @@ META_NAMES = { 'title' : ('dc.title', 'dcterms.title', 'title'), 'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'), 'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'), - 'isbn': ('isbn', 'dc.identifier.isbn', 'dcterms.identifier.isbn'), - 'language': ('dc.language', 'dcterms.language'), + 'isbn': ('isbn',), + 'languages': ('dc.language', 'dcterms.language'), 'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'), 'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'), 'series': ('series',), @@ -58,47 +62,85 @@ META_NAMES = { attr_pat = r'''(?:(?P')|(?P"))(?P(?(sq)[^']+|[^"]+))(?(sq)'|")''' -def parse_meta_tags(src): - rmap = {} - for field, names in iteritems(META_NAMES): - for name in names: - rmap[name.lower()] = field - all_names = '|'.join(rmap) - ans = {} - npat = r'''name\s*=\s*['"]{0,1}(?P%s)['"]{0,1}''' % all_names - cpat = r'content\s*=\s*%s' % attr_pat - for pat in ( - r'%s)\s*=\s*%s''' % (all_names, attr_pat), src): - field = rmap[match.group('name')] - if field not in ans: - ans[field] = replace_entities(match.group('content')) - if len(ans) == len(COMMENT_NAMES): - break - return ans + def handle_entityref(self, ref): + if self.recording: + self.recorded.append(replace_entities("&%s;" % ref)) + + def handle_endtag(self, tag): + if tag == 'title': + self.recording = False + self.title_tag = ''.join(self.recorded) + + def handle_comment(self, data): + for match in re.finditer(r'''(?P\S+)\s*=\s*%s''' % (attr_pat), data): + x = match.group('name') + field = None + try: + field = self.rmap_comment[x] + except KeyError: + pass + if field: + self.comment_tags[field].append(replace_entities(match.group('content'))) + + parser = MetadataParser() + parser.feed(src) + + return (parser.comment_tags, parser.meta_tags, parser.meta_tag_ids, parser.title_tag) def get_metadata_(src, encoding=None): @@ -111,36 +153,50 @@ def get_metadata_(src, encoding=None): else: src = src.decode(encoding, 'replace') src = src[:150000] # Searching shouldn't take too long - comment_tags = parse_comment_tags(src) - meta_tags = parse_meta_tags(src) + (comment_tags, meta_tags, meta_tag_ids, title_tag) = parse_metadata(src) - def get(field): + def get_all(field): ans = comment_tags.get(field, meta_tags.get(field, None)) if ans: - ans = ans.strip() + ans = [x.strip() for x in ans if x.strip()] if not ans: ans = None return ans + def get(field): + ans = get_all(field) + if ans: + ans = ans[0] + return ans + # Title - title = get('title') - if not title: - pat = re.compile('([^<>]+?)', re.IGNORECASE) - match = pat.search(src) - if match: - title = replace_entities(match.group(1)) + title = get('title') or title_tag.strip() or _('Unknown') # Author - authors = get('authors') or _('Unknown') + authors = authors_to_string(get_all('authors')) or _('Unknown') # Create MetaInformation with Title and Author - mi = Metadata(title or _('Unknown'), string_to_authors(authors)) + mi = Metadata(title, string_to_authors(authors)) - for field in ('publisher', 'isbn', 'language', 'comments'): + # Single-value text fields + for field in ('publisher', 'isbn'): val = get(field) if val: setattr(mi, field, val) + # Multi-value text fields + for field in ('languages',): + val = get_all(field) + if val: + setattr(mi, field, val) + + # HTML fields + for field in ('comments',): + val = get(field) + if val: + setattr(mi, field, val.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''')) + + # Date fields for field in ('pubdate', 'timestamp'): try: val = parse_date(get(field)) @@ -179,18 +235,201 @@ def get_metadata_(src, encoding=None): mi.rating = float(rating) if mi.rating < 0: mi.rating = 0 - if mi.rating > 5: - mi.rating /= 2. - if mi.rating > 5: + if mi.rating > 10: mi.rating = 0 except: pass # TAGS - tags = get('tags') + tags = get_all('tags') if tags: - tags = [x.strip() for x in tags.split(',') if x.strip()] + tags = [x.strip() for s in tags for x in s.split(',') if x.strip()] if tags: mi.tags = tags + # IDENTIFIERS + for (k,v) in iteritems(meta_tag_ids): + v = [x.strip() for x in v if x.strip()] + if v: + mi.set_identifier(k, v[0]) + return mi + + +class MetadataHtmlTest(unittest.TestCase): + + def compare_metadata(self, meta_a, meta_b): + for attr in ( + 'title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series', + 'series_index', 'rating', 'comments', 'tags', 'identifiers' + ): + self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr)) + + def get_stream(self, test): + from io import BytesIO + + raw = b'''\ + + +''' + + if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}: + raw += b'''\ + } + A Title Tag &amp; Title Ⓒ +''' + + if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}: + raw += b'''\ + + + + + + + + + + + + + + + + + +''' + + if test in {'meta_multi', 'comment_single', 'comment_multi'}: + raw += b'''\ + + + + + + + + + + + + + +''' + + if test in {'comment_single', 'comment_multi'}: + raw += b'''\ + + + + + + + + + + + + +''' + + if test in {'comment_multi'}: + raw += b'''\ + + + + + + + + + + + + +''' + + raw += b'''\ + + + + +''' + return BytesIO(raw) + + def test_input_title(self): + stream_meta = get_metadata(self.get_stream('title')) + canon_meta = Metadata('A Title Tag & Title Ⓒ', [_('Unknown')]) + self.compare_metadata(stream_meta, canon_meta) + + def test_input_meta_single(self): + stream_meta = get_metadata(self.get_stream('meta_single')) + canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington']) + canon_meta.publisher = 'Publisher A' + canon_meta.languages = ['English'] + canon_meta.pubdate = parse_date('2019-01-01') + canon_meta.timestamp = parse_date('2018-01-01') + canon_meta.series = 'Meta Series' + canon_meta.series_index = float(1) + # canon_meta.rating = float(0) + # canon_meta.comments = '' + canon_meta.tags = ['tag a', 'tag b'] + canon_meta.set_identifiers({'isbn': '1234567890'}) + self.compare_metadata(stream_meta, canon_meta) + + def test_input_meta_multi(self): + stream_meta = get_metadata(self.get_stream('meta_multi')) + canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson']) + canon_meta.publisher = 'Publisher A' + canon_meta.languages = ['English', 'Spanish'] + canon_meta.pubdate = parse_date('2019-01-01') + canon_meta.timestamp = parse_date('2018-01-01') + canon_meta.series = 'Meta Series' + canon_meta.series_index = float(1) + canon_meta.rating = float(8) + canon_meta.comments = 'meta "comments" ♥ HTML &amp;' + canon_meta.tags = ['tag a', 'tag b', 'tag c'] + canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'}) + self.compare_metadata(stream_meta, canon_meta) + + def test_input_comment_single(self): + stream_meta = get_metadata(self.get_stream('comment_single')) + canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe']) + canon_meta.publisher = 'Publisher C' + canon_meta.languages = ['French'] + canon_meta.pubdate = parse_date('2015-01-01') + canon_meta.timestamp = parse_date('2014-01-01') + canon_meta.series = 'Comment Series' + canon_meta.series_index = float(3) + canon_meta.rating = float(0) + canon_meta.comments = 'comment "comments" ♥ HTML too &amp;' + canon_meta.tags = ['tag d'] + canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'}) + self.compare_metadata(stream_meta, canon_meta) + + def test_input_comment_multi(self): + stream_meta = get_metadata(self.get_stream('comment_multi')) + canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams']) + canon_meta.publisher = 'Publisher C' + canon_meta.languages = ['French', 'Japanese'] + canon_meta.pubdate = parse_date('2015-01-01') + canon_meta.timestamp = parse_date('2014-01-01') + canon_meta.series = 'Comment Series' + canon_meta.series_index = float(3) + canon_meta.rating = float(0) + canon_meta.comments = 'comment "comments" ♥ HTML too &amp;' + canon_meta.tags = ['tag d', 'tag e', 'tag f'] + canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'}) + self.compare_metadata(stream_meta, canon_meta) + + +def suite(): + return unittest.TestLoader().loadTestsFromTestCase(MetadataHtmlTest) + + +def test(): + unittest.TextTestRunner(verbosity=2).run(suite()) + + +if __name__ == '__main__': + test()