diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py
index 49ce83df63..8fa7bd3e25 100644
--- a/src/calibre/ebooks/metadata/html.py
+++ b/src/calibre/ebooks/metadata/html.py
@@ -9,13 +9,17 @@ Try to read metadata from an HTML file.
'''
import re
+import unittest
-from calibre.ebooks.metadata import string_to_authors
+from collections import defaultdict
+from HTMLParser import HTMLParser
+
+from calibre.ebooks.metadata import string_to_authors, authors_to_string
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre import replace_entities, isbytestring
from calibre.utils.date import parse_date, is_date_undefined
-from polyglot.builtins import iteritems, itervalues
+from polyglot.builtins import iteritems
def get_metadata(stream):
@@ -28,7 +32,7 @@ COMMENT_NAMES = {
'authors': 'AUTHOR',
'publisher': 'PUBLISHER',
'isbn': 'ISBN',
- 'language': 'LANGUAGE',
+ 'languages': 'LANGUAGE',
'pubdate': 'PUBDATE',
'timestamp': 'TIMESTAMP',
'series': 'SERIES',
@@ -42,8 +46,8 @@ META_NAMES = {
'title' : ('dc.title', 'dcterms.title', 'title'),
'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'),
'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'),
- 'isbn': ('isbn', 'dc.identifier.isbn', 'dcterms.identifier.isbn'),
- 'language': ('dc.language', 'dcterms.language'),
+ 'isbn': ('isbn',),
+ 'languages': ('dc.language', 'dcterms.language'),
'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'),
'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'),
'series': ('series',),
@@ -58,47 +62,85 @@ META_NAMES = {
attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")'''
-def parse_meta_tags(src):
- rmap = {}
- for field, names in iteritems(META_NAMES):
- for name in names:
- rmap[name.lower()] = field
- all_names = '|'.join(rmap)
- ans = {}
- npat = r'''name\s*=\s*['"]{0,1}(?P<name>%s)['"]{0,1}''' % all_names
- cpat = r'content\s*=\s*%s' % attr_pat
- for pat in (
- r'%s)\s*=\s*%s''' % (all_names, attr_pat), src):
- field = rmap[match.group('name')]
- if field not in ans:
- ans[field] = replace_entities(match.group('content'))
- if len(ans) == len(COMMENT_NAMES):
- break
- return ans
+ def handle_entityref(self, ref):
+ if self.recording:
+ self.recorded.append(replace_entities("&%s;" % ref))
+
+ def handle_endtag(self, tag):
+ if tag == 'title':
+ self.recording = False
+ self.title_tag = ''.join(self.recorded)
+
+ def handle_comment(self, data):
+ for match in re.finditer(r'''(?P<name>\S+)\s*=\s*%s''' % (attr_pat), data):
+ x = match.group('name')
+ field = None
+ try:
+ field = self.rmap_comment[x]
+ except KeyError:
+ pass
+ if field:
+ self.comment_tags[field].append(replace_entities(match.group('content')))
+
+ parser = MetadataParser()
+ parser.feed(src)
+
+ return (parser.comment_tags, parser.meta_tags, parser.meta_tag_ids, parser.title_tag)
def get_metadata_(src, encoding=None):
@@ -111,36 +153,50 @@ def get_metadata_(src, encoding=None):
else:
src = src.decode(encoding, 'replace')
src = src[:150000] # Searching shouldn't take too long
- comment_tags = parse_comment_tags(src)
- meta_tags = parse_meta_tags(src)
+ (comment_tags, meta_tags, meta_tag_ids, title_tag) = parse_metadata(src)
- def get(field):
+ def get_all(field):
ans = comment_tags.get(field, meta_tags.get(field, None))
if ans:
- ans = ans.strip()
+ ans = [x.strip() for x in ans if x.strip()]
if not ans:
ans = None
return ans
+ def get(field):
+ ans = get_all(field)
+ if ans:
+ ans = ans[0]
+ return ans
+
# Title
- title = get('title')
- if not title:
- pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
- match = pat.search(src)
- if match:
- title = replace_entities(match.group(1))
+ title = get('title') or title_tag.strip() or _('Unknown')
# Author
- authors = get('authors') or _('Unknown')
+ authors = authors_to_string(get_all('authors')) or _('Unknown')
# Create MetaInformation with Title and Author
- mi = Metadata(title or _('Unknown'), string_to_authors(authors))
+ mi = Metadata(title, string_to_authors(authors))
- for field in ('publisher', 'isbn', 'language', 'comments'):
+ # Single-value text fields
+ for field in ('publisher', 'isbn'):
val = get(field)
if val:
setattr(mi, field, val)
+ # Multi-value text fields
+ for field in ('languages',):
+ val = get_all(field)
+ if val:
+ setattr(mi, field, val)
+
+ # HTML fields
+ for field in ('comments',):
+ val = get(field)
+ if val:
+ setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;'))
+
+ # Date fields
for field in ('pubdate', 'timestamp'):
try:
val = parse_date(get(field))
@@ -179,18 +235,201 @@ def get_metadata_(src, encoding=None):
mi.rating = float(rating)
if mi.rating < 0:
mi.rating = 0
- if mi.rating > 5:
- mi.rating /= 2.
- if mi.rating > 5:
+ if mi.rating > 10:
mi.rating = 0
except:
pass
# TAGS
- tags = get('tags')
+ tags = get_all('tags')
if tags:
- tags = [x.strip() for x in tags.split(',') if x.strip()]
+ tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
if tags:
mi.tags = tags
+ # IDENTIFIERS
+ for (k,v) in iteritems(meta_tag_ids):
+ v = [x.strip() for x in v if x.strip()]
+ if v:
+ mi.set_identifier(k, v[0])
+
return mi
+
+
+class MetadataHtmlTest(unittest.TestCase):
+
+ def compare_metadata(self, meta_a, meta_b):
+ for attr in (
+ 'title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series',
+ 'series_index', 'rating', 'comments', 'tags', 'identifiers'
+ ):
+ self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr))
+
+ def get_stream(self, test):
+ from io import BytesIO
+
+ raw = b'''\
+
+
+'''
+
+ if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
+ raw += b'''\
+ }
+ A Title Tag & Title Ⓒ
+'''
+
+ if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
+ raw += b'''\
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+'''
+
+ if test in {'meta_multi', 'comment_single', 'comment_multi'}:
+ raw += b'''\
+
+
+
+
+
+
+
+
+
+
+
+
+
+'''
+
+ if test in {'comment_single', 'comment_multi'}:
+ raw += b'''\
+
+
+
+
+
+
+
+
+
+
+
+
+'''
+
+ if test in {'comment_multi'}:
+ raw += b'''\
+
+
+
+
+
+
+
+
+
+
+
+
+'''
+
+ raw += b'''\
+
+
+
+
+'''
+ return BytesIO(raw)
+
+ def test_input_title(self):
+ stream_meta = get_metadata(self.get_stream('title'))
+ canon_meta = Metadata('A Title Tag & Title Ⓒ', [_('Unknown')])
+ self.compare_metadata(stream_meta, canon_meta)
+
+ def test_input_meta_single(self):
+ stream_meta = get_metadata(self.get_stream('meta_single'))
+ canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington'])
+ canon_meta.publisher = 'Publisher A'
+ canon_meta.languages = ['English']
+ canon_meta.pubdate = parse_date('2019-01-01')
+ canon_meta.timestamp = parse_date('2018-01-01')
+ canon_meta.series = 'Meta Series'
+ canon_meta.series_index = float(1)
+ # canon_meta.rating = float(0)
+ # canon_meta.comments = ''
+ canon_meta.tags = ['tag a', 'tag b']
+ canon_meta.set_identifiers({'isbn': '1234567890'})
+ self.compare_metadata(stream_meta, canon_meta)
+
+ def test_input_meta_multi(self):
+ stream_meta = get_metadata(self.get_stream('meta_multi'))
+ canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson'])
+ canon_meta.publisher = 'Publisher A'
+ canon_meta.languages = ['English', 'Spanish']
+ canon_meta.pubdate = parse_date('2019-01-01')
+ canon_meta.timestamp = parse_date('2018-01-01')
+ canon_meta.series = 'Meta Series'
+ canon_meta.series_index = float(1)
+ canon_meta.rating = float(8)
+ canon_meta.comments = 'meta "comments" ♥ HTML &'
+ canon_meta.tags = ['tag a', 'tag b', 'tag c']
+ canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
+ self.compare_metadata(stream_meta, canon_meta)
+
+ def test_input_comment_single(self):
+ stream_meta = get_metadata(self.get_stream('comment_single'))
+ canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe'])
+ canon_meta.publisher = 'Publisher C'
+ canon_meta.languages = ['French']
+ canon_meta.pubdate = parse_date('2015-01-01')
+ canon_meta.timestamp = parse_date('2014-01-01')
+ canon_meta.series = 'Comment Series'
+ canon_meta.series_index = float(3)
+ canon_meta.rating = float(0)
+ canon_meta.comments = 'comment "comments" ♥ HTML too &'
+ canon_meta.tags = ['tag d']
+ canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
+ self.compare_metadata(stream_meta, canon_meta)
+
+ def test_input_comment_multi(self):
+ stream_meta = get_metadata(self.get_stream('comment_multi'))
+ canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams'])
+ canon_meta.publisher = 'Publisher C'
+ canon_meta.languages = ['French', 'Japanese']
+ canon_meta.pubdate = parse_date('2015-01-01')
+ canon_meta.timestamp = parse_date('2014-01-01')
+ canon_meta.series = 'Comment Series'
+ canon_meta.series_index = float(3)
+ canon_meta.rating = float(0)
+ canon_meta.comments = 'comment "comments" ♥ HTML too &'
+ canon_meta.tags = ['tag d', 'tag e', 'tag f']
+ canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
+ self.compare_metadata(stream_meta, canon_meta)
+
+
+def suite():
+ return unittest.TestLoader().loadTestsFromTestCase(MetadataHtmlTest)
+
+
+def test():
+ unittest.TextTestRunner(verbosity=2).run(suite())
+
+
+if __name__ == '__main__':
+ test()