diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py
index 3f0aefaadf..b8b86d4c19 100644
--- a/src/calibre/ebooks/metadata/html.py
+++ b/src/calibre/ebooks/metadata/html.py
@@ -10,9 +10,10 @@ Try to read metadata from an HTML file.
import re
+from collections import defaultdict
from HTMLParser import HTMLParser
-from calibre.ebooks.metadata import string_to_authors
+from calibre.ebooks.metadata import string_to_authors, authors_to_string
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre import replace_entities, isbytestring
@@ -30,7 +31,7 @@ COMMENT_NAMES = {
'authors': 'AUTHOR',
'publisher': 'PUBLISHER',
'isbn': 'ISBN',
- 'language': 'LANGUAGE',
+ 'languages': 'LANGUAGE',
'pubdate': 'PUBDATE',
'timestamp': 'TIMESTAMP',
'series': 'SERIES',
@@ -44,8 +45,8 @@ META_NAMES = {
'title' : ('dc.title', 'dcterms.title', 'title'),
'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'),
'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'),
- 'isbn': ('isbn', 'dc.identifier.isbn', 'dcterms.identifier.isbn'),
- 'language': ('dc.language', 'dcterms.language'),
+ 'isbn': ('isbn',),
+ 'languages': ('dc.language', 'dcterms.language'),
'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'),
'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'),
'series': ('series',),
@@ -59,69 +60,85 @@ META_NAMES = {
# single quotes inside double quotes and vice versa.
attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")'''
-
-def parse_meta_tags(src):
- rmap = {}
- for field, names in iteritems(META_NAMES):
- for name in names:
- rmap[name.lower()] = field
- all_names = '|'.join(rmap)
- ans = {}
- npat = r'''name\s*=\s*['"]{0,1}(?P%s)['"]{0,1}''' % all_names
- cpat = r'content\s*=\s*%s' % attr_pat
- for pat in (
- r'%s)\s*=\s*%s''' % (all_names, attr_pat), src):
- field = rmap[match.group('name')]
- if field not in ans:
- ans[field] = replace_entities(match.group('content'))
- if len(ans) == len(COMMENT_NAMES):
- break
- return ans
+ def handle_charref(self, ref):
+     # HTMLParser passes only the digits of a numeric character reference
+     # ("62" or "x3E" for "&#62;" / "&#x3E;"), so the "&#" prefix has to be
+     # restored before replace_entities() can decode it.
+     if self.recording:
+         self.recorded.append(replace_entities("&#%s;" % ref))
+ def handle_entityref(self, ref):
+     # Only collect text while self.recording is set.
+     if not self.recording:
+         return
+     # Rebuild the named entity ("&name;") and store what
+     # replace_entities() returns for it.
+     entity = "&%s;" % ref
+     self.recorded.append(replace_entities(entity))
+
+ def handle_endtag(self, tag):
+     # Every closing tag other than </title> is ignored.
+     if tag != 'title':
+         return
+     # </title>: stop recording and join the recorded pieces into title_tag.
+     self.recording = False
+     self.title_tag = ''.join(self.recorded)
+
+ def handle_comment(self, data):
+     # Scan the comment text for NAME="value" / NAME='value' pairs. For every
+     # NAME that self.rmap_comment maps to a field, append the value (passed
+     # through replace_entities()) to that field's list in self.comment_tags.
+     for match in re.finditer(r'''(?P<name>\S+)\s*=\s*%s''' % attr_pat, data):
+         # Names missing from the map are skipped.
+         field = self.rmap_comment.get(match.group('name'))
+         if field:
+             self.comment_tags[field].append(replace_entities(match.group('content')))
+
+ parser = MetadataParser()
+ parser.feed(src)
+
+ return (parser.comment_tags, parser.meta_tags, parser.meta_tag_ids, parser.title_tag)
def get_metadata_(src, encoding=None):
# Meta data definitions as in
@@ -133,37 +150,44 @@ def get_metadata_(src, encoding=None):
else:
src = src.decode(encoding, 'replace')
src = src[:150000] # Searching shouldn't take too long
- comment_tags = parse_comment_tags(src)
- meta_tags = parse_meta_tags(src)
- meta_tag_ids = parse_meta_tag_identifiers(src)
+ (comment_tags, meta_tags, meta_tag_ids, title_tag) = parse_metadata(src)
- def get(field):
+ def get_all(field):
ans = comment_tags.get(field, meta_tags.get(field, None))
if ans:
- ans = ans.strip()
+ ans = [x.strip() for x in ans if x.strip()]
if not ans:
ans = None
return ans
+ def get(field):
+ ans = get_all(field)
+ if ans:
+ ans = ans[0]
+ return ans
+
# Title
- title = get('title')
- if not title:
- pat = re.compile('([^<>]+?)', re.IGNORECASE)
- match = pat.search(src)
- if match:
- title = replace_entities(match.group(1))
+ title = get('title') or title_tag.strip() or _('Unknown')
# Author
- authors = get('authors') or _('Unknown')
+ authors = authors_to_string(get_all('authors')) or _('Unknown')
# Create MetaInformation with Title and Author
- mi = Metadata(title or _('Unknown'), string_to_authors(authors))
+ mi = Metadata(title, string_to_authors(authors))
- for field in ('publisher', 'isbn', 'language', 'comments'):
+ # Single-value text fields
+ for field in ('publisher', 'isbn', 'comments'):
val = get(field)
if val:
setattr(mi, field, val)
+ # Multi-value text fields
+ for field in ('languages',):
+ val = get_all(field)
+ if val:
+ setattr(mi, field, val)
+
+ # Date fields
for field in ('pubdate', 'timestamp'):
try:
val = parse_date(get(field))
@@ -210,14 +234,16 @@ def get_metadata_(src, encoding=None):
pass
# TAGS
- tags = get('tags')
+ tags = get_all('tags')
if tags:
- tags = [x.strip() for x in tags.split(',') if x.strip()]
+ tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
if tags:
mi.tags = tags
# IDENTIFIERS
- for (k,v) in meta_tag_ids.iteritems():
- mi.set_identifier(k, v)
+ for (k,v) in iteritems(meta_tag_ids):
+ v = [x.strip() for x in v if x.strip()]
+ if v:
+ mi.set_identifier(k, v[0])
return mi