Refactor HTML metadata parsing

Use an HTMLParser instead of regular expressions, parse the document only once, and handle multiple values for authors, tags, and languages.
This commit is contained in:
Christopher Szucko 2019-08-10 09:56:31 -05:00
parent 9c846f3b51
commit d36a23d795

View File

@ -10,9 +10,10 @@ Try to read metadata from an HTML file.
import re import re
from collections import defaultdict
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
from calibre.ebooks.metadata import string_to_authors from calibre.ebooks.metadata import string_to_authors, authors_to_string
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre import replace_entities, isbytestring from calibre import replace_entities, isbytestring
@ -30,7 +31,7 @@ COMMENT_NAMES = {
'authors': 'AUTHOR', 'authors': 'AUTHOR',
'publisher': 'PUBLISHER', 'publisher': 'PUBLISHER',
'isbn': 'ISBN', 'isbn': 'ISBN',
'language': 'LANGUAGE', 'languages': 'LANGUAGE',
'pubdate': 'PUBDATE', 'pubdate': 'PUBDATE',
'timestamp': 'TIMESTAMP', 'timestamp': 'TIMESTAMP',
'series': 'SERIES', 'series': 'SERIES',
@ -44,8 +45,8 @@ META_NAMES = {
'title' : ('dc.title', 'dcterms.title', 'title'), 'title' : ('dc.title', 'dcterms.title', 'title'),
'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'), 'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'),
'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'), 'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'),
'isbn': ('isbn', 'dc.identifier.isbn', 'dcterms.identifier.isbn'), 'isbn': ('isbn',),
'language': ('dc.language', 'dcterms.language'), 'languages': ('dc.language', 'dcterms.language'),
'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'), 'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'),
'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'), 'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'),
'series': ('series',), 'series': ('series',),
@ -59,69 +60,85 @@ META_NAMES = {
# single quotes inside double quotes and vice versa. # single quotes inside double quotes and vice versa.
attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")''' attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")'''
def parse_metadata(src):
    '''
    Collect raw metadata from HTML source text in a single parsing pass.

    Three sources are harvested while the document is parsed once:

      * ``<!-- NAME="value" -->`` style comments whose NAME is one of the
        values of COMMENT_NAMES,
      * ``<meta name="..." content="...">`` tags whose name is listed in
        META_NAMES (a ``:`` in the name is also accepted in place of ``.``),
      * Dublin Core identifier meta tags (``dc.identifier``,
        ``dcterms.identifier.<scheme>``, ...).

    :param src: The HTML source text to scan.
    :return: A 4-tuple ``(comment_tags, meta_tags, meta_tag_ids, title_tag)``.
        The first three are ``defaultdict(list)`` objects mapping a field
        name (for ``meta_tag_ids``: the lowercased identifier scheme) to the
        list of values found, in document order. Values taken from meta tags
        are the raw ``content`` attributes and are not stripped. ``title_tag``
        is the text of the first non-empty ``<title>`` element, or ``''`` if
        there is none.
    '''
    # Compiled once per call rather than once per tag/comment encountered.
    identifier_pat = re.compile(
        r'(?:dc|dcterms)[.:]identifier(?:\.|$)', flags=re.IGNORECASE)
    bare_identifier_pat = re.compile(
        r'(?:dc|dcterms)[.:]identifier$', flags=re.IGNORECASE)
    comment_pat = re.compile(r'''(?P<name>\S+)\s*=\s*%s''' % attr_pat)

    class MetadataParser(HTMLParser):

        def __init__(self):
            HTMLParser.__init__(self)
            self.comment_tags = defaultdict(list)
            self.meta_tag_ids = defaultdict(list)
            self.meta_tags = defaultdict(list)
            self.title_tag = ''
            self.recording = False
            self.recorded = []
            # Reverse maps: name as it appears in the document -> field name.
            self.rmap_comment = {
                name: field for field, name in iteritems(COMMENT_NAMES)}
            # Keys are lowercased because lookups use the lowercased name.
            self.rmap_meta = {
                name.lower(): field
                for field, names in iteritems(META_NAMES) for name in names}

        def handle_starttag(self, tag, attrs):
            if tag == 'title':
                # Only the first non-empty <title> counts; later ones (for
                # example inside inline SVG) must not replace it.
                if not self.title_tag:
                    self.recording = True
                    self.recorded = []
                return
            if tag != 'meta':
                return

            # A valueless attribute (<meta name>) is reported with a value of
            # None; normalise it to '' so the string operations below cannot
            # raise TypeError. The key is kept so presence checks still work.
            attr_dict = {k: ('' if v is None else v) for k, v in attrs}
            name = attr_dict.get('name', '')
            content = attr_dict.get('content', '')

            if identifier_pat.match(name):
                scheme = None
                if bare_identifier_pat.match(name):
                    # <meta name="dc.identifier" scheme="ISBN" ...>
                    scheme = attr_dict.get('scheme', '').strip()
                elif 'scheme' not in attr_dict:
                    # <meta name="dc.identifier.isbn" ...>: the scheme is the
                    # third component of the name.
                    elements = re.split(r'[.:]', name)
                    if len(elements) == 3:
                        scheme = elements[2].strip()
                if scheme:
                    self.meta_tag_ids[scheme.lower()].append(content)
                return

            key = name.lower()
            field = self.rmap_meta.get(key)
            if field is None:
                field = self.rmap_meta.get(key.replace(':', '.'))
            if field:
                self.meta_tags[field].append(content)

        def handle_data(self, data):
            if self.recording:
                self.recorded.append(data)

        def handle_charref(self, ref):
            if self.recording:
                self.recorded.append(replace_entities("&#%s;" % ref))

        def handle_entityref(self, ref):
            if self.recording:
                self.recorded.append(replace_entities("&%s;" % ref))

        def handle_endtag(self, tag):
            if tag == 'title' and self.recording:
                self.recording = False
                self.title_tag = ''.join(self.recorded)

        def handle_comment(self, data):
            for match in comment_pat.finditer(data):
                field = self.rmap_comment.get(match.group('name'))
                if field:
                    self.comment_tags[field].append(
                        replace_entities(match.group('content')))

    parser = MetadataParser()
    # NOTE(review): close() is deliberately not called; the caller appears to
    # pass truncated markup, which may end in the middle of a construct.
    parser.feed(src)

    return (parser.comment_tags, parser.meta_tags, parser.meta_tag_ids, parser.title_tag)
def get_metadata_(src, encoding=None): def get_metadata_(src, encoding=None):
# Meta data definitions as in # Meta data definitions as in
@ -133,37 +150,44 @@ def get_metadata_(src, encoding=None):
else: else:
src = src.decode(encoding, 'replace') src = src.decode(encoding, 'replace')
src = src[:150000] # Searching shouldn't take too long src = src[:150000] # Searching shouldn't take too long
comment_tags = parse_comment_tags(src) (comment_tags, meta_tags, meta_tag_ids, title_tag) = parse_metadata(src)
meta_tags = parse_meta_tags(src)
meta_tag_ids = parse_meta_tag_identifiers(src)
def get(field): def get_all(field):
ans = comment_tags.get(field, meta_tags.get(field, None)) ans = comment_tags.get(field, meta_tags.get(field, None))
if ans: if ans:
ans = ans.strip() ans = [x.strip() for x in ans if x.strip()]
if not ans: if not ans:
ans = None ans = None
return ans return ans
def get(field):
ans = get_all(field)
if ans:
ans = ans[0]
return ans
# Title # Title
title = get('title') title = get('title') or title_tag.strip() or _('Unknown')
if not title:
pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
match = pat.search(src)
if match:
title = replace_entities(match.group(1))
# Author # Author
authors = get('authors') or _('Unknown') authors = authors_to_string(get_all('authors')) or _('Unknown')
# Create MetaInformation with Title and Author # Create MetaInformation with Title and Author
mi = Metadata(title or _('Unknown'), string_to_authors(authors)) mi = Metadata(title, string_to_authors(authors))
for field in ('publisher', 'isbn', 'language', 'comments'): # Single-value text fields
for field in ('publisher', 'isbn', 'comments'):
val = get(field) val = get(field)
if val: if val:
setattr(mi, field, val) setattr(mi, field, val)
# Multi-value text fields
for field in ('languages',):
val = get_all(field)
if val:
setattr(mi, field, val)
# Date fields
for field in ('pubdate', 'timestamp'): for field in ('pubdate', 'timestamp'):
try: try:
val = parse_date(get(field)) val = parse_date(get(field))
@ -210,14 +234,16 @@ def get_metadata_(src, encoding=None):
pass pass
# TAGS # TAGS
tags = get('tags') tags = get_all('tags')
if tags: if tags:
tags = [x.strip() for x in tags.split(',') if x.strip()] tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
if tags: if tags:
mi.tags = tags mi.tags = tags
# IDENTIFIERS # IDENTIFIERS
for (k,v) in meta_tag_ids.iteritems(): for (k,v) in iteritems(meta_tag_ids):
mi.set_identifier(k, v) v = [x.strip() for x in v if x.strip()]
if v:
mi.set_identifier(k, v[0])
return mi return mi