Refactor HTML metadata parsing
Use an HTMLParser rather than regex, only parse the document once, and add handling for multiple values for authors, tags, and languages
parent 9c846f3b51
commit d36a23d795
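The refactor drops the three separate regex passes (parse_meta_tags, parse_comment_tags, parse_meta_tag_identifiers) in favour of a single HTMLParser subclass that walks the document once and appends every value it sees, so multi-valued fields such as authors, tags, and languages keep all occurrences. Below is a minimal, self-contained sketch of that single-pass technique; it is illustrative only, not the patch itself: the class and variable names (MetaCollector, title_parts, in_title) are invented for the example, and it uses Python 3's html.parser where the patch targets the Python 2 HTMLParser module.

from collections import defaultdict
from html.parser import HTMLParser


class MetaCollector(HTMLParser):
    # Hypothetical example class, not part of the patch.

    def __init__(self):
        HTMLParser.__init__(self)
        self.meta_tags = defaultdict(list)   # meta name -> every content value seen
        self.title_parts = []
        self.in_title = False

    def handle_starttag(self, tag, attrs):
        attr_dict = dict(attrs)
        if tag == 'meta' and attr_dict.get('name'):
            # Append rather than overwrite, so repeated meta names are all kept.
            self.meta_tags[attr_dict['name'].lower()].append(attr_dict.get('content', ''))
        elif tag == 'title':
            self.in_title = True

    def handle_data(self, data):
        if self.in_title:
            self.title_parts.append(data)

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False


src = '''<html><head>
<title>Example Book</title>
<meta name="author" content="First Author">
<meta name="author" content="Second Author">
<meta name="dc.language" content="en">
</head><body></body></html>'''

parser = MetaCollector()
parser.feed(src)                      # the document is walked exactly once
print(''.join(parser.title_parts))    # Example Book
print(parser.meta_tags['author'])     # ['First Author', 'Second Author']

In the patch itself, the collector additionally records comment-embedded fields and dc/dcterms identifier schemes, and get_metadata_ consumes the collected lists through get_all()/get() helpers, as the diff below shows.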
@@ -10,9 +10,10 @@ Try to read metadata from an HTML file.

 import re

+from collections import defaultdict
 from HTMLParser import HTMLParser

-from calibre.ebooks.metadata import string_to_authors
+from calibre.ebooks.metadata import string_to_authors, authors_to_string
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre import replace_entities, isbytestring
@@ -30,7 +31,7 @@ COMMENT_NAMES = {
     'authors': 'AUTHOR',
     'publisher': 'PUBLISHER',
     'isbn': 'ISBN',
-    'language': 'LANGUAGE',
+    'languages': 'LANGUAGE',
     'pubdate': 'PUBDATE',
     'timestamp': 'TIMESTAMP',
     'series': 'SERIES',
@@ -44,8 +45,8 @@ META_NAMES = {
     'title' : ('dc.title', 'dcterms.title', 'title'),
     'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'),
     'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'),
-    'isbn': ('isbn', 'dc.identifier.isbn', 'dcterms.identifier.isbn'),
-    'language': ('dc.language', 'dcterms.language'),
+    'isbn': ('isbn',),
+    'languages': ('dc.language', 'dcterms.language'),
     'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'),
     'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'),
     'series': ('series',),
@@ -59,69 +60,85 @@ META_NAMES = {
 # single quotes inside double quotes and vice versa.
 attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")'''

+def parse_metadata(src):
-def parse_meta_tags(src):
-    rmap = {}
-    for field, names in iteritems(META_NAMES):
-        for name in names:
-            rmap[name.lower()] = field
-    all_names = '|'.join(rmap)
-    ans = {}
-    npat = r'''name\s*=\s*['"]{0,1}(?P<name>%s)['"]{0,1}''' % all_names
-    cpat = r'content\s*=\s*%s' % attr_pat
-    for pat in (
-            r'<meta\s+%s\s+%s' % (npat, cpat),
-            r'<meta\s+%s\s+%s' % (cpat, npat),
-    ):
-        for match in re.finditer(pat, src, flags=re.IGNORECASE):
-            x = match.group('name').lower()
-            try:
-                field = rmap[x]
-            except KeyError:
-                try:
-                    field = rmap[x.replace(':', '.')]
-                except KeyError:
-                    continue
-
-            if field not in ans:
-                ans[field] = replace_entities(match.group('content'))
-            if len(ans) == len(META_NAMES):
-                return ans
-    return ans

-def parse_meta_tag_identifiers(src):
-    meta_identifiers = {}

     class MetadataParser(HTMLParser):
+        def __init__(self):
+            self.comment_tags = defaultdict(list)
+            self.meta_tag_ids = defaultdict(list)
+            self.meta_tags = defaultdict(list)
+            self.title_tag = ''
+
+            self.recording = False
+            self.recorded = []
+
+            self.rmap_comment = {v:k for k, v in iteritems(COMMENT_NAMES)}
+            self.rmap_meta = {v:k for k, l in iteritems(META_NAMES) for v in l}
+
+            HTMLParser.__init__(self)

         def handle_starttag(self, tag, attrs):
             attr_dict = dict(attrs)

-            if tag == 'meta' and re.match(r'(?:dc|dcterms)[\.:]identifier', attr_dict.get('name', ''), flags=re.IGNORECASE):
-                content = attr_dict.get('content', '').strip()
+            if tag == 'title':
+                self.recording = True
+                self.recorded = []
-
+            elif tag == 'meta' and re.match(r'(?:dc|dcterms)[.:]identifier(?:\.|$)', attr_dict.get('name', ''), flags=re.IGNORECASE):
+                scheme = None
+                if re.match(r'(?:dc|dcterms)[.:]identifier$', attr_dict.get('name', ''), flags=re.IGNORECASE):
                     scheme = attr_dict.get('scheme', '').strip()
-                if not scheme:
-                    elements = re.split(r'[\.:]', attr_dict['name'])
+                elif 'scheme' not in attr_dict:
+                    elements = re.split(r'[.:]', attr_dict['name'])
                     if len(elements) == 3:
-                        scheme = elements[2]
-                if content and scheme:
-                    meta_identifiers[scheme.lower()] = replace_entities(content)
+                        scheme = elements[2].strip()
+                if scheme:
+                    self.meta_tag_ids[scheme.lower()].append(attr_dict.get('content', ''))
-
-    MetadataParser().feed(src)
+            elif tag == 'meta':
+                x = attr_dict.get('name', '').lower()
+                field = None
+                try:
+                    field = self.rmap_meta[x]
+                except KeyError:
+                    try:
+                        field = self.rmap_meta[x.replace(':', '.')]
+                    except KeyError:
+                        pass
+                if field:
+                    self.meta_tags[field].append(attr_dict.get('content', ''))

-    return meta_identifiers
+        def handle_data(self, data):
+            if self.recording:
+                self.recorded.append(data)

-def parse_comment_tags(src):
-    all_names = '|'.join(itervalues(COMMENT_NAMES))
-    rmap = {v:k for k, v in iteritems(COMMENT_NAMES)}
+        def handle_charref(self, ref):
+            if self.recording:
+                self.recorded.append(replace_entities("&#%s;" % ref))
-    ans = {}
-    for match in re.finditer(r'''<!--\s*(?P<name>%s)\s*=\s*%s''' % (all_names, attr_pat), src):
-        field = rmap[match.group('name')]
-        if field not in ans:
-            ans[field] = replace_entities(match.group('content'))
-        if len(ans) == len(COMMENT_NAMES):
-            break
-    return ans

+        def handle_entityref(self, ref):
+            if self.recording:
+                self.recorded.append(replace_entities("&%s;" % ref))
+
+        def handle_endtag(self, tag):
+            if tag == 'title':
+                self.recording = False
+                self.title_tag = ''.join(self.recorded)
+
+        def handle_comment(self, data):
+            for match in re.finditer(r'''(?P<name>\S+)\s*=\s*%s''' % (attr_pat), data):
+                x = match.group('name')
+                field = None
+                try:
+                    field = self.rmap_comment[x]
+                except KeyError:
+                    pass
+                if field:
+                    self.comment_tags[field].append(replace_entities(match.group('content')))
+
+    parser = MetadataParser()
+    parser.feed(src)
+
+    return (parser.comment_tags, parser.meta_tags, parser.meta_tag_ids, parser.title_tag)
+
 def get_metadata_(src, encoding=None):
     # Meta data definitions as in
@@ -133,37 +150,44 @@ def get_metadata_(src, encoding=None):
     else:
         src = src.decode(encoding, 'replace')
     src = src[:150000] # Searching shouldn't take too long
-    comment_tags = parse_comment_tags(src)
-    meta_tags = parse_meta_tags(src)
-    meta_tag_ids = parse_meta_tag_identifiers(src)
+    (comment_tags, meta_tags, meta_tag_ids, title_tag) = parse_metadata(src)

-    def get(field):
+    def get_all(field):
         ans = comment_tags.get(field, meta_tags.get(field, None))
         if ans:
-            ans = ans.strip()
+            ans = [x.strip() for x in ans if x.strip()]
         if not ans:
             ans = None
         return ans

+    def get(field):
+        ans = get_all(field)
+        if ans:
+            ans = ans[0]
+        return ans
+
     # Title
-    title = get('title')
-    if not title:
-        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
-        match = pat.search(src)
-        if match:
-            title = replace_entities(match.group(1))
+    title = get('title') or title_tag.strip() or _('Unknown')

     # Author
-    authors = get('authors') or _('Unknown')
+    authors = authors_to_string(get_all('authors')) or _('Unknown')

     # Create MetaInformation with Title and Author
-    mi = Metadata(title or _('Unknown'), string_to_authors(authors))
+    mi = Metadata(title, string_to_authors(authors))

-    for field in ('publisher', 'isbn', 'language', 'comments'):
+    # Single-value text fields
+    for field in ('publisher', 'isbn', 'comments'):
         val = get(field)
         if val:
             setattr(mi, field, val)

+    # Multi-value text fields
+    for field in ('languages',):
+        val = get_all(field)
+        if val:
+            setattr(mi, field, val)
+
+    # Date fields
     for field in ('pubdate', 'timestamp'):
         try:
             val = parse_date(get(field))
@@ -210,14 +234,16 @@ def get_metadata_(src, encoding=None):
             pass

     # TAGS
-    tags = get('tags')
+    tags = get_all('tags')
     if tags:
-        tags = [x.strip() for x in tags.split(',') if x.strip()]
+        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
         if tags:
             mi.tags = tags

     # IDENTIFIERS
-    for (k,v) in meta_tag_ids.iteritems():
-        mi.set_identifier(k, v)
+    for (k,v) in iteritems(meta_tag_ids):
+        v = [x.strip() for x in v if x.strip()]
+        if v:
+            mi.set_identifier(k, v[0])

     return mi