mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Support Dublin Core id tags when importing HTML
All of the formats below are supported, and each would be interpreted as "foo:bar": <meta name="DC.identifier" scheme="foo" content="bar" /> <meta name="dc:identifier.foo" content="bar" /> <meta name="DCTERMS:identifier" scheme="foo" content="bar" /> <meta name="dcterms.identifier.foo" content="bar" />
This commit is contained in:
parent
c1663d3cc8
commit
9c846f3b51
@ -10,6 +10,8 @@ Try to read metadata from an HTML file.
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from HTMLParser import HTMLParser
|
||||||
|
|
||||||
from calibre.ebooks.metadata import string_to_authors
|
from calibre.ebooks.metadata import string_to_authors
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
@ -87,6 +89,26 @@ def parse_meta_tags(src):
|
|||||||
return ans
|
return ans
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
def parse_meta_tag_identifiers(src):
    '''
    Collect Dublin Core identifiers declared in the <meta> tags of ``src``.

    A <meta> tag is considered when its ``name`` attribute starts with
    ``dc`` or ``dcterms``, followed by ``.`` or ``:`` and ``identifier``
    (case-insensitive). The identifier scheme is taken from the ``scheme``
    attribute or, when that is absent or blank, from a third ``.``/``:``
    separated component of the name. For example, each of these yields
    ``{'foo': 'bar'}``::

        <meta name="DC.identifier" scheme="foo" content="bar" />
        <meta name="dc:identifier.foo" content="bar" />
        <meta name="DCTERMS:identifier" scheme="foo" content="bar" />
        <meta name="dcterms.identifier.foo" content="bar" />

    :param src: HTML markup to scan, as text.
    :return: dict mapping the lower-cased scheme to the tag's content after
        it has been passed through ``replace_entities``. Tags lacking either
        a scheme or a non-blank content are ignored; if a scheme occurs more
        than once, the last occurrence wins.
    '''
    meta_identifiers = {}
    # Compiled once here rather than looked up again for every start tag.
    name_pat = re.compile(r'(?:dc|dcterms)[\.:]identifier', flags=re.IGNORECASE)

    class MetadataParser(HTMLParser):

        def handle_starttag(self, tag, attrs):
            if tag != 'meta':
                return
            # HTMLParser reports attributes written without a value
            # (e.g. <meta name content>) as None. Normalise those to '' so
            # the regex match and .strip() calls below cannot raise.
            attr_dict = {k: (v or '') for k, v in attrs}
            name = attr_dict.get('name', '')
            if not name_pat.match(name):
                return

            content = attr_dict.get('content', '').strip()
            scheme = attr_dict.get('scheme', '').strip()
            if not scheme:
                # Fall back to the "prefix.identifier.scheme" spelling.
                elements = re.split(r'[\.:]', name)
                if len(elements) == 3:
                    scheme = elements[2]
            if content and scheme:
                meta_identifiers[scheme.lower()] = replace_entities(content)

    MetadataParser().feed(src)

    return meta_identifiers
|
||||||
|
|
||||||
def parse_comment_tags(src):
|
def parse_comment_tags(src):
|
||||||
all_names = '|'.join(itervalues(COMMENT_NAMES))
|
all_names = '|'.join(itervalues(COMMENT_NAMES))
|
||||||
@ -113,6 +135,7 @@ def get_metadata_(src, encoding=None):
|
|||||||
src = src[:150000] # Searching shouldn't take too long
|
src = src[:150000] # Searching shouldn't take too long
|
||||||
comment_tags = parse_comment_tags(src)
|
comment_tags = parse_comment_tags(src)
|
||||||
meta_tags = parse_meta_tags(src)
|
meta_tags = parse_meta_tags(src)
|
||||||
|
meta_tag_ids = parse_meta_tag_identifiers(src)
|
||||||
|
|
||||||
def get(field):
|
def get(field):
|
||||||
ans = comment_tags.get(field, meta_tags.get(field, None))
|
ans = comment_tags.get(field, meta_tags.get(field, None))
|
||||||
@ -193,4 +216,8 @@ def get_metadata_(src, encoding=None):
|
|||||||
if tags:
|
if tags:
|
||||||
mi.tags = tags
|
mi.tags = tags
|
||||||
|
|
||||||
|
# IDENTIFIERS
|
||||||
|
for (k,v) in meta_tag_ids.iteritems():
|
||||||
|
mi.set_identifier(k, v)
|
||||||
|
|
||||||
return mi
|
return mi
|
||||||
|
Loading…
x
Reference in New Issue
Block a user