mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Added more metadata support for HTML
This commit is contained in:
parent
f4b097fad0
commit
8e338e7ee0
@ -17,6 +17,9 @@ def get_metadata(stream):
|
||||
src = stream.read()
|
||||
return get_metadata_(src)
|
||||
|
||||
def get_meta_regexp_(name):
    """Build a regex that extracts the content of a named HTML <meta> tag.

    The returned pattern matches tags of the form
    ``<meta name="NAME" content="VALUE">`` (optionally self-closed with
    ``/>``), with either single or double quotes around the attribute
    values, and is case-insensitive. Group 1 of a match holds VALUE.

    :param name: Literal value of the ``name`` attribute to look for,
        e.g. ``'DC.title'``. It is matched literally, not as a regex.
    :return: A compiled regular expression object.
    """
    quote = '[\'"]'  # either quote style may delimit an attribute value
    # Escape the name: metadata names such as 'DC.title' contain '.', which
    # would otherwise match any character and accept unrelated meta tags.
    pattern = (
        '<meta name=' + quote + re.escape(name) + quote +
        ' content=' + quote + '(.+?)' + quote + r'\s*/?>'
    )
    return re.compile(pattern, re.IGNORECASE)
|
||||
|
||||
def get_metadata_(src, encoding=None):
|
||||
if not isinstance(src, unicode):
|
||||
if not encoding:
|
||||
@ -24,6 +27,9 @@ def get_metadata_(src, encoding=None):
|
||||
else:
|
||||
src = src.decode(encoding, 'replace')
|
||||
|
||||
# Meta data definitions as in
|
||||
# http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
|
||||
|
||||
# Title
|
||||
title = None
|
||||
pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
||||
@ -35,6 +41,13 @@ def get_metadata_(src, encoding=None):
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
title = match.group(1)
|
||||
if not title:
|
||||
for x in ('Title','DC.title','DCTERMS.title'):
|
||||
pat = get_meta_regexp_(x)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
title = match.group(1)
|
||||
break
|
||||
|
||||
# Author
|
||||
author = None
|
||||
@ -42,7 +55,15 @@ def get_metadata_(src, encoding=None):
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
author = match.group(2).replace(',', ';')
|
||||
else:
|
||||
for x in ('Author','DC.creator.aut','DCTERMS.creator.aut'):
|
||||
pat = get_meta_regexp_(x)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
author = match.group(1)
|
||||
break
|
||||
|
||||
# Create MetaInformation with Title and Author
|
||||
ent_pat = re.compile(r'&(\S+)?;')
|
||||
if title:
|
||||
title = ent_pat.sub(entity_to_unicode, title)
|
||||
@ -51,18 +72,142 @@ def get_metadata_(src, encoding=None):
|
||||
mi = MetaInformation(title, [author] if author else None)
|
||||
|
||||
# Publisher
|
||||
publisher = None
|
||||
pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
mi.publisher = match.group(2)
|
||||
publisher = match.group(2)
|
||||
else:
|
||||
for x in ('Publisher','DC.publisher','DCTERMS.publisher'):
|
||||
pat = get_meta_regexp_(x)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
publisher = match.group(1)
|
||||
break
|
||||
if publisher:
|
||||
mi.publisher = ent_pat.sub(entity_to_unicode, publisher)
|
||||
|
||||
# ISBN
|
||||
isbn = None
|
||||
pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
isbn = match.group(1)
|
||||
else:
|
||||
for x in ('ISBN','DC.identifier.ISBN','DCTERMS.identifier.ISBN'):
|
||||
pat = get_meta_regexp_(x)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
isbn = match.group(1)
|
||||
break
|
||||
if isbn:
|
||||
mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
|
||||
|
||||
# LANGUAGE
|
||||
language = None
|
||||
pat = re.compile(r'<!--.*?LANGUAGE=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
language = match.group(1)
|
||||
else:
|
||||
for x in ('DC.language','DCTERMS.language'):
|
||||
pat = get_meta_regexp_(x)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
language = match.group(1)
|
||||
break
|
||||
if language:
|
||||
mi.language = language
|
||||
|
||||
# PUBDATE
|
||||
pubdate = None
|
||||
pat = re.compile(r'<!--.*?PUBDATE=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
pubdate = match.group(1)
|
||||
else:
|
||||
for x in ('Pubdate','Date of publication','DC.date.published','DC.date.publication','DC.date.issued','DCTERMS.issued'):
|
||||
pat = get_meta_regexp_(x)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
pubdate = match.group(1)
|
||||
break
|
||||
if pubdate:
|
||||
mi.pubdate = pubdate
|
||||
|
||||
# TIMESTAMP
|
||||
timestamp = None
|
||||
pat = re.compile(r'<!--.*?TIMESTAMP=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
timestamp = match.group(1)
|
||||
else:
|
||||
for x in ('Timestamp','Date of creation','DC.date.created','DC.date.creation','DCTERMS.created'):
|
||||
pat = get_meta_regexp_(x)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
timestamp = match.group(1)
|
||||
break
|
||||
if timestamp:
|
||||
mi.timestamp = timestamp
|
||||
|
||||
# SERIES
|
||||
series = None
|
||||
pat = re.compile(r'<!--.*?SERIES=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
series = match.group(1)
|
||||
else:
|
||||
pat = get_meta_regexp_("Series")
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
series = match.group(1)
|
||||
if series:
|
||||
mi.series = ent_pat.sub(entity_to_unicode, series)
|
||||
|
||||
# RATING
|
||||
rating = None
|
||||
pat = re.compile(r'<!--.*?RATING=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
rating = match.group(1)
|
||||
else:
|
||||
pat = get_meta_regexp_("Rating")
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
rating = match.group(1)
|
||||
if rating:
|
||||
mi.rating = rating
|
||||
|
||||
# COMMENTS
|
||||
comments = None
|
||||
pat = re.compile(r'<!--.*?COMMENTS=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
comments = match.group(1)
|
||||
else:
|
||||
pat = get_meta_regexp_("Comments")
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
comments = match.group(1)
|
||||
if comments:
|
||||
mi.comments = ent_pat.sub(entity_to_unicode, comments)
|
||||
|
||||
# TAGS
|
||||
tags = None
|
||||
pat = re.compile(r'<!--.*?TAGS=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
tags = match.group(1)
|
||||
else:
|
||||
pat = get_meta_regexp_("Tags")
|
||||
match = pat.search(src)
|
||||
if match:
|
||||
tags = match.group(1)
|
||||
if tags:
|
||||
mi.tags = ent_pat.sub(entity_to_unicode, tags).split(",")
|
||||
|
||||
# Ready to return MetaInformation
|
||||
return mi
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user