Added more metadata support for HTML

This commit is contained in:
Magnus Widerberg 2010-07-19 13:11:03 +02:00
parent f4b097fad0
commit 8e338e7ee0

View File

@ -17,6 +17,9 @@ def get_metadata(stream):
src = stream.read()
return get_metadata_(src)
def get_meta_regexp_(name):
return re.compile('<meta name=[\'"]' + name + '[\'"] content=[\'"](.+?)[\'"]\s*/?>', re.IGNORECASE)
def get_metadata_(src, encoding=None):
if not isinstance(src, unicode):
if not encoding:
@ -24,6 +27,9 @@ def get_metadata_(src, encoding=None):
else:
src = src.decode(encoding, 'replace')
# Meta data definitions as in
# http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
# Title
title = None
pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
@ -35,6 +41,13 @@ def get_metadata_(src, encoding=None):
match = pat.search(src)
if match:
title = match.group(1)
if not title:
for x in ('Title','DC.title','DCTERMS.title'):
pat = get_meta_regexp_(x)
match = pat.search(src)
if match:
title = match.group(1)
break
# Author
author = None
@ -42,7 +55,15 @@ def get_metadata_(src, encoding=None):
match = pat.search(src)
if match:
author = match.group(2).replace(',', ';')
else:
for x in ('Author','DC.creator.aut','DCTERMS.creator.aut'):
pat = get_meta_regexp_(x)
match = pat.search(src)
if match:
author = match.group(1)
break
# Create MetaInformation with Title and Author
ent_pat = re.compile(r'&(\S+)?;')
if title:
title = ent_pat.sub(entity_to_unicode, title)
@ -51,18 +72,142 @@ def get_metadata_(src, encoding=None):
mi = MetaInformation(title, [author] if author else None)
# Publisher
publisher = None
pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
match = pat.search(src)
if match:
mi.publisher = match.group(2)
publisher = match.group(2)
else:
for x in ('Publisher','DC.publisher','DCTERMS.publisher'):
pat = get_meta_regexp_(x)
match = pat.search(src)
if match:
publisher = match.group(1)
break
if publisher:
mi.publisher = ent_pat.sub(entity_to_unicode, publisher)
# ISBN
isbn = None
pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
match = pat.search(src)
if match:
isbn = match.group(1)
else:
for x in ('ISBN','DC.identifier.ISBN','DCTERMS.identifier.ISBN'):
pat = get_meta_regexp_(x)
match = pat.search(src)
if match:
isbn = match.group(1)
break
if isbn:
mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
# LANGUAGE
language = None
pat = re.compile(r'<!--.*?LANGUAGE=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
match = pat.search(src)
if match:
language = match.group(1)
else:
for x in ('DC.language','DCTERMS.language'):
pat = get_meta_regexp_(x)
match = pat.search(src)
if match:
language = match.group(1)
break
if language:
mi.language = language
# PUBDATE
pubdate = None
pat = re.compile(r'<!--.*?PUBDATE=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
match = pat.search(src)
if match:
pubdate = match.group(1)
else:
for x in ('Pubdate','Date of publication','DC.date.published','DC.date.publication','DC.date.issued','DCTERMS.issued'):
pat = get_meta_regexp_(x)
match = pat.search(src)
if match:
pubdate = match.group(1)
break
if pubdate:
mi.pubdate = pubdate
# TIMESTAMP
timestamp = None
pat = re.compile(r'<!--.*?TIMESTAMP=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
match = pat.search(src)
if match:
timestamp = match.group(1)
else:
for x in ('Timestamp','Date of creation','DC.date.created','DC.date.creation','DCTERMS.created'):
pat = get_meta_regexp_(x)
match = pat.search(src)
if match:
timestamp = match.group(1)
break
if timestamp:
mi.timestamp = timestamp
# SERIES
series = None
pat = re.compile(r'<!--.*?SERIES=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
match = pat.search(src)
if match:
series = match.group(1)
else:
pat = get_meta_regexp_("Series")
match = pat.search(src)
if match:
series = match.group(1)
if series:
mi.series = ent_pat.sub(entity_to_unicode, series)
# RATING
rating = None
pat = re.compile(r'<!--.*?RATING=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
match = pat.search(src)
if match:
rating = match.group(1)
else:
pat = get_meta_regexp_("Rating")
match = pat.search(src)
if match:
rating = match.group(1)
if rating:
mi.rating = rating
# COMMENTS
comments = None
pat = re.compile(r'<!--.*?COMMENTS=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
match = pat.search(src)
if match:
comments = match.group(1)
else:
pat = get_meta_regexp_("Comments")
match = pat.search(src)
if match:
comments = match.group(1)
if comments:
mi.comments = ent_pat.sub(entity_to_unicode, comments)
# TAGS
tags = None
pat = re.compile(r'<!--.*?TAGS=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
match = pat.search(src)
if match:
tags = match.group(1)
else:
pat = get_meta_regexp_("Tags")
match = pat.search(src)
if match:
tags = match.group(1)
if tags:
mi.tags = ent_pat.sub(entity_to_unicode, tags).split(",")
# Ready to return MetaInformation
return mi