diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py
index aa74548e83..53aa993116 100644
--- a/src/calibre/ebooks/metadata/html.py
+++ b/src/calibre/ebooks/metadata/html.py
@@ -17,6 +17,9 @@ def get_metadata(stream):
src = stream.read()
return get_metadata_(src)
+def get_meta_regexp_(name):  # Returns the compiled, case-insensitive regex that get_metadata_ searches the document with for one metadata name (callers pass e.g. 'DC.title').
+ return re.compile('', re.IGNORECASE)  # NOTE(review): as shown the pattern literal is empty and `name` is never used, yet every caller reads match.group(1) -- the literal appears to have been lost; restore it from the upstream file.
+
def get_metadata_(src, encoding=None):
if not isinstance(src, unicode):
if not encoding:
@@ -24,6 +27,9 @@ def get_metadata_(src, encoding=None):
else:
src = src.decode(encoding, 'replace')
+ # The metadata names looked up below follow the definitions given in
+ # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
+
# Title
title = None
pat = re.compile(r'', re.DOTALL)
@@ -35,6 +41,13 @@ def get_metadata_(src, encoding=None):
match = pat.search(src)
if match:
title = match.group(1)
+ if not title:  # Nothing usable from the earlier title lookup: fall back to the per-name patterns.
+ for x in ('Title','DC.title','DCTERMS.title'):  # Names are tried in this order; the first one that matches wins.
+ pat = get_meta_regexp_(x)
+ match = pat.search(src)
+ if match:
+ title = match.group(1)  # Group 1 of the per-name pattern is taken as the value.
+ break
# Author
author = None
@@ -42,7 +55,15 @@ def get_metadata_(src, encoding=None):
match = pat.search(src)
if match:
author = match.group(2).replace(',', ';')
+ else:  # First pattern missed: fall back to the per-name patterns.
+ for x in ('Author','DC.creator.aut','DCTERMS.creator.aut'):  # Names are tried in this order; the first one that matches wins.
+ pat = get_meta_regexp_(x)
+ match = pat.search(src)
+ if match:
+ author = match.group(1)  # NOTE(review): unlike the branch above, ',' is not rewritten to ';' here -- confirm that is intended.
+ break
+ # Create MetaInformation with Title and Author
ent_pat = re.compile(r'&(\S+)?;')
if title:
title = ent_pat.sub(entity_to_unicode, title)
@@ -51,18 +72,142 @@ def get_metadata_(src, encoding=None):
mi = MetaInformation(title, [author] if author else None)
# Publisher
+ publisher = None  # Stays None when neither lookup below finds a publisher.
pat = re.compile(r'', re.DOTALL)
match = pat.search(src)
if match:
- mi.publisher = match.group(2)
+ publisher = match.group(2)  # Kept in a local so both lookups share the post-processing at the end.
+ else:  # First pattern missed: fall back to the per-name patterns.
+ for x in ('Publisher','DC.publisher','DCTERMS.publisher'):  # Names are tried in this order; the first one that matches wins.
+ pat = get_meta_regexp_(x)
+ match = pat.search(src)
+ if match:
+ publisher = match.group(1)
+ break
+ if publisher:  # mi.publisher is only assigned when a non-empty value was found.
+ mi.publisher = ent_pat.sub(entity_to_unicode, publisher)  # Every ent_pat match in the value is replaced through entity_to_unicode before storing.
# ISBN
+ isbn = None  # Stays None when neither lookup below finds an ISBN.
pat = re.compile(r'', re.DOTALL)
match = pat.search(src)
if match:
isbn = match.group(1)
+ else:  # First pattern missed: fall back to the per-name patterns.
+ for x in ('ISBN','DC.identifier.ISBN','DCTERMS.identifier.ISBN'):  # Names are tried in this order; the first one that matches wins.
+ pat = get_meta_regexp_(x)
+ match = pat.search(src)
+ if match:
+ isbn = match.group(1)
+ break
+ if isbn:  # The clean-up on the next line (drop everything except digits and x/X) now runs for a value from either lookup.
mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
+ # Language: try the first pattern, then the DC / DCTERMS names in order
+ language = None  # Stays None when neither lookup below finds a language.
+ pat = re.compile(r'', re.DOTALL)  # NOTE(review): pattern literal is empty as shown (always matches, has no group 1) -- looks lost; restore from upstream.
+ match = pat.search(src)
+ if match:
+ language = match.group(1)
+ else:
+ for x in ('DC.language','DCTERMS.language'):  # The first name that matches wins.
+ pat = get_meta_regexp_(x)
+ match = pat.search(src)
+ if match:
+ language = match.group(1)
+ break
+ if language:  # mi.language is only assigned when a non-empty value was found.
+ mi.language = language  # Stored exactly as captured: no entity replacement, no normalisation.
+
+ # Publication date: try the first pattern, then the listed names in order
+ pubdate = None  # Stays None when neither lookup below finds a date.
+ pat = re.compile(r'', re.DOTALL)  # NOTE(review): pattern literal is empty as shown (always matches, has no group 1) -- looks lost; restore from upstream.
+ match = pat.search(src)
+ if match:
+ pubdate = match.group(1)
+ else:
+ for x in ('Pubdate','Date of publication','DC.date.published','DC.date.publication','DC.date.issued','DCTERMS.issued'):  # The first name that matches wins.
+ pat = get_meta_regexp_(x)
+ match = pat.search(src)
+ if match:
+ pubdate = match.group(1)
+ break
+ if pubdate:  # mi.pubdate is only assigned when a non-empty value was found.
+ mi.pubdate = pubdate  # NOTE(review): assigns the raw captured text without parsing -- confirm MetaInformation.pubdate accepts a string.
+
+ # Timestamp (creation date): try the first pattern, then the listed names in order
+ timestamp = None  # Stays None when neither lookup below finds a value.
+ pat = re.compile(r'', re.DOTALL)  # NOTE(review): pattern literal is empty as shown (always matches, has no group 1) -- looks lost; restore from upstream.
+ match = pat.search(src)
+ if match:
+ timestamp = match.group(1)
+ else:
+ for x in ('Timestamp','Date of creation','DC.date.created','DC.date.creation','DCTERMS.created'):  # The first name that matches wins.
+ pat = get_meta_regexp_(x)
+ match = pat.search(src)
+ if match:
+ timestamp = match.group(1)
+ break
+ if timestamp:  # mi.timestamp is only assigned when a non-empty value was found.
+ mi.timestamp = timestamp  # NOTE(review): assigns the raw captured text without parsing -- confirm MetaInformation.timestamp accepts a string.
+
+ # Series: try the first pattern, then the "Series" name
+ series = None  # Stays None when neither lookup below finds a series.
+ pat = re.compile(r'', re.DOTALL)  # NOTE(review): pattern literal is empty as shown (always matches, has no group 1) -- looks lost; restore from upstream.
+ match = pat.search(src)
+ if match:
+ series = match.group(1)
+ else:
+ pat = get_meta_regexp_("Series")  # Single fallback name, so no loop is needed here.
+ match = pat.search(src)
+ if match:
+ series = match.group(1)
+ if series:  # mi.series is only assigned when a non-empty value was found.
+ mi.series = ent_pat.sub(entity_to_unicode, series)  # Every ent_pat match in the value is replaced through entity_to_unicode before storing.
+
+ # Rating: try the first pattern, then the "Rating" name
+ rating = None  # Stays None when neither lookup below finds a rating.
+ pat = re.compile(r'', re.DOTALL)  # NOTE(review): pattern literal is empty as shown (always matches, has no group 1) -- looks lost; restore from upstream.
+ match = pat.search(src)
+ if match:
+ rating = match.group(1)
+ else:
+ pat = get_meta_regexp_("Rating")  # Single fallback name, so no loop is needed here.
+ match = pat.search(src)
+ if match:
+ rating = match.group(1)
+ if rating:  # mi.rating is only assigned when a non-empty value was found.
+ mi.rating = rating  # NOTE(review): assigns the raw captured text, not a number -- confirm consumers of mi.rating accept a string.
+
+ # Comments: try the first pattern, then the "Comments" name
+ comments = None  # Stays None when neither lookup below finds comments.
+ pat = re.compile(r'', re.DOTALL)  # NOTE(review): pattern literal is empty as shown (always matches, has no group 1) -- looks lost; restore from upstream.
+ match = pat.search(src)
+ if match:
+ comments = match.group(1)
+ else:
+ pat = get_meta_regexp_("Comments")  # Single fallback name, so no loop is needed here.
+ match = pat.search(src)
+ if match:
+ comments = match.group(1)
+ if comments:  # mi.comments is only assigned when a non-empty value was found.
+ mi.comments = ent_pat.sub(entity_to_unicode, comments)  # Every ent_pat match in the value is replaced through entity_to_unicode before storing.
+
+ # Tags: try the first pattern, then the "Tags" name; the value is split into a list
+ tags = None  # Stays None when neither lookup below finds tags.
+ pat = re.compile(r'', re.DOTALL)  # NOTE(review): pattern literal is empty as shown (always matches, has no group 1) -- looks lost; restore from upstream.
+ match = pat.search(src)
+ if match:
+ tags = match.group(1)
+ else:
+ pat = get_meta_regexp_("Tags")  # Single fallback name, so no loop is needed here.
+ match = pat.search(src)
+ if match:
+ tags = match.group(1)
+ if tags:  # mi.tags is only assigned when a non-empty value was found.
+ mi.tags = ent_pat.sub(entity_to_unicode, tags).split(",")  # NOTE(review): splits on a bare ","; items keep surrounding whitespace ("a, b" -> ["a", " b"]) and empty items are kept -- consider stripping.
+
+ # Ready to return MetaInformation
return mi