diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index aa74548e83..53aa993116 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -17,6 +17,9 @@ def get_metadata(stream): src = stream.read() return get_metadata_(src) +def get_meta_regexp_(name): + return re.compile('', re.IGNORECASE) + def get_metadata_(src, encoding=None): if not isinstance(src, unicode): if not encoding: @@ -24,6 +27,9 @@ def get_metadata_(src, encoding=None): else: src = src.decode(encoding, 'replace') + # Meta data definitions as in + # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9 + # Title title = None pat = re.compile(r'', re.DOTALL) @@ -35,6 +41,13 @@ def get_metadata_(src, encoding=None): match = pat.search(src) if match: title = match.group(1) + if not title: + for x in ('Title','DC.title','DCTERMS.title'): + pat = get_meta_regexp_(x) + match = pat.search(src) + if match: + title = match.group(1) + break # Author author = None @@ -42,7 +55,15 @@ def get_metadata_(src, encoding=None): match = pat.search(src) if match: author = match.group(2).replace(',', ';') + else: + for x in ('Author','DC.creator.aut','DCTERMS.creator.aut'): + pat = get_meta_regexp_(x) + match = pat.search(src) + if match: + author = match.group(1) + break + # Create MetaInformation with Title and Author ent_pat = re.compile(r'&(\S+)?;') if title: title = ent_pat.sub(entity_to_unicode, title) @@ -51,18 +72,142 @@ def get_metadata_(src, encoding=None): mi = MetaInformation(title, [author] if author else None) # Publisher + publisher = None pat = re.compile(r'', re.DOTALL) match = pat.search(src) if match: - mi.publisher = match.group(2) + publisher = match.group(2) + else: + for x in ('Publisher','DC.publisher','DCTERMS.publisher'): + pat = get_meta_regexp_(x) + match = pat.search(src) + if match: + publisher = match.group(1) + break + if publisher: + mi.publisher = ent_pat.sub(entity_to_unicode, publisher) # ISBN + isbn = None pat = re.compile(r'', re.DOTALL) match = pat.search(src) if match: isbn = match.group(1) + else: + for x in ('ISBN','DC.identifier.ISBN','DCTERMS.identifier.ISBN'): + pat = get_meta_regexp_(x) + match = pat.search(src) + if match: + isbn = match.group(1) + break + if isbn: mi.isbn = re.sub(r'[^0-9xX]', '', isbn) + # LANGUAGE + language = None + pat = re.compile(r'', re.DOTALL) + match = pat.search(src) + if match: + language = match.group(1) + else: + for x in ('DC.language','DCTERMS.language'): + pat = get_meta_regexp_(x) + match = pat.search(src) + if match: + language = match.group(1) + break + if language: + mi.language = language + + # PUBDATE + pubdate = None + pat = re.compile(r'', re.DOTALL) + match = pat.search(src) + if match: + pubdate = match.group(1) + else: + for x in ('Pubdate','Date of publication','DC.date.published','DC.date.publication','DC.date.issued','DCTERMS.issued'): + pat = get_meta_regexp_(x) + match = pat.search(src) + if match: + pubdate = match.group(1) + break + if pubdate: + mi.pubdate = pubdate + + # TIMESTAMP + timestamp = None + pat = re.compile(r'', re.DOTALL) + match = pat.search(src) + if match: + timestamp = match.group(1) + else: + for x in ('Timestamp','Date of creation','DC.date.created','DC.date.creation','DCTERMS.created'): + pat = get_meta_regexp_(x) + match = pat.search(src) + if match: + timestamp = match.group(1) + break + if timestamp: + mi.timestamp = timestamp + + # SERIES + series = None + pat = re.compile(r'', re.DOTALL) + match = pat.search(src) + if match: + series = match.group(1) + else: + pat = get_meta_regexp_("Series") + match = pat.search(src) + if match: + series = match.group(1) + if series: + mi.series = ent_pat.sub(entity_to_unicode, series) + + # RATING + rating = None + pat = re.compile(r'', re.DOTALL) + match = pat.search(src) + if match: + rating = match.group(1) + else: + pat = get_meta_regexp_("Rating") + match = pat.search(src) + if match: + rating = match.group(1) + if rating: + mi.rating = rating + + # COMMENTS + comments = None + pat = re.compile(r'', re.DOTALL) + match = pat.search(src) + if match: + comments = match.group(1) + else: + pat = get_meta_regexp_("Comments") + match = pat.search(src) + if match: + comments = match.group(1) + if comments: + mi.comments = ent_pat.sub(entity_to_unicode, comments) + + # TAGS + tags = None + pat = re.compile(r'', re.DOTALL) + match = pat.search(src) + if match: + tags = match.group(1) + else: + pat = get_meta_regexp_("Tags") + match = pat.search(src) + if match: + tags = match.group(1) + if tags: + mi.tags = ent_pat.sub(entity_to_unicode, tags).split(",") + + # Ready to return MetaInformation return mi