# Methods of MetaInformation; the class statement itself lies outside the
# lines covered here.

@staticmethod
def copy(mi):
    '''
    Return a new C{MetaInformation} holding the same metadata as C{mi}.

    C{mi.title} and C{mi.authors} are handed to the constructor; each of the
    remaining known attributes is copied only when C{mi} actually has it.
    List values are duplicated so that the copy and C{mi} never share a
    mutable list.
    '''
    ans = MetaInformation(mi.title, mi.authors)
    for attr in ('author_sort', 'title_sort', 'comments', 'category',
                 'publisher', 'series', 'series_index', 'rating',
                 'isbn', 'tags', 'cover_data'):
        if hasattr(mi, attr):
            val = getattr(mi, attr)
            if isinstance(val, list):
                val = list(val)
            setattr(ans, attr, val)
    # Hand the copy back; without this the method built it and returned None.
    return ans


def smart_update(self, mi):
    '''
    Merge the information in C{mi} into self. In case of conflicts, the
    information in C{mi} takes precedence, unless the information in C{mi}
    is NULL.

    A title or first author that is empty, C{None} or (ignoring case)
    "unknown" counts as NULL. The tags of both objects are combined without
    duplicates, keeping first-seen order; neither C{mi.tags} nor the list
    previously stored in C{self.tags} is modified. Attributes that C{mi}
    does not have are skipped.
    '''
    title = getattr(mi, 'title', None)
    if title and title.lower() != 'unknown':
        self.title = title

    authors = getattr(mi, 'authors', None)
    # Check authors[0] before calling lower(): a placeholder entry such as
    # None has no such method.
    if authors and authors[0] and authors[0].lower() != 'unknown':
        self.authors = authors

    for attr in ('author_sort', 'title_sort', 'comments', 'category',
                 'publisher', 'series', 'series_index', 'rating',
                 'isbn'):
        val = getattr(mi, attr, None)
        if val is not None:
            setattr(self, attr, val)

    # Build a fresh list instead of extending self.tags in place, so a list
    # shared with another object is never changed behind its back.
    merged = []
    seen = set()
    own_tags = list(getattr(self, 'tags', None) or [])
    new_tags = list(getattr(mi, 'tags', None) or [])
    for tag in own_tags + new_tags:
        if tag not in seen:
            seen.add(tag)
            merged.append(tag)
    self.tags = merged

    cover = getattr(mi, 'cover_data', None)
    if cover and cover[0] is not None:
        self.cover_data = cover
# NOTE(review): the regular expressions of this reader were empty strings in
# the text under review (they look as if an HTML stripper removed them), so
# every match.group(1) call raised IndexError. The comment-style pattern
# below, e.g. <!-- TITLE="..." AUTHOR="..." -->, is an assumption -- confirm
# it against the input files this reader is meant for.
_HTML_FIELD_PAT = r'<!--.*?%s="(.+?)".*?-->'


def _find_field(label, src):
    '''Return the quoted value stored under C{label} in C{src}, or None.'''
    match = re.search(_HTML_FIELD_PAT % label, src, re.DOTALL)
    if match:
        return match.group(1)
    return None


def get_metadata(stream):
    '''
    Try to read metadata from an HTML file.

    @param stream: Object with a C{read()} method returning the HTML source.
    @return: A C{MetaInformation} built from the title and author found in
             the source; C{publisher} and C{isbn} are set only when present.
             Commas in the author value are replaced by semicolons and the
             ISBN is reduced to digits and the letter X.
    '''
    src = stream.read()
    # On Python 3 a binary stream yields bytes, which a text pattern cannot
    # search; on Python 2 bytes is str and the source is left untouched.
    if isinstance(src, bytes) and not isinstance(src, str):
        src = src.decode('utf-8', 'replace')

    title = _find_field('TITLE', src)

    author = _find_field('AUTHOR', src)
    if author:
        author = author.replace(',', ';')

    # Pass None rather than [None] when no author was found, so consumers
    # that look at authors[0] do not trip over a placeholder entry.
    mi = MetaInformation(title, [author] if author else None)

    publisher = _find_field('PUBLISHER', src)
    if publisher:
        mi.publisher = publisher

    isbn = _find_field('ISBN', src)
    if isbn:
        mi.isbn = re.sub(r'[^0-9xX]', '', isbn)

    return mi
# Stream types for which a reader named <type>_metadata is imported above.
_READER_TYPES = ('rtf', 'lrf', 'pdf', 'lit', 'epub', 'html')

# File extensions that are all handled by the HTML reader.
_HTML_ALIASES = ('htm', 'html', 'xhtml', 'xhtm')


def get_metadata(stream, stream_type='lrf'):
    '''
    Read metadata from C{stream}, falling back on its file name.

    @param stream: Open file-like object. If it has a string C{name}
                   attribute, title and author are also guessed from it.
    @param stream_type: Format of the stream, case-insensitive, or None.
                        Unknown formats contribute no embedded metadata.
    @return: A C{MetaInformation} seeded from the file name and updated,
             via C{smart_update}, with whatever the format reader found.
    '''
    if stream_type:
        stream_type = stream_type.lower()
    if stream_type in _HTML_ALIASES:
        stream_type = 'html'

    # Resolve the reader by name among a fixed set of formats. The previous
    # eval() on a caller-supplied string could execute arbitrary code and
    # raised TypeError for stream_type=None.
    reader = None
    if stream_type in _READER_TYPES:
        reader = globals().get(stream_type + '_metadata')
    if callable(reader):
        mi = reader(stream)
    else:
        mi = MetaInformation(None, None)

    # stream.name is not guaranteed to be a path: it may be missing or, for
    # descriptor-backed files, an integer.
    name = getattr(stream, 'name', None)
    if isinstance(name, (str, type(u''))):
        name = os.path.basename(name)
    else:
        name = ''

    base = metadata_from_filename(name)
    if not base.title:
        base.title = name if name else 'Unknown'
    if not base.authors:
        base.authors = ['Unknown']
    base.smart_update(mi)
    return base


def set_metadata(stream, mi, stream_type='lrf'):
    '''
    Write the metadata in C{mi} into C{stream}.

    Only the 'lrf' and 'rtf' stream types are handled; any other type is
    silently ignored.
    '''
    if stream_type:
        stream_type = stream_type.lower()
    # NOTE(review): the 'lrf' condition is not visible in the reviewed text;
    # it is inferred from the set_lrf_metadata call and the elif that
    # follows -- confirm.
    if stream_type == 'lrf':
        set_lrf_metadata(stream, mi)
    elif stream_type == 'rtf':
        set_rtf_metadata(stream, mi)


# "Title - Author"; the optional group name 'authors' (plural) is honoured by
# metadata_from_filename should the pattern ever provide it.
_filename_pat = re.compile(r'(?P<title>.+) - (?P<author>[^_]+)')


def _split_authors(text):
    '''Split an author list on "," and "&" into stripped, non-empty names.'''
    names = []
    for piece in re.split(r'[,&]', text):
        piece = piece.strip()
        if piece:
            names.append(piece)
    return names


def metadata_from_filename(name):
    '''
    Guess title and author(s) from a file name of the form
    "Title - Author.ext".

    @param name: File name, with or without extension.
    @return: A C{MetaInformation}; fields the name does not reveal keep the
             values given to them by C{MetaInformation(None, None)}.
    '''
    name = os.path.splitext(name)[0]
    mi = MetaInformation(None, None)
    match = _filename_pat.search(name)
    if match is None:
        return mi

    # groupdict() only holds the groups the pattern defines, so absent
    # groups need no exception handling.
    found = match.groupdict()

    title = (found.get('title') or '').strip()
    if title:
        mi.title = title

    author = (found.get('author') or '').strip()
    if author:
        mi.authors = [author]

    # A plural 'authors' group takes precedence over a single 'author'.
    authors = _split_authors(found.get('authors') or '')
    if authors:
        mi.authors = authors

    return mi