From a433be5ba52e7d5d37788529cd429a91c0e42d63 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 29 Nov 2007 03:26:57 +0000 Subject: [PATCH] Implement #315 --- src/libprs500/ebooks/metadata/__init__.py | 1 + src/libprs500/ebooks/metadata/opf.py | 331 +++++++++++++++++++--- src/libprs500/ebooks/metadata/rtf.py | 18 +- src/libprs500/library/database.py | 38 ++- 4 files changed, 336 insertions(+), 52 deletions(-) diff --git a/src/libprs500/ebooks/metadata/__init__.py b/src/libprs500/ebooks/metadata/__init__.py index 3f1d8a86eb..7f11404df4 100644 --- a/src/libprs500/ebooks/metadata/__init__.py +++ b/src/libprs500/ebooks/metadata/__init__.py @@ -59,6 +59,7 @@ class MetaInformation(object): self.series_index = None self.rating = None self.isbn = None + self.tags = [] def __str__(self): ans = '' diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py index 0f5a432cb3..837e8a4ec0 100644 --- a/src/libprs500/ebooks/metadata/opf.py +++ b/src/libprs500/ebooks/metadata/opf.py @@ -17,6 +17,7 @@ import sys, re, os from urllib import unquote from urlparse import urlparse +import xml.dom.minidom as dom from libprs500.ebooks.metadata import MetaInformation from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup @@ -92,41 +93,116 @@ class TOC(list): pass -class OPFReader(MetaInformation): +class standard_field(object): + + def __init__(self, name): + self.name = name + + def __get__(self, obj, typ=None): + return getattr(obj, 'get_'+self.name)() + + def __set__(self, obj, val): + getattr(obj, 'set_'+self.name)(val) + +class OPF(MetaInformation): ENTITY_PATTERN = re.compile(r'&(\S+?);') - def __init__(self, stream, dir=os.getcwd()): - manage = False - if not hasattr(stream, 'read'): - manage = True - dir = os.path.dirname(stream) - stream = open(stream, 'rb') - self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown' - if hasattr(stream, 'seek'): - stream.seek(0) - self.soup = BeautifulStoneSoup(stream.read()) - if manage: - stream.close() - self.title = self.get_title() - self.authors = self.get_authors() - self.title_sort = self.get_title_sort() - self.author_sort = self.get_author_sort() - self.comments = self.get_comments() - self.category = self.get_category() - self.publisher = self.get_publisher() - self.isbn = self.get_isbn() - self.series = self.series_index = self.rating = None - self.manifest = Manifest(self.soup, dir) - self.spine = Spine(self.soup, self.manifest) - self.toc = TOC(self, dir) - self.cover = self.get_cover() + libprs_id = standard_field('libprs_id') + title = standard_field('title') + authors = standard_field('authors') + title_sort = standard_field('title_sort') + author_sort = standard_field('author_sort') + comments = standard_field('comments') + category = standard_field('category') + publisher = standard_field('publisher') + isbn = standard_field('isbn') + cover = standard_field('cover') + series = standard_field('series') + series_index = standard_field('series_index') + rating = standard_field('rating') + tags = standard_field('tags') + + def __init__(self): + raise NotImplementedError('Abstract base class') + + def _initialize(self): + if not hasattr(self, 'soup'): + self.soup = BeautifulStoneSoup(u'''\ + + + + + + + +''') + + def _commit(self, doc): + self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8') + def _find_element(self, package, name, attrs=[]): + tags = package.getElementsByTagName(name) + for tag in tags: + match = True + for attr, vattr in attrs: + if tag.getAttribute(attr) != vattr: + match = False + break + if match: + return tag + return None + + def _set_metadata_element(self, name, value, attrs=[], + type='dc-metadata', replace=False): + self._initialize() + if isinstance(value, basestring): + value = [value] + attrs = [attrs] + doc = dom.parseString(self.soup.__str__('UTF-8')) + package = doc.documentElement + metadata = package.getElementsByTagName('metadata')[0] + + dcms = metadata.getElementsByTagName(type) + if dcms: + dcm = dcms[0] + else: + dcm = doc.createElement(type) + metadata.appendChild(dcm) + tags = dcm.getElementsByTagName(name) + if tags and not replace: + for tag in tags: + tag.parentNode.removeChild(tag) + tag.unlink() + + for val, vattrs in zip(value, attrs): + if replace: + el = self._find_element(package, name, vattrs) + if el: + el.parentNode.removeChild(el) + el.unlink() + el = doc.createElement(name) + el.appendChild(doc.createTextNode(val)) + for attr, vattr in vattrs: + el.setAttribute(attr, vattr) + dcm.appendChild(el) + self._commit(doc) + + def get_title(self): title = self.soup.package.metadata.find('dc:title') if title: - return self.ENTITY_PATTERN.sub(entity_to_unicode, title.string) - return self.default_title + return self.ENTITY_PATTERN.sub(entity_to_unicode, title.string).strip() + return self.default_title.strip() + + def set_title(self, title): + if not title: + title = 'Unknown' + self._set_metadata_element('dc:title', title) def get_authors(self): creators = self.soup.package.metadata.findAll('dc:creator') @@ -142,9 +218,15 @@ class OPFReader(MetaInformation): ans = [] for i in au: ans.extend(i.split('&')) - return ans + return [a.strip() for a in ans] return [] + def set_authors(self, authors): + if not authors: + authors = ['Unknown'] + attrs = [[('role', 'aut')] for a in authors] + self._set_metadata_element('dc:Creator', authors, attrs) + def get_author_sort(self): creators = self.soup.package.metadata.findAll('dc:creator') for elem in creators: @@ -153,42 +235,99 @@ class OPFReader(MetaInformation): role = elem.get('opf:role') if role == 'aut': fa = elem.get('file-as') - return self.ENTITY_PATTERN.sub(entity_to_unicode, fa) if fa else None + return self.ENTITY_PATTERN.sub(entity_to_unicode, fa).strip() if fa else None return None - + + def set_author_sort(self, aus): + if not aus: + aus = '' + self._initialize() + if not self.authors: + self.set_authors([]) + doc = dom.parseString(self.soup.__str__('UTF-8')) + package = doc.documentElement + aut = package.getElementsByTagName('dc:Creator')[0] + aut.setAttribute('file-as', aus) + self._commit(doc) def get_title_sort(self): + title = self.soup.package.find('dc:title') + if title: + if title.has_key('file-as'): + return title['file-as'].strip() return None - + + def set_title_sort(self, title_sort): + if not title_sort: + title_sort = '' + self._initialize() + if not self.title: + self.title = None + doc = dom.parseString(self.soup.__str__('UTF-8')) + package = doc.documentElement + tit = package.getElementsByTagName('dc:Title')[0] + tit.setAttribute('file-as', title_sort) + self._commit(doc) def get_comments(self): comments = self.soup.find('dc:description') if comments: - return self.ENTITY_PATTERN.sub(entity_to_unicode, comments.string) + return self.ENTITY_PATTERN.sub(entity_to_unicode, comments.string).strip() return None + def set_comments(self, comments): + if not comments: + comments = '' + self._set_metadata_element('dc:Description', comments) + def get_category(self): category = self.soup.find('dc:type') if category: - return self.ENTITY_PATTERN.sub(entity_to_unicode, category.string) + return self.ENTITY_PATTERN.sub(entity_to_unicode, category.string).strip() return None + def set_category(self, category): + if not category: + category = '' + self._set_metadata_element('dc:Type', category) + def get_publisher(self): publisher = self.soup.find('dc:publisher') if publisher: - return self.ENTITY_PATTERN.sub(entity_to_unicode, publisher.string) + return self.ENTITY_PATTERN.sub(entity_to_unicode, publisher.string).strip() return None + def set_publisher(self, category): + if not category: + category = 'Unknown' + self._set_metadata_element('dc:Publisher', category) + def get_isbn(self): for item in self.soup.package.metadata.findAll('dc:identifier'): scheme = item.get('scheme') if not scheme: scheme = item.get('opf:scheme') if scheme is not None and scheme.lower() == 'isbn': - return item.string + return str(item.string).strip() return None + def set_isbn(self, isbn): + if isbn: + self._set_metadata_element('dc:Identifier', isbn, [('scheme', 'ISBN')], + replace=True) + + def get_libprs_id(self): + for item in self.soup.package.metadata.findAll('dc:identifier'): + if item.has_key('scheme') and item['scheme'] == 'libprs': + return str(item.string).strip() + return None + + def set_libprs_id(self, val): + if val: + self._set_metadata_element('dc:Identifier', str(val), [('scheme', 'libprs'), ('id', 'libprs_id')], + replace=True) + def get_cover(self): guide = self.soup.package.find('guide') if guide: @@ -200,7 +339,24 @@ class OPFReader(MetaInformation): if type.lower() in ['cover', 'other.ms-coverimage-standard']: return reference.get('href') return None - + + def set_cover(self, path): + self._initialize() + doc = dom.parseString(self.soup.__str__('UTF-8')) + package = doc.documentElement + guide = package.getElementsByTagName('guide') + if guide: + guide = guide[0] + else: + guide = doc.createElement('guide') + package.appendChild(guide) + el = self._find_element(guide, 'reference', [('type', 'cover')]) + if not el: + el = doc.createElement('reference') + guide.appendChild(el) + el.setAttribute('type', 'cover') + el.setAttribute('href', path) + self._commit(doc) def possible_cover_prefixes(self): isbn, ans = [], [] @@ -213,6 +369,107 @@ class OPFReader(MetaInformation): ans.append(item[1].replace('-', '')) return ans + def get_series(self): + xm = self.soup.package.metadata.find('x-metadata') + if not xm: + return None + s = xm.find('series') + if s: + return str(s.string).strip() + return None + + def set_series(self, val): + if not val: + val = '' + self._set_metadata_element('series', val, type='x-metadata') + + def get_series_index(self): + xm = self.soup.package.metadata.find('x-metadata') + if not xm: + return None + s = xm.find('series-index') + if s: + try: + return int(str(s.string).strip()) + except: + return None + return None + + def set_series_index(self, val): + if not val: + val = 1 + self._set_metadata_element('series-index', str(val), type='x-metadata') + + def get_rating(self): + xm = self.soup.package.metadata.find('x-metadata') + if not xm: + return None + s = xm.find('rating') + if s: + try: + return int(str(s.string).strip()) + except: + return None + return None + + def set_rating(self, val): + if not val: + val = 0 + self._set_metadata_element('rating', str(val), type='x-metadata') + + def get_tags(self): + ans = [] + subs = self.soup.findAll('dc:subject') + for sub in subs: + val = sub.string + if val: + ans.append(val) + return [unicode(a).strip() for a in ans] + + def set_tags(self, tags): + self._set_metadata_element('dc:Subject', tags) + + def write(self, stream): + stream.write(self.soup.prettify('utf-8')) + +class OPFReader(OPF): + + def __init__(self, stream, dir=os.getcwd()): + manage = False + if not hasattr(stream, 'read'): + manage = True + dir = os.path.dirname(stream) + stream = open(stream, 'rb') + self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown' + if hasattr(stream, 'seek'): + stream.seek(0) + self.soup = BeautifulStoneSoup(stream.read()) + if manage: + stream.close() + +class OPFCreator(OPF): + + def __init__(self, mi): + self.title = mi.title + self.authors = mi.authors + if mi.category: + self.category = mi.category + if mi.comments: + self.comments = mi.comments + if mi.publisher: + self.publisher = mi.publisher + if mi.rating: + self.rating = mi.rating + if mi.series: + self.series = mi.series + if mi.series_index: + self.series_index = mi.series_index + if mi.tags: + self.tags = mi.tags + if mi.isbn: + self.isbn = mi.isbn + if hasattr(mi, 'libprs_id'): + self.libprs_id = mi.libprs_id def main(args=sys.argv): print OPFReader(open(args[1], 'rb')) diff --git a/src/libprs500/ebooks/metadata/rtf.py b/src/libprs500/ebooks/metadata/rtf.py index 84af4adad4..034d712e63 100644 --- a/src/libprs500/ebooks/metadata/rtf.py +++ b/src/libprs500/ebooks/metadata/rtf.py @@ -15,7 +15,7 @@ """ Edit metadata in RTF files. """ -import re, cStringIO, sys, copy +import re, cStringIO, sys from libprs500.ebooks.metadata import MetaInformation, get_parser @@ -118,13 +118,7 @@ def create_metadata(stream, options): stream.seek(0) stream.write(ans) -def set_metadata(stream, mi): - mi = copy.deepcopy(mi) - mi.authors = ', '.join(mi.authors) - mi.comment = mi.comments - set_metadata_(stream, mi) - -def set_metadata_(stream, options): +def set_metadata(stream, options): ''' Modify/add RTF metadata in stream @param options: Object with metadata attributes title, author, comment, category @@ -147,7 +141,7 @@ def set_metadata_(stream, options): src = pat.sub(r'{\\title ' + title + r'}', src) else: src = add_metadata_item(src, 'title', title) - comment = options.comment + comment = options.comments if comment != None: comment = comment.encode('ascii', 'replace') pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL) @@ -157,6 +151,7 @@ def set_metadata_(stream, options): src = add_metadata_item(src, 'subject', comment) author = options.authors if author != None: + author = ', '.join(author) author = author.encode('ascii', 'ignore') pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL) if pat.search(src): @@ -186,7 +181,10 @@ def main(args=sys.argv): parser.print_help() sys.exit(1) stream = open(args[1], 'r+b') - set_metadata_(stream, options) + if options.authors: + options.authors = options.authors.split(',') + options.comments = options.comment + set_metadata(stream, options) mi = get_metadata(stream) return mi diff --git a/src/libprs500/library/database.py b/src/libprs500/library/database.py index 6a0a1b2918..a2fb7dc22c 100644 --- a/src/libprs500/library/database.py +++ b/src/libprs500/library/database.py @@ -21,6 +21,7 @@ from zlib import compress, decompress from libprs500 import sanitize_file_name from libprs500.ebooks.metadata.meta import set_metadata +from libprs500.ebooks.metadata.opf import OPFCreator from libprs500.ebooks.metadata import MetaInformation class Concatenate(object): @@ -1087,6 +1088,25 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE; self.conn.execute('DELETE FROM books WHERE id=?', (id,)) self.conn.commit() + def get_metadata(self, idx): + aum = self.authors(idx) + if aum: aum = aum.split(',') + mi = MetaInformation(self.title(idx), aum) + mi.author_sort = self.author_sort(idx) + mi.comments = self.comments(idx) + mi.publisher = self.publisher(idx) + tags = self.tags(idx) + if tags: + mi.tags = [i.strip() for i in tags.split(',')] + mi.series = self.series(idx) + if mi.series: + mi.series_index = self.series_index(idx) + mi.rating = self.rating(idx) + id = self.id(idx) + mi.isbn = self.isbn(id) + mi.libprs_id = id + return mi + def export_to_dir(self, dir, indices, byauthor=False): if not os.path.exists(dir): raise IOError('Target directory does not exist: '+dir) @@ -1113,6 +1133,17 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE; id = str(self.id(idx)) if not os.path.exists(tpath): os.mkdir(tpath) + mi = OPFCreator(self.get_metadata(idx)) + cover = self.cover(idx) + if cover is not None: + f = open(os.path.join(tpath, 'cover.jpg'), 'wb') + f.write(cover) + mi.cover = 'cover.jpg' + f.close() + f = open(os.path.join(tpath, 'metadata.opf'), 'wb') + mi.write(f) + f.close() + for fmt in self.formats(idx).split(','): data = self.format(idx, fmt) name = au + ' - ' + title if byauthor else title + ' - ' + au @@ -1120,16 +1151,13 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE; f = open(os.path.join(tpath, sanitize_file_name(fname)), 'w+b') f.write(data) f.flush() - aum = self.authors(idx) - if aum: aum = aum.split(',') - mi = MetaInformation(self.title(idx), aum) - mi.author_sort = self.author_sort(idx) try: set_metadata(f, mi, fmt.lower()) except: print 'Error setting metadata for book:', mi.title traceback.print_exc() - + f.close() + if __name__ == '__main__': db = LibraryDatabase('/home/kovid/library1.db')