This commit is contained in:
Kovid Goyal 2007-11-29 03:26:57 +00:00
parent f97f1c91d2
commit a433be5ba5
4 changed files with 336 additions and 52 deletions

View File

@ -59,6 +59,7 @@ class MetaInformation(object):
self.series_index = None self.series_index = None
self.rating = None self.rating = None
self.isbn = None self.isbn = None
self.tags = []
def __str__(self): def __str__(self):
ans = '' ans = ''

View File

@ -17,6 +17,7 @@
import sys, re, os import sys, re, os
from urllib import unquote from urllib import unquote
from urlparse import urlparse from urlparse import urlparse
import xml.dom.minidom as dom
from libprs500.ebooks.metadata import MetaInformation from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
@ -92,41 +93,116 @@ class TOC(list):
pass pass
class standard_field(object):
    """Descriptor that routes attribute access to accessor methods.

    Declaring ``title = standard_field('title')`` on a class makes
    ``obj.title`` call ``obj.get_title()`` and ``obj.title = v`` call
    ``obj.set_title(v)``.
    """

    def __init__(self, name):
        # Suffix used to build the 'get_<name>' / 'set_<name>' method names.
        self.name = name

    def __get__(self, obj, typ=None):
        if obj is None:
            # Accessed on the class rather than an instance (introspection,
            # help(), etc.): there is nothing to delegate to, so hand back
            # the descriptor itself instead of failing on getattr(None, ...).
            return self
        return getattr(obj, 'get_' + self.name)()

    def __set__(self, obj, val):
        getattr(obj, 'set_' + self.name)(val)
class OPF(MetaInformation):
# Matches entity references of the form '&name;' so they can be decoded.
ENTITY_PATTERN = re.compile(r'&(\S+?);')

# Every metadata field is exposed through a standard_field descriptor that is
# given the field's own name.
libprs_id = standard_field('libprs_id')
title = standard_field('title')
authors = standard_field('authors')
title_sort = standard_field('title_sort')
author_sort = standard_field('author_sort')
comments = standard_field('comments')
category = standard_field('category')
publisher = standard_field('publisher')
isbn = standard_field('isbn')
cover = standard_field('cover')
series = standard_field('series')
series_index = standard_field('series_index')
rating = standard_field('rating')
tags = standard_field('tags')
self.title_sort = self.get_title_sort()
self.author_sort = self.get_author_sort() def __init__(self):
self.comments = self.get_comments() raise NotImplementedError('Abstract base class')
self.category = self.get_category()
self.publisher = self.get_publisher() def _initialize(self):
self.isbn = self.get_isbn() if not hasattr(self, 'soup'):
self.series = self.series_index = self.rating = None self.soup = BeautifulStoneSoup(u'''\
self.manifest = Manifest(self.soup, dir) <?xml version="1.0" encoding="UTF-8"?>
self.spine = Spine(self.soup, self.manifest) <!DOCTYPE package
self.toc = TOC(self, dir) PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.2 Package//EN"
self.cover = self.get_cover() "http://openebook.org/dtds/oeb-1.2/oebpkg12.dtd">
<package unique-identifier="libprs_id">
<metadata>
<dc-metadata
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" />
</metadata>
</package>
''')
def _commit(self, doc):
    """Serialise the DOM document ``doc`` and re-parse it into ``self.soup``."""
    serialized = doc.toxml('utf-8')
    self.soup = BeautifulStoneSoup(serialized, fromEncoding='utf-8')
def _find_element(self, package, name, attrs=[]):
tags = package.getElementsByTagName(name)
for tag in tags:
match = True
for attr, vattr in attrs:
if tag.getAttribute(attr) != vattr:
match = False
break
if match:
return tag
return None
def _set_metadata_element(self, name, value, attrs=None,
                          type='dc-metadata', replace=False):
    """Write one or more ``name`` elements into a metadata container.

    @param name: Tag name of the element(s) to create, e.g. 'dc:title'.
    @param value: A single string, or a sequence of strings (one element
                  is created per string).
    @param attrs: For a single string: a list of ``(attribute, value)``
                  pairs. For a sequence: a parallel list of such lists.
                  When omitted, elements are created without attributes.
    @param type: Tag name of the container inside ``<metadata>``; it is
                 created when absent.
    @param replace: When False, every existing ``name`` element in the
                    container is removed first. When True, only the
                    element whose attributes match is replaced.
    """
    self._initialize()
    if isinstance(value, basestring):
        values = [value]
        attr_sets = [attrs or []]
    else:
        values = list(value)
        # Previously a sequence was zipped against the default empty list,
        # so no element was ever written (set_tags() was a no-op). Give
        # each value its own empty attribute list instead.
        attr_sets = list(attrs) if attrs else [[] for _ in values]
    doc = dom.parseString(self.soup.__str__('UTF-8'))
    package = doc.documentElement
    metadata = package.getElementsByTagName('metadata')[0]
    containers = metadata.getElementsByTagName(type)
    if containers:
        container = containers[0]
    else:
        container = doc.createElement(type)
        metadata.appendChild(container)
    if not replace:
        for stale in container.getElementsByTagName(name):
            stale.parentNode.removeChild(stale)
            stale.unlink()
    for val, vattrs in zip(values, attr_sets):
        if replace:
            # The search covers the whole package, not just the container.
            old = self._find_element(package, name, vattrs)
            if old:
                old.parentNode.removeChild(old)
                old.unlink()
        el = doc.createElement(name)
        el.appendChild(doc.createTextNode(val))
        for attr, vattr in vattrs:
            el.setAttribute(attr, vattr)
        container.appendChild(el)
    self._commit(doc)
def get_title(self):
    """Return the stripped book title.

    Uses the text of the ``dc:title`` element with entity references
    decoded. Falls back to ``self.default_title`` when the element is
    missing or carries no text.
    """
    title = self.soup.package.metadata.find('dc:title')
    # NOTE(review): .string is presumably None for an element without a
    # single text child (BeautifulSoup 3 behaviour); passing None to
    # re.sub would raise TypeError, so treat it like a missing title.
    if title and title.string:
        return self.ENTITY_PATTERN.sub(entity_to_unicode, title.string).strip()
    return self.default_title.strip()
def set_title(self, title):
    """Store ``title`` in a 'dc:title' element; a falsy title becomes 'Unknown'."""
    self._set_metadata_element('dc:title', title or 'Unknown')
def get_authors(self): def get_authors(self):
creators = self.soup.package.metadata.findAll('dc:creator') creators = self.soup.package.metadata.findAll('dc:creator')
@ -142,9 +218,15 @@ class OPFReader(MetaInformation):
ans = [] ans = []
for i in au: for i in au:
ans.extend(i.split('&')) ans.extend(i.split('&'))
return ans return [a.strip() for a in ans]
return [] return []
def set_authors(self, authors):
    """Write one 'dc:Creator' element (role 'aut') per author.

    A falsy ``authors`` is replaced by the single author 'Unknown'.
    """
    names = authors if authors else ['Unknown']
    roles = [[('role', 'aut')] for _ in names]
    self._set_metadata_element('dc:Creator', names, roles)
def get_author_sort(self): def get_author_sort(self):
creators = self.soup.package.metadata.findAll('dc:creator') creators = self.soup.package.metadata.findAll('dc:creator')
for elem in creators: for elem in creators:
@ -153,42 +235,99 @@ class OPFReader(MetaInformation):
role = elem.get('opf:role') role = elem.get('opf:role')
if role == 'aut': if role == 'aut':
fa = elem.get('file-as') fa = elem.get('file-as')
return self.ENTITY_PATTERN.sub(entity_to_unicode, fa) if fa else None return self.ENTITY_PATTERN.sub(entity_to_unicode, fa).strip() if fa else None
return None return None
def set_author_sort(self, aus):
    """Store ``aus`` as the 'file-as' attribute of the first creator element.

    A falsy ``aus`` is stored as the empty string. When the document has
    no authors, the default author is written first via set_authors([]).

    @raise IndexError: if the document contains no creator element.
    """
    if not aus:
        aus = ''
    self._initialize()
    if not self.authors:
        self.set_authors([])
    doc = dom.parseString(self.soup.__str__('UTF-8'))
    package = doc.documentElement
    # set_authors() writes 'dc:Creator' while the getters look up
    # 'dc:creator', so the case of the serialised tag name cannot be relied
    # on; DOM lookups are case-sensitive, hence the case-insensitive scan.
    creators = [el for el in package.getElementsByTagName('*')
                if el.tagName.lower() == 'dc:creator']
    aut = creators[0]
    aut.setAttribute('file-as', aus)
    self._commit(doc)
def get_title_sort(self):
    """Return the stripped 'file-as' attribute of 'dc:title', or None."""
    title = self.soup.package.find('dc:title')
    if title and title.has_key('file-as'):
        return title['file-as'].strip()
    return None
def set_title_sort(self, title_sort):
    """Store ``title_sort`` as the 'file-as' attribute of the title element.

    A falsy ``title_sort`` is stored as the empty string.

    @raise IndexError: if the document contains no title element.
    """
    if not title_sort:
        title_sort = ''
    self._initialize()
    if not self.title:
        # Assigning None goes through set_title(), which writes 'Unknown'.
        self.title = None
    doc = dom.parseString(self.soup.__str__('UTF-8'))
    package = doc.documentElement
    # set_title() writes 'dc:title', so the former case-sensitive lookup of
    # 'dc:Title' could not find it. Accept either spelling.
    titles = [el for el in package.getElementsByTagName('*')
              if el.tagName.lower() == 'dc:title']
    tit = titles[0]
    tit.setAttribute('file-as', title_sort)
    self._commit(doc)
def get_comments(self):
    """Return the stripped text of 'dc:description' with entities decoded.

    Returns None when the element is missing or carries no text.
    """
    comments = self.soup.find('dc:description')
    # NOTE(review): .string is presumably None for an empty element, which
    # set_comments('') can produce; re.sub(None) would raise TypeError.
    if comments and comments.string:
        return self.ENTITY_PATTERN.sub(entity_to_unicode, comments.string).strip()
    return None
def set_comments(self, comments):
    """Store ``comments`` in 'dc:Description'; falsy input becomes ''."""
    self._set_metadata_element('dc:Description', comments or '')
def get_category(self):
    """Return the stripped text of 'dc:type' with entities decoded.

    Returns None when the element is missing or carries no text.
    """
    category = self.soup.find('dc:type')
    # NOTE(review): .string is presumably None for an empty element, which
    # set_category('') can produce; re.sub(None) would raise TypeError.
    if category and category.string:
        return self.ENTITY_PATTERN.sub(entity_to_unicode, category.string).strip()
    return None
def set_category(self, category):
    """Store ``category`` in 'dc:Type'; falsy input becomes ''."""
    self._set_metadata_element('dc:Type', category or '')
def get_publisher(self):
    """Return the stripped text of 'dc:publisher' with entities decoded.

    Returns None when the element is missing or carries no text.
    """
    publisher = self.soup.find('dc:publisher')
    # NOTE(review): .string is presumably None for an empty element;
    # handing None to re.sub would raise TypeError.
    if publisher and publisher.string:
        return self.ENTITY_PATTERN.sub(entity_to_unicode, publisher.string).strip()
    return None
def set_publisher(self, category):
    """Store the publisher in 'dc:Publisher'; falsy input becomes 'Unknown'.

    @param category: The publisher name (the parameter name is historical).
    """
    self._set_metadata_element('dc:Publisher', category or 'Unknown')
def get_isbn(self):
    """Return the stripped text of the first non-empty ISBN identifier.

    An identifier counts as an ISBN when its 'scheme' (or, failing that,
    'opf:scheme') attribute equals 'isbn' ignoring case. Returns None when
    no such identifier has text.
    """
    for item in self.soup.package.metadata.findAll('dc:identifier'):
        scheme = item.get('scheme') or item.get('opf:scheme')
        if scheme is None or scheme.lower() != 'isbn':
            continue
        if item.string is None:
            # str(None) would yield the bogus ISBN 'None'; skip it instead.
            continue
        return str(item.string).strip()
    return None
def set_isbn(self, isbn):
    """Replace the ISBN identifier with ``isbn``; falsy input is ignored."""
    if not isbn:
        return
    self._set_metadata_element('dc:Identifier', isbn, [('scheme', 'ISBN')],
                               replace=True)
def get_libprs_id(self):
    """Return the stripped text of the first non-empty 'libprs' identifier.

    Only identifiers whose 'scheme' attribute is exactly 'libprs' are
    considered. Returns None when none has text.
    """
    for item in self.soup.package.metadata.findAll('dc:identifier'):
        if not (item.has_key('scheme') and item['scheme'] == 'libprs'):
            continue
        if item.string is None:
            # str(None) would yield the bogus id 'None'; skip it instead.
            continue
        return str(item.string).strip()
    return None
def set_libprs_id(self, val):
    """Replace the 'libprs' identifier with ``str(val)``; falsy input is ignored."""
    if not val:
        return
    identifying_attrs = [('scheme', 'libprs'), ('id', 'libprs_id')]
    self._set_metadata_element('dc:Identifier', str(val), identifying_attrs,
                               replace=True)
def get_cover(self): def get_cover(self):
guide = self.soup.package.find('guide') guide = self.soup.package.find('guide')
if guide: if guide:
@ -200,7 +339,24 @@ class OPFReader(MetaInformation):
if type.lower() in ['cover', 'other.ms-coverimage-standard']: if type.lower() in ['cover', 'other.ms-coverimage-standard']:
return reference.get('href') return reference.get('href')
return None return None
def set_cover(self, path):
    """Point the guide's cover reference at ``path``.

    Creates the <guide> element and a <reference type="cover"> inside it
    when they do not exist yet, then sets the reference's 'href'.
    """
    self._initialize()
    doc = dom.parseString(self.soup.__str__('UTF-8'))
    root = doc.documentElement
    guides = root.getElementsByTagName('guide')
    if not guides:
        guide = doc.createElement('guide')
        root.appendChild(guide)
    else:
        guide = guides[0]
    ref = self._find_element(guide, 'reference', [('type', 'cover')])
    if not ref:
        ref = doc.createElement('reference')
        guide.appendChild(ref)
        ref.setAttribute('type', 'cover')
    ref.setAttribute('href', path)
    self._commit(doc)
def possible_cover_prefixes(self): def possible_cover_prefixes(self):
isbn, ans = [], [] isbn, ans = [], []
@ -213,6 +369,107 @@ class OPFReader(MetaInformation):
ans.append(item[1].replace('-', '')) ans.append(item[1].replace('-', ''))
return ans return ans
def get_series(self):
    """Return the stripped series name from x-metadata, or None.

    None is returned when there is no 'x-metadata' container, no 'series'
    element inside it, or the element carries no text.
    """
    xm = self.soup.package.metadata.find('x-metadata')
    if not xm:
        return None
    s = xm.find('series')
    # An empty element (set_series('') can write one) has no text; str()
    # used to turn that into the literal series name 'None'.
    if s and s.string:
        # unicode() rather than str(): str() raises UnicodeEncodeError on
        # non-ASCII series names. get_tags() converts the same way.
        return unicode(s.string).strip()
    return None
def set_series(self, val):
    """Store ``val`` in the x-metadata 'series' element; falsy input becomes ''."""
    self._set_metadata_element('series', val or '', type='x-metadata')
def get_series_index(self):
    """Return the x-metadata 'series-index' as an int, or None.

    None is returned when the container or element is missing, or when the
    element's text is not an integer.
    """
    xm = self.soup.package.metadata.find('x-metadata')
    if not xm:
        return None
    s = xm.find('series-index')
    if not s:
        return None
    try:
        return int(str(s.string).strip())
    except (TypeError, ValueError):
        # Covers non-numeric and missing text, and UnicodeEncodeError
        # (a ValueError subclass), without hiding unrelated errors the way
        # the former bare except did.
        return None
def set_series_index(self, val):
    """Store ``str(val)`` in x-metadata 'series-index'; falsy input becomes 1."""
    self._set_metadata_element('series-index', str(val or 1), type='x-metadata')
def get_rating(self):
    """Return the x-metadata 'rating' as an int, or None.

    None is returned when the container or element is missing, or when the
    element's text is not an integer.
    """
    xm = self.soup.package.metadata.find('x-metadata')
    if not xm:
        return None
    s = xm.find('rating')
    if not s:
        return None
    try:
        return int(str(s.string).strip())
    except (TypeError, ValueError):
        # Covers non-numeric and missing text, and UnicodeEncodeError
        # (a ValueError subclass), without hiding unrelated errors the way
        # the former bare except did.
        return None
def set_rating(self, val):
    """Store ``str(val)`` in x-metadata 'rating'; falsy input becomes 0."""
    self._set_metadata_element('rating', str(val or 0), type='x-metadata')
def get_tags(self):
    """Return the stripped text of every 'dc:subject' element that has text."""
    texts = [sub.string for sub in self.soup.findAll('dc:subject')]
    return [unicode(text).strip() for text in texts if text]
def set_tags(self, tags):
    """Replace all 'dc:Subject' elements with one element per tag.

    @param tags: A sequence of tag strings, or a single string. A falsy
                 value removes all existing subjects.
    """
    if not tags:
        tags = []
    if isinstance(tags, (str, type(u''))):
        # A bare string is one tag; the helper wraps single strings itself.
        self._set_metadata_element('dc:Subject', tags)
        return
    tags = list(tags)
    # Pass one (empty) attribute list per tag explicitly: relying on the
    # helper's default attrs meant a list of tags was zipped against an
    # empty list and no subject element was ever written.
    self._set_metadata_element('dc:Subject', tags, [[] for _ in tags])
def write(self, stream):
    """Write the document, prettified as UTF-8, to ``stream``."""
    rendered = self.soup.prettify('utf-8')
    stream.write(rendered)
class OPFReader(OPF):
    """OPF document loaded from an existing file or stream."""

    def __init__(self, stream, dir=os.getcwd()):
        """Parse ``stream`` into ``self.soup``.

        @param stream: A file-like object with ``read``, or a path that is
                       opened (and closed again) here.
        @param dir: Base directory. It is rebound to the directory of
                    ``stream`` when a path is given, but not otherwise
                    used in this constructor.
        """
        manage = False
        if not hasattr(stream, 'read'):
            manage = True
            # NOTE(review): dir is not read again below; kept as in the
            # original in case later code relies on it.
            dir = os.path.dirname(stream)
            stream = open(stream, 'rb')
        try:
            self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown'
            if hasattr(stream, 'seek'):
                stream.seek(0)
            self.soup = BeautifulStoneSoup(stream.read())
        finally:
            # Close a file opened here even when reading or parsing fails;
            # streams supplied by the caller stay open.
            if manage:
                stream.close()
class OPFCreator(OPF):
    """OPF document built from the fields of an existing metadata object."""

    # Fields copied only when the source value is truthy, in this order.
    _OPTIONAL_FIELDS = ('category', 'comments', 'publisher', 'rating',
                        'series', 'series_index', 'tags', 'isbn')

    def __init__(self, mi):
        """Copy metadata from ``mi``.

        Title and authors are always assigned. The optional fields are
        assigned only when truthy, and ``libprs_id`` whenever ``mi`` has
        that attribute.
        """
        self.title = mi.title
        self.authors = mi.authors
        for field in self._OPTIONAL_FIELDS:
            value = getattr(mi, field)
            if value:
                setattr(self, field, value)
        if hasattr(mi, 'libprs_id'):
            self.libprs_id = mi.libprs_id
def main(args=sys.argv): def main(args=sys.argv):
print OPFReader(open(args[1], 'rb')) print OPFReader(open(args[1], 'rb'))

View File

@ -15,7 +15,7 @@
""" """
Edit metadata in RTF files. Edit metadata in RTF files.
""" """
import re, cStringIO, sys, copy import re, cStringIO, sys
from libprs500.ebooks.metadata import MetaInformation, get_parser from libprs500.ebooks.metadata import MetaInformation, get_parser
@ -118,13 +118,7 @@ def create_metadata(stream, options):
stream.seek(0) stream.seek(0)
stream.write(ans) stream.write(ans)
def set_metadata(stream, mi): def set_metadata(stream, options):
mi = copy.deepcopy(mi)
mi.authors = ', '.join(mi.authors)
mi.comment = mi.comments
set_metadata_(stream, mi)
def set_metadata_(stream, options):
''' '''
Modify/add RTF metadata in stream Modify/add RTF metadata in stream
@param options: Object with metadata attributes title, author, comment, category @param options: Object with metadata attributes title, author, comment, category
@ -147,7 +141,7 @@ def set_metadata_(stream, options):
src = pat.sub(r'{\\title ' + title + r'}', src) src = pat.sub(r'{\\title ' + title + r'}', src)
else: else:
src = add_metadata_item(src, 'title', title) src = add_metadata_item(src, 'title', title)
comment = options.comment comment = options.comments
if comment != None: if comment != None:
comment = comment.encode('ascii', 'replace') comment = comment.encode('ascii', 'replace')
pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL) pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
@ -157,6 +151,7 @@ def set_metadata_(stream, options):
src = add_metadata_item(src, 'subject', comment) src = add_metadata_item(src, 'subject', comment)
author = options.authors author = options.authors
if author != None: if author != None:
author = ', '.join(author)
author = author.encode('ascii', 'ignore') author = author.encode('ascii', 'ignore')
pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL) pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)
if pat.search(src): if pat.search(src):
@ -186,7 +181,10 @@ def main(args=sys.argv):
parser.print_help() parser.print_help()
sys.exit(1) sys.exit(1)
stream = open(args[1], 'r+b') stream = open(args[1], 'r+b')
set_metadata_(stream, options) if options.authors:
options.authors = options.authors.split(',')
options.comments = options.comment
set_metadata(stream, options)
mi = get_metadata(stream) mi = get_metadata(stream)
return mi return mi

View File

@ -21,6 +21,7 @@ from zlib import compress, decompress
from libprs500 import sanitize_file_name from libprs500 import sanitize_file_name
from libprs500.ebooks.metadata.meta import set_metadata from libprs500.ebooks.metadata.meta import set_metadata
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata import MetaInformation from libprs500.ebooks.metadata import MetaInformation
class Concatenate(object): class Concatenate(object):
@ -1087,6 +1088,25 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
self.conn.execute('DELETE FROM books WHERE id=?', (id,)) self.conn.execute('DELETE FROM books WHERE id=?', (id,))
self.conn.commit() self.conn.commit()
def get_metadata(self, idx):
    """Build a MetaInformation object for the book at ``idx``.

    Authors and tags are split on commas (tags are also stripped). The
    series index is read only when the book has a series.
    """
    authors = self.authors(idx)
    if authors:
        authors = authors.split(',')
    mi = MetaInformation(self.title(idx), authors)
    mi.author_sort = self.author_sort(idx)
    mi.comments = self.comments(idx)
    mi.publisher = self.publisher(idx)
    raw_tags = self.tags(idx)
    if raw_tags:
        mi.tags = [tag.strip() for tag in raw_tags.split(',')]
    mi.series = self.series(idx)
    if mi.series:
        mi.series_index = self.series_index(idx)
    mi.rating = self.rating(idx)
    book_id = self.id(idx)
    # NOTE(review): isbn() receives the value returned by id(), while every
    # other accessor here receives idx; confirm that is intended.
    mi.isbn = self.isbn(book_id)
    mi.libprs_id = book_id
    return mi
def export_to_dir(self, dir, indices, byauthor=False): def export_to_dir(self, dir, indices, byauthor=False):
if not os.path.exists(dir): if not os.path.exists(dir):
raise IOError('Target directory does not exist: '+dir) raise IOError('Target directory does not exist: '+dir)
@ -1113,6 +1133,17 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
id = str(self.id(idx)) id = str(self.id(idx))
if not os.path.exists(tpath): if not os.path.exists(tpath):
os.mkdir(tpath) os.mkdir(tpath)
mi = OPFCreator(self.get_metadata(idx))
cover = self.cover(idx)
if cover is not None:
f = open(os.path.join(tpath, 'cover.jpg'), 'wb')
f.write(cover)
mi.cover = 'cover.jpg'
f.close()
f = open(os.path.join(tpath, 'metadata.opf'), 'wb')
mi.write(f)
f.close()
for fmt in self.formats(idx).split(','): for fmt in self.formats(idx).split(','):
data = self.format(idx, fmt) data = self.format(idx, fmt)
name = au + ' - ' + title if byauthor else title + ' - ' + au name = au + ' - ' + title if byauthor else title + ' - ' + au
@ -1120,16 +1151,13 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
f = open(os.path.join(tpath, sanitize_file_name(fname)), 'w+b') f = open(os.path.join(tpath, sanitize_file_name(fname)), 'w+b')
f.write(data) f.write(data)
f.flush() f.flush()
aum = self.authors(idx)
if aum: aum = aum.split(',')
mi = MetaInformation(self.title(idx), aum)
mi.author_sort = self.author_sort(idx)
try: try:
set_metadata(f, mi, fmt.lower()) set_metadata(f, mi, fmt.lower())
except: except:
print 'Error setting metadata for book:', mi.title print 'Error setting metadata for book:', mi.title
traceback.print_exc() traceback.print_exc()
f.close()
if __name__ == '__main__': if __name__ == '__main__':
db = LibraryDatabase('/home/kovid/library1.db') db = LibraryDatabase('/home/kovid/library1.db')