This commit is contained in:
Kovid Goyal 2007-11-29 03:26:57 +00:00
parent f97f1c91d2
commit a433be5ba5
4 changed files with 336 additions and 52 deletions

View File

@ -59,6 +59,7 @@ class MetaInformation(object):
self.series_index = None
self.rating = None
self.isbn = None
self.tags = []
def __str__(self):
ans = ''

View File

@ -17,6 +17,7 @@
import sys, re, os
from urllib import unquote
from urlparse import urlparse
import xml.dom.minidom as dom
from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
@ -92,41 +93,116 @@ class TOC(list):
pass
class OPFReader(MetaInformation):
class standard_field(object):
    '''
    Data descriptor that routes attribute access to accessor methods.

    Reading the attribute on an instance calls C{obj.get_<name>()} and
    assigning to it calls C{obj.set_<name>(value)}.
    '''

    def __init__(self, name):
        '''
        @param name: Suffix used to build the C{get_<name>} / C{set_<name>}
                     method names looked up on the owning instance.
        '''
        self.name = name

    def __get__(self, obj, typ=None):
        # Looked up on the class itself there is no instance to ask, so hand
        # back the descriptor instead of failing on getattr(None, ...).
        if obj is None:
            return self
        return getattr(obj, 'get_' + self.name)()

    def __set__(self, obj, val):
        getattr(obj, 'set_' + self.name)(val)
class OPF(MetaInformation):
ENTITY_PATTERN = re.compile(r'&(\S+?);')
# Build the object from an OPF document.
# @param stream: Either an object with a read() method or a path; a path is
#                opened here in binary mode and closed again after parsing.
# @param dir: Directory handed to Manifest and TOC; replaced by the directory
#             of the path when a path is given instead of a stream.
def __init__(self, stream, dir=os.getcwd()):
manage = False
if not hasattr(stream, 'read'):
manage = True
dir = os.path.dirname(stream)
stream = open(stream, 'rb')
# Fallback title returned by get_title() when no dc:title element is found.
self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown'
if hasattr(stream, 'seek'):
stream.seek(0)
self.soup = BeautifulStoneSoup(stream.read())
# Only close streams that were opened by this constructor.
if manage:
stream.close()
self.title = self.get_title()
self.authors = self.get_authors()
self.title_sort = self.get_title_sort()
self.author_sort = self.get_author_sort()
self.comments = self.get_comments()
self.category = self.get_category()
self.publisher = self.get_publisher()
self.isbn = self.get_isbn()
self.series = self.series_index = self.rating = None
self.manifest = Manifest(self.soup, dir)
self.spine = Spine(self.soup, self.manifest)
self.toc = TOC(self, dir)
self.cover = self.get_cover()
# Every metadata field is a standard_field descriptor: reading the attribute
# calls the matching get_<name>() method and assigning calls set_<name>().
libprs_id = standard_field('libprs_id')
title = standard_field('title')
authors = standard_field('authors')
title_sort = standard_field('title_sort')
author_sort = standard_field('author_sort')
comments = standard_field('comments')
category = standard_field('category')
publisher = standard_field('publisher')
isbn = standard_field('isbn')
cover = standard_field('cover')
series = standard_field('series')
series_index = standard_field('series_index')
rating = standard_field('rating')
tags = standard_field('tags')
def __init__(self):
raise NotImplementedError('Abstract base class')
def _initialize(self):
if not hasattr(self, 'soup'):
self.soup = BeautifulStoneSoup(u'''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE package
PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.2 Package//EN"
"http://openebook.org/dtds/oeb-1.2/oebpkg12.dtd">
<package unique-identifier="libprs_id">
<metadata>
<dc-metadata
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" />
</metadata>
</package>
''')
def _commit(self, doc):
    '''
    Serialise the DOM document C{doc} as UTF-8 and store a fresh parse of it
    in C{self.soup}.
    '''
    serialized = doc.toxml('utf-8')
    self.soup = BeautifulStoneSoup(serialized, fromEncoding='utf-8')
def _find_element(self, package, name, attrs=[]):
tags = package.getElementsByTagName(name)
for tag in tags:
match = True
for attr, vattr in attrs:
if tag.getAttribute(attr) != vattr:
match = False
break
if match:
return tag
return None
def _set_metadata_element(self, name, value, attrs=[],
                          type='dc-metadata', replace=False):
    '''
    Write one or more C{name} elements into the C{type} container of the
    package metadata, then re-parse the document into C{self.soup}.

    @param name: Tag name of the element(s) to create.
    @param value: A single string, or a sequence of strings (one element is
                  created per entry). None is treated as an empty sequence.
    @param attrs: For a string C{value}, a list of (attribute, value) pairs.
                  For a sequence C{value}, a list holding one such list per
                  entry; entries without a matching list get no attributes.
    @param type: Tag name of the container inside C{<metadata>}; it is
                 created when the document does not have one.
    @param replace: When False, every existing C{name} element in the
                    container is removed first. When True, only an existing
                    element whose attributes match the new one is removed.
    '''
    self._initialize()
    if value is None:
        value = []
    if isinstance(value, basestring):
        value = [value]
        attrs = [attrs]
    else:
        value = list(value)
        # zip() below stops at the shorter sequence, so a list of values
        # passed with the default (empty) attrs used to write nothing at
        # all. Pad with empty attribute lists so every value is kept.
        attrs = list(attrs) + [[] for _ in range(len(value) - len(attrs))]
    doc = dom.parseString(self.soup.__str__('UTF-8'))
    package = doc.documentElement
    metadata = package.getElementsByTagName('metadata')[0]
    containers = metadata.getElementsByTagName(type)
    if containers:
        dcm = containers[0]
    else:
        dcm = doc.createElement(type)
        metadata.appendChild(dcm)
    if not replace:
        for tag in dcm.getElementsByTagName(name):
            tag.parentNode.removeChild(tag)
            tag.unlink()
    for val, vattrs in zip(value, attrs):
        if replace:
            # Search the whole package, not just the container, for an
            # element carrying the same attributes and drop it.
            el = self._find_element(package, name, vattrs)
            if el:
                el.parentNode.removeChild(el)
                el.unlink()
        el = doc.createElement(name)
        el.appendChild(doc.createTextNode(val))
        for attr, vattr in vattrs:
            el.setAttribute(attr, vattr)
        dcm.appendChild(el)
    self._commit(doc)
def get_title(self):
title = self.soup.package.metadata.find('dc:title')
if title:
return self.ENTITY_PATTERN.sub(entity_to_unicode, title.string)
return self.default_title
return self.ENTITY_PATTERN.sub(entity_to_unicode, title.string).strip()
return self.default_title.strip()
def set_title(self, title):
    '''
    Store C{title} in the C{dc:title} element; an empty title is written as
    'Unknown'.
    '''
    self._set_metadata_element('dc:title', title or 'Unknown')
def get_authors(self):
creators = self.soup.package.metadata.findAll('dc:creator')
@ -142,9 +218,15 @@ class OPFReader(MetaInformation):
ans = []
for i in au:
ans.extend(i.split('&'))
return ans
return [a.strip() for a in ans]
return []
def set_authors(self, authors):
    '''
    Write one C{dc:Creator} element per author, each tagged with
    C{role="aut"}. An empty author list is replaced by ['Unknown'].
    '''
    names = authors if authors else ['Unknown']
    role_attrs = [[('role', 'aut')] for _ in names]
    self._set_metadata_element('dc:Creator', names, role_attrs)
def get_author_sort(self):
creators = self.soup.package.metadata.findAll('dc:creator')
for elem in creators:
@ -153,42 +235,99 @@ class OPFReader(MetaInformation):
role = elem.get('opf:role')
if role == 'aut':
fa = elem.get('file-as')
return self.ENTITY_PATTERN.sub(entity_to_unicode, fa) if fa else None
return self.ENTITY_PATTERN.sub(entity_to_unicode, fa).strip() if fa else None
return None
def set_author_sort(self, aus):
    '''
    Set the C{file-as} attribute of the first C{dc:Creator} element to
    C{aus} (an empty string when C{aus} is empty). When no authors are
    known, C{set_authors([])} is called first.
    '''
    sort_key = aus if aus else ''
    self._initialize()
    if not self.authors:
        self.set_authors([])
    doc = dom.parseString(self.soup.__str__('UTF-8'))
    creator = doc.documentElement.getElementsByTagName('dc:Creator')[0]
    creator.setAttribute('file-as', sort_key)
    self._commit(doc)
def get_title_sort(self):
    '''
    Return the stripped C{file-as} attribute of the first C{dc:title}
    element in the package, or None when the element or attribute is
    missing.
    '''
    node = self.soup.package.find('dc:title')
    if not node or not node.has_key('file-as'):
        return None
    return node['file-as'].strip()
def set_title_sort(self, title_sort):
    '''
    Set the C{file-as} attribute of the title element.

    The title element is located regardless of tag-name case, because
    C{set_title} writes C{dc:title} while this method used to look up
    C{dc:Title} only. When the document has no title element at all, one is
    created through the C{title} setter (which falls back to 'Unknown').

    @param title_sort: Sort string for the title; empty values are stored
                       as an empty string.
    '''
    if not title_sort:
        title_sort = ''
    self._initialize()

    def load():
        # Parse the current soup and return (document, title element or None).
        document = dom.parseString(self.soup.__str__('UTF-8'))
        for node in document.documentElement.getElementsByTagName('*'):
            if node.tagName.lower() == 'dc:title':
                return document, node
        return document, None

    doc, tit = load()
    if tit is None:
        # Check the element itself, not self.title: get_title() falls back
        # to a default title, which says nothing about the element existing.
        self.title = None
        doc, tit = load()
    tit.setAttribute('file-as', title_sort)
    self._commit(doc)
def get_comments(self):
comments = self.soup.find('dc:description')
if comments:
return self.ENTITY_PATTERN.sub(entity_to_unicode, comments.string)
return self.ENTITY_PATTERN.sub(entity_to_unicode, comments.string).strip()
return None
def set_comments(self, comments):
    '''
    Store C{comments} in the C{dc:Description} element; empty values are
    written as an empty string.
    '''
    self._set_metadata_element('dc:Description', comments or '')
def get_category(self):
category = self.soup.find('dc:type')
if category:
return self.ENTITY_PATTERN.sub(entity_to_unicode, category.string)
return self.ENTITY_PATTERN.sub(entity_to_unicode, category.string).strip()
return None
def set_category(self, category):
    '''
    Store C{category} in the C{dc:Type} element; empty values are written
    as an empty string.
    '''
    self._set_metadata_element('dc:Type', category or '')
def get_publisher(self):
publisher = self.soup.find('dc:publisher')
if publisher:
return self.ENTITY_PATTERN.sub(entity_to_unicode, publisher.string)
return self.ENTITY_PATTERN.sub(entity_to_unicode, publisher.string).strip()
return None
def set_publisher(self, category):
    '''
    Store the publisher name in the C{dc:Publisher} element; an empty value
    is written as 'Unknown'.

    @param category: The publisher name (the parameter name is historical).
    '''
    self._set_metadata_element('dc:Publisher', category or 'Unknown')
def get_isbn(self):
for item in self.soup.package.metadata.findAll('dc:identifier'):
scheme = item.get('scheme')
if not scheme:
scheme = item.get('opf:scheme')
if scheme is not None and scheme.lower() == 'isbn':
return item.string
return str(item.string).strip()
return None
def set_isbn(self, isbn):
    '''
    Write C{isbn} as a C{dc:Identifier} element with C{scheme="ISBN"},
    replacing an existing identifier with that scheme. Empty values are
    ignored.
    '''
    if not isbn:
        return
    scheme_attrs = [('scheme', 'ISBN')]
    self._set_metadata_element('dc:Identifier', isbn, scheme_attrs,
                               replace=True)
def get_libprs_id(self):
    '''
    Return the stripped text of the first C{dc:identifier} element whose
    C{scheme} attribute is 'libprs', or None when there is none.
    '''
    identifiers = self.soup.package.metadata.findAll('dc:identifier')
    for ident in identifiers:
        if not ident.has_key('scheme'):
            continue
        if ident['scheme'] != 'libprs':
            continue
        return str(ident.string).strip()
    return None
def set_libprs_id(self, val):
    '''
    Write C{str(val)} as a C{dc:Identifier} element with C{scheme="libprs"}
    and C{id="libprs_id"}, replacing an existing identifier with those
    attributes. Empty values are ignored.
    '''
    if not val:
        return
    id_attrs = [('scheme', 'libprs'), ('id', 'libprs_id')]
    self._set_metadata_element('dc:Identifier', str(val), id_attrs,
                               replace=True)
def get_cover(self):
guide = self.soup.package.find('guide')
if guide:
@ -200,7 +339,24 @@ class OPFReader(MetaInformation):
if type.lower() in ['cover', 'other.ms-coverimage-standard']:
return reference.get('href')
return None
def set_cover(self, path):
    '''
    Point the cover reference of the C{<guide>} at C{path}. The guide and a
    C{<reference type="cover">} element are created when missing; an
    existing cover reference only has its C{href} updated.
    '''
    self._initialize()
    doc = dom.parseString(self.soup.__str__('UTF-8'))
    package = doc.documentElement
    guides = package.getElementsByTagName('guide')
    if not guides:
        guide = doc.createElement('guide')
        package.appendChild(guide)
    else:
        guide = guides[0]
    reference = self._find_element(guide, 'reference', [('type', 'cover')])
    if not reference:
        reference = doc.createElement('reference')
        guide.appendChild(reference)
        reference.setAttribute('type', 'cover')
    reference.setAttribute('href', path)
    self._commit(doc)
def possible_cover_prefixes(self):
isbn, ans = [], []
@ -213,6 +369,107 @@ class OPFReader(MetaInformation):
ans.append(item[1].replace('-', ''))
return ans
def get_series(self):
    '''
    Return the stripped text of the C{series} element inside C{x-metadata},
    or None when either element is missing.
    '''
    extra = self.soup.package.metadata.find('x-metadata')
    series = extra.find('series') if extra else None
    return str(series.string).strip() if series else None
def set_series(self, val):
    '''
    Store C{val} in the C{series} element of the C{x-metadata} container;
    empty values are written as an empty string.
    '''
    self._set_metadata_element('series', val or '', type='x-metadata')
def get_series_index(self):
    '''
    Return the C{series-index} value stored in C{x-metadata} as an int.

    @return: The parsed integer, or None when the C{x-metadata} or
             C{series-index} element is missing or its text is not a valid
             integer.
    '''
    xm = self.soup.package.metadata.find('x-metadata')
    if not xm:
        return None
    s = xm.find('series-index')
    if not s:
        return None
    try:
        return int(str(s.string).strip())
    except ValueError:
        # Only malformed text is expected here (this also covers text that
        # cannot be encoded by str()); anything else should propagate rather
        # than be hidden by a bare except.
        return None
def set_series_index(self, val):
    '''
    Store C{str(val)} in the C{series-index} element of the C{x-metadata}
    container; empty values are written as 1.
    '''
    index = val if val else 1
    self._set_metadata_element('series-index', str(index), type='x-metadata')
def get_rating(self):
    '''
    Return the C{rating} value stored in C{x-metadata} as an int.

    @return: The parsed integer, or None when the C{x-metadata} or
             C{rating} element is missing or its text is not a valid
             integer.
    '''
    xm = self.soup.package.metadata.find('x-metadata')
    if not xm:
        return None
    s = xm.find('rating')
    if not s:
        return None
    try:
        return int(str(s.string).strip())
    except ValueError:
        # Only malformed text is expected here (this also covers text that
        # cannot be encoded by str()); anything else should propagate rather
        # than be hidden by a bare except.
        return None
def set_rating(self, val):
    '''
    Store C{str(val)} in the C{rating} element of the C{x-metadata}
    container; empty values are written as 0.
    '''
    rating = val if val else 0
    self._set_metadata_element('rating', str(rating), type='x-metadata')
def get_tags(self):
    '''
    Return the stripped text of every C{dc:subject} element in the document
    as a list of unicode strings; elements without text are skipped.
    '''
    subjects = self.soup.findAll('dc:subject')
    texts = (subject.string for subject in subjects)
    return [unicode(text).strip() for text in texts if text]
def set_tags(self, tags):
    '''
    Replace the C{dc:Subject} elements with one element per tag.

    @param tags: A sequence of tag strings, a single tag string, or an empty
                 value (None, empty list) to remove all tags.
    '''
    if not tags:
        tags = []
    elif isinstance(tags, basestring):
        tags = [tags]
    else:
        tags = list(tags)
    # _set_metadata_element pairs each value with its own attribute list;
    # passing none made the pairing empty, so no tag was ever written.
    self._set_metadata_element('dc:Subject', tags, [[] for _ in tags])
def write(self, stream):
    '''
    Write the prettified, UTF-8 encoded document to C{stream}.
    '''
    rendered = self.soup.prettify('utf-8')
    stream.write(rendered)
class OPFReader(OPF):
    '''
    OPF metadata parsed from an existing file or stream.
    '''

    def __init__(self, stream, dir=os.getcwd()):
        '''
        @param stream: Either an object with a C{read} method or a path. A
                       path is opened in binary mode and always closed again,
                       even when reading or parsing fails. Streams passed in
                       by the caller are left open.
        @param dir: Not used by this constructor; kept so that existing
                    callers passing it keep working.
        '''
        manage = not hasattr(stream, 'read')
        if manage:
            stream = open(stream, 'rb')
        try:
            # Fallback title returned by get_title() when the document has
            # no dc:title element.
            self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown'
            if hasattr(stream, 'seek'):
                stream.seek(0)
            self.soup = BeautifulStoneSoup(stream.read())
        finally:
            if manage:
                stream.close()
class OPFCreator(OPF):
    '''
    OPF document built from an existing metadata object.
    '''

    # Fields copied from the source object only when they hold a true value,
    # in the order in which they are written.
    OPTIONAL_FIELDS = ('category', 'comments', 'publisher', 'rating',
                       'series', 'series_index', 'tags', 'isbn')

    def __init__(self, mi):
        '''
        @param mi: Object providing title, authors and the optional metadata
                   attributes; C{libprs_id} is copied when the attribute
                   exists.
        '''
        self.title = mi.title
        self.authors = mi.authors
        for field in self.OPTIONAL_FIELDS:
            value = getattr(mi, field)
            if value:
                setattr(self, field, value)
        if hasattr(mi, 'libprs_id'):
            self.libprs_id = mi.libprs_id
def main(args=sys.argv):
print OPFReader(open(args[1], 'rb'))

View File

@ -15,7 +15,7 @@
"""
Edit metadata in RTF files.
"""
import re, cStringIO, sys, copy
import re, cStringIO, sys
from libprs500.ebooks.metadata import MetaInformation, get_parser
@ -118,13 +118,7 @@ def create_metadata(stream, options):
stream.seek(0)
stream.write(ans)
def set_metadata(stream, mi):
mi = copy.deepcopy(mi)
mi.authors = ', '.join(mi.authors)
mi.comment = mi.comments
set_metadata_(stream, mi)
def set_metadata_(stream, options):
def set_metadata(stream, options):
'''
Modify/add RTF metadata in stream
@param options: Object with metadata attributes title, author, comment, category
@ -147,7 +141,7 @@ def set_metadata_(stream, options):
src = pat.sub(r'{\\title ' + title + r'}', src)
else:
src = add_metadata_item(src, 'title', title)
comment = options.comment
comment = options.comments
if comment != None:
comment = comment.encode('ascii', 'replace')
pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
@ -157,6 +151,7 @@ def set_metadata_(stream, options):
src = add_metadata_item(src, 'subject', comment)
author = options.authors
if author != None:
author = ', '.join(author)
author = author.encode('ascii', 'ignore')
pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)
if pat.search(src):
@ -186,7 +181,10 @@ def main(args=sys.argv):
parser.print_help()
sys.exit(1)
stream = open(args[1], 'r+b')
set_metadata_(stream, options)
if options.authors:
options.authors = options.authors.split(',')
options.comments = options.comment
set_metadata(stream, options)
mi = get_metadata(stream)
return mi

View File

@ -21,6 +21,7 @@ from zlib import compress, decompress
from libprs500 import sanitize_file_name
from libprs500.ebooks.metadata.meta import set_metadata
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata import MetaInformation
class Concatenate(object):
@ -1087,6 +1088,25 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
self.conn.execute('DELETE FROM books WHERE id=?', (id,))
self.conn.commit()
def get_metadata(self, idx):
    '''
    Collect the metadata of the book at C{idx} into a MetaInformation
    object: title, authors, author sort, comments, publisher, tags, series
    (with index), rating, ISBN and the database id as C{libprs_id}.
    '''
    author_string = self.authors(idx)
    authors = author_string.split(',') if author_string else author_string
    mi = MetaInformation(self.title(idx), authors)
    mi.author_sort = self.author_sort(idx)
    mi.comments = self.comments(idx)
    mi.publisher = self.publisher(idx)
    tag_string = self.tags(idx)
    if tag_string:
        mi.tags = [tag.strip() for tag in tag_string.split(',')]
    mi.series = self.series(idx)
    if mi.series:
        # The index is only looked up for books that belong to a series.
        mi.series_index = self.series_index(idx)
    mi.rating = self.rating(idx)
    book_id = self.id(idx)
    mi.isbn = self.isbn(book_id)
    mi.libprs_id = book_id
    return mi
def export_to_dir(self, dir, indices, byauthor=False):
if not os.path.exists(dir):
raise IOError('Target directory does not exist: '+dir)
@ -1113,6 +1133,17 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
id = str(self.id(idx))
if not os.path.exists(tpath):
os.mkdir(tpath)
mi = OPFCreator(self.get_metadata(idx))
cover = self.cover(idx)
if cover is not None:
f = open(os.path.join(tpath, 'cover.jpg'), 'wb')
f.write(cover)
mi.cover = 'cover.jpg'
f.close()
f = open(os.path.join(tpath, 'metadata.opf'), 'wb')
mi.write(f)
f.close()
for fmt in self.formats(idx).split(','):
data = self.format(idx, fmt)
name = au + ' - ' + title if byauthor else title + ' - ' + au
@ -1120,16 +1151,13 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
f = open(os.path.join(tpath, sanitize_file_name(fname)), 'w+b')
f.write(data)
f.flush()
aum = self.authors(idx)
if aum: aum = aum.split(',')
mi = MetaInformation(self.title(idx), aum)
mi.author_sort = self.author_sort(idx)
try:
set_metadata(f, mi, fmt.lower())
except:
print 'Error setting metadata for book:', mi.title
traceback.print_exc()
f.close()
if __name__ == '__main__':
db = LibraryDatabase('/home/kovid/library1.db')