IGN:Created lxml based OPF framework for epub-meta and cleaned up rtf2html code

commit 66a6d60901, parent 0bfde7ee2c

@@ -2,7 +2,7 @@
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
+from lxml import etree # Needed on OSX to ensure the correct libxml2 is loaded
 import sys, os, re, logging, time, subprocess, mechanize, atexit
 from htmlentitydefs import name2codepoint
 from math import floor

@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 __appname__ = 'calibre'
-__version__ = '0.4.84b9'
+__version__ = '0.4.84b10'
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 '''
 Various run time constants.

@@ -29,7 +29,13 @@ def detect(aBuf):
     return u.result
 
 # Added by Kovid
-def xml_to_unicode(raw, verbose=False):
+ENCODING_PATS = [
+                 re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
+                 re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)
+                ]
+ENTITY_PATTERN = re.compile(r'&(\S+?);')
+
+def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entities=False):
     '''
     Force conversion of byte string to unicode. Tries to look for XML/HTML
     encoding declaration first, if not found uses the chardet library and
@@ -41,11 +47,14 @@ def xml_to_unicode(raw, verbose=False):
         return u'', encoding
     if isinstance(raw, unicode):
         return raw, encoding
-    match = re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE).search(raw)
-    if match is None:
-        match = re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"]', re.IGNORECASE).search(raw)
-    if match is not None:
-        encoding = match.group(1)
+    for pat in ENCODING_PATS:
+        match = pat.search(raw)
+        if match:
+            encoding = match.group(1)
+            break
+    if strip_encoding_pats:
+        for pat in ENCODING_PATS:
+            raw = pat.sub('', raw)
     if encoding is None:
         try:
             chardet = detect(raw)
@@ -65,4 +74,12 @@ def xml_to_unicode(raw, verbose=False):
         encoding = CHARSET_ALIASES[encoding]
     if encoding == 'ascii':
         encoding = 'utf-8'
-    return raw.decode(encoding, 'ignore'), encoding
+    raw = raw.decode(encoding, 'replace')
+
+    if resolve_entities:
+        from calibre import entity_to_unicode
+        from functools import partial
+        f = partial(entity_to_unicode, exceptions=['amp', 'apos', 'quot', 'lt', 'gt'])
+        raw = ENTITY_PATTERN.sub(f, raw)
+
+    return raw, encoding
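
Note: a minimal sketch of how the reworked xml_to_unicode might be called with the new keyword arguments (the input file name is made up for illustration; the signature is the one added in this hunk):

    from calibre.ebooks.chardet import xml_to_unicode

    raw = open('content.opf', 'rb').read()   # any XML/HTML byte string
    # Decode to unicode, strip the encoding declaration so lxml will accept
    # the string, and resolve named entities such as &copy; to characters.
    text, encoding = xml_to_unicode(raw, strip_encoding_pats=True,
                                    resolve_entities=True)
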
@@ -13,7 +13,7 @@ from lxml.etree import XPath
 get_text = XPath("//text()")
 
 from calibre import LoggingInterface, unicode_path
-from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS
 from calibre.utils.config import Config, StringConfig
 from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
 from calibre.ebooks.metadata import MetaInformation
@@ -287,10 +287,6 @@ class PreProcessor(object):
 
 
 class Parser(PreProcessor, LoggingInterface):
-
-    ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
-                     re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
-
     def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
         LoggingInterface.__init__(self, logging.getLogger(name))
         self.setup_cli_handler(opts.verbose)
@@ -332,7 +328,7 @@ class Parser(PreProcessor, LoggingInterface):
         src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
         src = self.preprocess(src)
         # lxml chokes on unicode input when it contains encoding declarations
-        for pat in self.ENCODING_PATS:
+        for pat in ENCODING_PATS:
             src = pat.sub('', src)
         try:
            self.root = html.document_fromstring(src)
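
Note: the comment in the hunk above is why ENCODING_PATS is now shared from calibre.ebooks.chardet: as the source comment says, lxml chokes on unicode input that still carries an encoding declaration, so the declaration is stripped before parsing. A small sketch of that strip-then-parse pattern (the sample markup is invented):

    from lxml import html
    from calibre.ebooks.chardet import ENCODING_PATS

    src = u'<?xml version="1.0" encoding="utf-8"?><html><body><p>Hi</p></body></html>'
    for pat in ENCODING_PATS:
        src = pat.sub('', src)   # drop the declaration before handing lxml a unicode string
    root = html.document_fromstring(src)
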
@@ -1,11 +1,12 @@
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-import os, sys, tempfile, subprocess, shutil, logging, glob
+import os, sys, tempfile, shutil, logging, glob
 
+from lxml import etree
+
 from calibre.ebooks.lrf import option_parser as lrf_option_parser
 from calibre.ebooks.metadata.meta import get_metadata
 from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
-from calibre.ebooks import ConversionError
 from calibre import isosx, setup_cli_handlers, __appname__
 from calibre.libwand import convert, WandException
 from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
@@ -37,32 +38,6 @@ def convert_images(html, logger):
             continue
     return html
 
-def generate_html(rtfpath, logger):
-    tdir = tempfile.mkdtemp(prefix=__appname__+'_')
-    cwd = os.path.abspath(os.getcwd())
-    os.chdir(tdir)
-    try:
-        logger.info('Converting to HTML...')
-        sys.stdout.flush()
-        handle, path = tempfile.mkstemp(dir=tdir, suffix='.html')
-        file = os.fdopen(handle, 'wb')
-        cmd = ' '.join([UNRTF, '"'+rtfpath+'"'])
-        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
-                             stderr=subprocess.PIPE)
-        raw = p.stdout.read()
-        ret = p.wait()
-        if ret != 0:
-            if len(raw) > 1000: #unrtf crashes occassionally on OSX and windows but still convert correctly
-                raw += '</body>\n</html>'
-            else:
-                logger.critical(p.stderr.read())
-                raise ConversionError, 'unrtf failed with error code: %d'%(ret,)
-        file.write(convert_images(raw, logger))
-        file.close()
-        return path
-    finally:
-        os.chdir(cwd)
-
 def process_file(path, options, logger=None):
     if logger is None:
         level = logging.DEBUG if options.verbose else logging.INFO
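
Note: for reference, the deleted generate_html drove the external unrtf binary and tolerated a non-zero exit code as long as enough output had been produced. A generic sketch of that run-and-salvage pattern (the helper name run_converter and the exception type are invented; the 1000-byte heuristic comes from the removed code):

    import subprocess

    def run_converter(cmd, logger):
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        out = p.stdout.read()
        ret = p.wait()
        if ret != 0 and len(out) <= 1000:
            # Too little output to be a usable conversion: treat as a real failure
            logger.critical(p.stderr.read())
            raise Exception('converter failed with error code: %d' % ret)
        return out
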
@@ -72,7 +47,7 @@ def process_file(path, options, logger=None):
     f = open(rtf, 'rb')
     mi = get_metadata(f, 'rtf')
     f.close()
-    html = generate_html2(rtf, logger)
+    html = generate_html(rtf, logger)
     tdir = os.path.dirname(html)
     cwd = os.getcwdu()
     try:
@@ -162,8 +137,7 @@ def generate_xml(rtfpath):
     return ofile
 
 
-def generate_html2(rtfpath, logger):
-    from lxml import etree
+def generate_html(rtfpath, logger):
     logger.info('Converting RTF to XML...')
     xml = generate_xml(rtfpath)
     tdir = os.path.dirname(xml)

@@ -171,14 +171,13 @@ class MetaInformation(object):
             if hasattr(mi, attr):
                 setattr(ans, attr, getattr(mi, attr))
 
-
     def __init__(self, title, authors=[_('Unknown')]):
         '''
         @param title: title or "Unknown" or a MetaInformation object
         @param authors: List of strings or []
         '''
         mi = None
-        if isinstance(title, MetaInformation):
+        if hasattr(title, 'title') and hasattr(title, 'authors'):
             mi = title
             title = mi.title
             authors = mi.authors
@@ -186,26 +185,15 @@ class MetaInformation(object):
         self.author = authors # Needed for backward compatibility
         #: List of strings or []
         self.authors = authors
-        #: Sort text for author
-        self.author_sort = None if not mi else mi.author_sort
-        self.title_sort = None if not mi else mi.title_sort
-        self.comments = None if not mi else mi.comments
-        self.category = None if not mi else mi.category
-        self.publisher = None if not mi else mi.publisher
-        self.series = None if not mi else mi.series
-        self.series_index = None if not mi else mi.series_index
-        self.rating = None if not mi else mi.rating
-        self.isbn = None if not mi else mi.isbn
-        self.tags = [] if not mi else mi.tags
-        self.language = None if not mi else mi.language # Typically a string describing the language
+        self.tags = getattr(mi, 'tags', [])
         #: mi.cover_data = (ext, data)
-        self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None)
-        self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None
-        self.manifest = getattr(mi, 'manifest', None)
-        self.toc = getattr(mi, 'toc', None)
-        self.spine = getattr(mi, 'spine', None)
-        self.guide = getattr(mi, 'guide', None)
-        self.cover = getattr(mi, 'cover', None)
+        self.cover_data = getattr(mi, 'cover_data', (None, None))
+        for x in ('author_sort', 'title_sort', 'comments', 'category', 'publisher',
+                  'series', 'series_index', 'rating', 'isbn', 'language',
+                  'application_id', 'manifest', 'toc', 'spine', 'guide', 'cover'
+                  ):
+            setattr(self, x, getattr(mi, x, None))
 
     def smart_update(self, mi):
         '''
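
Note: MetaInformation now duck-types its first argument (any object with title and authors attributes is treated as a metadata source) and copies the optional fields with a single getattr loop. A sketch of what that allows (the FakeMetadata class and its values are invented for illustration):

    from calibre.ebooks.metadata import MetaInformation

    class FakeMetadata(object):
        # Not a MetaInformation subclass; the hasattr() checks are enough.
        title = 'Some Book'
        authors = ['An Author']
        isbn = '0000000000'

    mi = MetaInformation(FakeMetadata())
    print mi.title, mi.authors, mi.isbn   # isbn picked up via getattr(..., None)
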
@@ -6,14 +6,14 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''Read meta information from epub files'''
 
 import sys, os
 
-from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
 from cStringIO import StringIO
 from contextlib import closing
 
+from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
 from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
-from calibre.ebooks.metadata.opf import OPF, OPFReader, OPFCreator
 from calibre.ebooks.metadata import get_parser, MetaInformation
+from calibre.ebooks.metadata.opf2 import OPF
 
 class EPubException(Exception):
     pass
@@ -49,14 +49,15 @@ class OCF(object):
     def __init__(self):
         raise NotImplementedError('Abstract base class')
 
+
 class OCFReader(OCF):
     def __init__(self):
         try:
             mimetype = self.open('mimetype').read().rstrip()
             if mimetype != OCF.MIMETYPE:
-                raise EPubException
-        except (KeyError, EPubException):
-            raise EPubException("not an .epub OCF container")
+                print 'WARNING: Invalid mimetype declaration', mimetype
+        except:
+            print 'WARNING: Epub doesn\'t contain a mimetype declaration'
 
         try:
             with closing(self.open(OCF.CONTAINER_PATH)) as f:
@@ -66,37 +67,26 @@ class OCFReader(OCF):
 
         try:
             with closing(self.open(self.container[OPF.MIMETYPE])) as f:
-                self.opf = OPFReader(f, self.root)
+                self.opf = OPF(f, self.root)
         except KeyError:
             raise EPubException("missing OPF package file")
 
 class OCFZipReader(OCFReader):
-    def __init__(self, stream, mode='r'):
+    def __init__(self, stream, mode='r', root=None):
         try:
             self.archive = ZipFile(stream, mode=mode)
         except BadZipfile:
             raise EPubException("not a ZIP .epub OCF container")
-        self.root = getattr(stream, 'name', os.getcwd())
+        self.root = root
+        if self.root is None:
+            self.root = os.getcwdu()
+            if hasattr(stream, 'name'):
+                self.root = os.path.abspath(os.path.dirname(stream.name))
         super(OCFZipReader, self).__init__()
 
     def open(self, name, mode='r'):
         return StringIO(self.archive.read(name))
 
-class OCFZipWriter(object):
-
-    def __init__(self, stream):
-        reader = OCFZipReader(stream)
-        self.opf = reader.container[OPF.MIMETYPE]
-        self.stream = stream
-        self.root = getattr(stream, 'name', os.getcwd())
-
-    def set_metadata(self, mi):
-        stream = StringIO()
-        opf = OPFCreator(self.root, mi)
-        opf.render(stream)
-        stream.seek(0)
-        safe_replace(self.stream, self.opf, stream)
-
 class OCFDirReader(OCFReader):
     def __init__(self, path):
         self.root = path
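
Note: OCFZipReader now takes an explicit root directory; when none is passed it falls back to the directory of the stream's name, or the current working directory for nameless streams. A sketch of both call styles (assuming this file is calibre.ebooks.metadata.epub; the epub file name is illustrative):

    import os
    from calibre.ebooks.metadata.epub import OCFZipReader

    reader = OCFZipReader(open('book.epub', 'rb'))                    # root derived from the file name
    explicit = OCFZipReader(open('book.epub', 'rb'), root=os.getcwdu())
    print reader.opf.title                                            # parsed via the new lxml based OPF
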
@@ -111,12 +101,17 @@ def get_metadata(stream):
     return OCFZipReader(stream).opf
 
 def set_metadata(stream, mi):
-    OCFZipWriter(stream).set_metadata(mi)
+    reader = OCFZipReader(stream, root=os.getcwdu())
+    reader.opf.smart_update(mi)
+    newopf = StringIO(reader.opf.render())
+    safe_replace(stream, reader.container[OPF.MIMETYPE], newopf)
+    print newopf.getvalue()
 
 def option_parser():
     parser = get_parser('epub')
     parser.remove_option('--category')
-    parser.add_option('--tags', default=None, help=_('A comma separated list of tags to set'))
+    parser.add_option('--tags', default=None,
+                      help=_('A comma separated list of tags to set'))
     return parser
 
 def main(args=sys.argv):
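
Note: set_metadata no longer rebuilds the OPF with OPFCreator; it parses the existing package with the new OPF class, merges the supplied fields via smart_update, re-renders it and splices the result back into the zip with safe_replace. A usage sketch mirroring main() below (module path and file name as above, illustrative):

    from calibre.ebooks.metadata import MetaInformation
    from calibre.ebooks.metadata.epub import set_metadata

    stream = open('book.epub', 'r+b')        # must be opened read/write
    mi = MetaInformation('New Title', ['New Author'])
    stream.seek(0)                           # main() rewinds before writing
    set_metadata(stream, mi)
    stream.close()
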
@@ -126,7 +121,7 @@ def main(args=sys.argv):
         parser.print_help()
         return 1
     stream = open(args[1], 'r+b')
-    mi = MetaInformation(OCFZipReader(stream).opf)
+    mi = MetaInformation(OCFZipReader(stream, root=os.getcwdu()).opf)
     if opts.title:
         mi.title = opts.title
     if opts.authors:
@@ -136,8 +131,10 @@ def main(args=sys.argv):
     if opts.comment:
         mi.comments = opts.comment
 
+    stream.seek(0)
     set_metadata(stream, mi)
     print unicode(mi)
+    stream.close()
     return 0
 
 if __name__ == '__main__':

src/calibre/ebooks/metadata/opf2.py (new file, 220 lines)
@@ -0,0 +1,220 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'''
+lxml based OPF parser.
+'''
+
+import sys, unittest, functools, os
+
+from lxml import etree
+
+from calibre.ebooks.chardet import xml_to_unicode
+
+class MetadataField(object):
+
+    def __init__(self, name, is_dc=True):
+        self.name = name
+        self.is_dc = is_dc
+
+    def __get__(self, obj, type=None):
+        ans = obj.get_metadata_element(self.name)
+        if ans is None:
+            return u''
+        return obj.get_text(ans)
+
+    def __set__(self, obj, val):
+        elem = obj.get_metadata_element(self.name)
+        if elem is None:
+            elem = obj.create_metadata_element(self.name, ns='dc' if self.is_dc else 'opf')
+        elem.text = unicode(val)
+
+class OPF(object):
+    MIMETYPE = 'application/oebps-package+xml'
+    PARSER = etree.XMLParser(recover=True)
+    NAMESPACES = {
+        None  : "http://www.idpf.org/2007/opf",
+        'dc'  : "http://purl.org/dc/elements/1.1/",
+        'opf' : "http://www.idpf.org/2007/opf",
+    }
+    xpn = NAMESPACES.copy()
+    xpn.pop(None)
+    xpn['re'] = 'http://exslt.org/regular-expressions'
+    XPath = functools.partial(etree.XPath, namespaces=xpn)
+    TEXT = XPath('string()')
+
+
+    metadata_path = XPath('/opf:package/opf:metadata')
+    metadata_elem_path = XPath('/opf:package/opf:metadata/*[re:match(name(), $name, "i")]')
+    authors_path = XPath('/opf:package/opf:metadata/*' + \
+        '[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut")]')
+    tags_path = XPath('/opf:package/opf:metadata/*[re:match(name(), "subject", "i")]')
+    isbn_path = XPath('/opf:package/opf:metadata/*[re:match(name(), "identifier", "i") and '+
+        '(re:match(@scheme, "isbn", "i") or re:match(@opf:scheme, "isbn", "i"))]')
+
+    title = MetadataField('title')
+    publisher = MetadataField('publisher')
+    language = MetadataField('language')
+    comments = MetadataField('description')
+    category = MetadataField('category')
+    series = MetadataField('series', is_dc=False)
+    series_index = MetadataField('series_index', is_dc=False)
+    rating = MetadataField('rating', is_dc=False)
+
+
+    def __init__(self, stream, basedir):
+        self.basedir = basedir
+        raw, self.encoding = xml_to_unicode(stream.read(), strip_encoding_pats=True, resolve_entities=True)
+
+        self.tree = etree.fromstring(raw, self.PARSER)
+        self.metadata = self.metadata_path(self.tree)
+        if not self.metadata:
+            raise ValueError('Malformed OPF file: No <metadata> element')
+        self.metadata = self.metadata[0]
+
+    def get_text(self, elem):
+        return u''.join(self.TEXT(elem))
+
+    @apply
+    def authors():
+
+        def fget(self):
+            ans = []
+            for elem in self.authors_path(self.tree):
+                ans.extend([x.strip() for x in self.get_text(elem).split(',')])
+            return ans
+
+        def fset(self, val):
+            remove = list(self.authors_path(self.tree))
+            for elem in remove:
+                self.metadata.remove(elem)
+            for author in val:
+                elem = self.create_metadata_element('creator', ns='dc',
+                        attrib={'{%s}role'%self.NAMESPACES['opf']:'aut'})
+                elem.text = author
+
+        return property(fget=fget, fset=fset)
+
+    @apply
+    def author_sort():
+
+        def fget(self):
+            matches = self.authors_path(self.tree)
+            if matches:
+                ans = matches[0].get('opf:file-as', None)
+                return ans if ans else matches[0].get('file-as', None)
+
+        def fset(self, val):
+            matches = self.authors_path(self.tree)
+            if matches:
+                matches[0].set('file-as', unicode(val))
+
+        return property(fget=fget, fset=fset)
+
+    @apply
+    def tags():
+
+        def fget(self):
+            ans = []
+            for tag in self.tags_path(self.tree):
+                ans.append(self.get_text(tag))
+            return ans
+
+        def fset(self, val):
+            for tag in list(self.tags_path(self.tree)):
+                self.metadata.remove(tag)
+            for tag in val:
+                elem = self.create_metadata_element('subject', ns='dc')
+                elem.text = unicode(tag)
+
+        return property(fget=fget, fset=fset)
+
+    @apply
+    def isbn():
+
+        def fget(self):
+            for match in self.isbn_path(self.tree):
+                return match.text if match.text else None
+
+        def fset(self, val):
+            matches = self.isbn_path(self.tree)
+            if not matches:
+                matches = [self.create_metadata_element('dc:identifier',
+                        attrib={'{%s}scheme'%self.NAMESPACES['opf']:'ISBN'})]
+            matches[0].text = unicode(val)
+        return property(fget=fget, fset=fset)
+
+    def get_metadata_element(self, name):
+        matches = self.metadata_elem_path(self.tree, name=name)
+        if matches:
+            return matches[0]
+
+    def create_metadata_element(self, name, attrib=None, ns='opf'):
+        elem = etree.SubElement(self.metadata, '{%s}%s'%(self.NAMESPACES[ns], name),
+                attrib=attrib, nsmap=self.NAMESPACES)
+        elem.tail = '\n'
+        return elem
+
+    def render(self, encoding='utf-8'):
+        return etree.tostring(self.tree, encoding='utf-8', pretty_print=True)
+
+    def smart_update(self, mi):
+        for attr in ('author_sort', 'title_sort', 'comments', 'category',
+                     'publisher', 'series', 'series_index', 'rating',
+                     'isbn', 'language', 'tags'):
+            val = getattr(mi, attr, None)
+            if val or val == []:
+                setattr(self, attr, val)
+
+class OPFTest(unittest.TestCase):
+
+    def setUp(self):
+        import cStringIO
+        self.stream = cStringIO.StringIO(
+'''\
+<?xml version="1.0" encoding="UTF-8"?>
+<package version="2.0" xmlns="http://www.idpf.org/2007/opf" >
+<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
+<dc:title>A Cool &amp; &copy; ß Title</dc:title>
+<creator opf:role="aut" file-as="Monkey">Monkey Kitchen, Next</creator>
+<dc:subject>One</dc:subject><dc:subject>Two</dc:subject>
+<dc:identifier scheme="ISBN">123456789</dc:identifier>
+</metadata>
+</package>
+'''
+        )
+        self.opf = OPF(self.stream, os.getcwd())
+
+    def testReading(self):
+        opf = self.opf
+        self.assertEqual(opf.title, u'A Cool & \xa9 \xdf Title')
+        self.assertEqual(opf.authors, u'Monkey Kitchen,Next'.split(','))
+        self.assertEqual(opf.author_sort, 'Monkey')
+        self.assertEqual(opf.tags, ['One', 'Two'])
+        self.assertEqual(opf.isbn, '123456789')
+
+
+    def testWriting(self):
+        for test in [('title', 'New & Title'), ('authors', ['One', 'Two']),
+                     ('author_sort', "Kitchen"), ('tags', ['Three']),
+                     ('isbn', 'a'), ('rating', '3')]:
+            setattr(self.opf, *test)
+            self.assertEqual(getattr(self.opf, test[0]), test[1])
+
+        self.opf.render()
+
+def suite():
+    return unittest.TestLoader().loadTestsFromTestCase(OPFTest)
+
+def test():
+    unittest.TextTestRunner(verbosity=2).run(suite())
+
+
+
+def main(args=sys.argv):
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(test())
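
Note: in the new opf2.OPF class most simple fields are MetadataField descriptors: reading finds the matching metadata element with a case-insensitive XPath and returns its text, while writing creates the element if necessary and sets its text; authors, tags and isbn are properties built the same way. A read/write sketch along the lines of the bundled OPFTest (the OPF snippet is a shortened variant of the test fixture):

    import cStringIO, os
    from calibre.ebooks.metadata.opf2 import OPF

    stream = cStringIO.StringIO('''<?xml version="1.0" encoding="UTF-8"?>
    <package version="2.0" xmlns="http://www.idpf.org/2007/opf">
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
    <dc:title>Old Title</dc:title>
    <dc:subject>One</dc:subject>
    </metadata>
    </package>''')
    opf = OPF(stream, os.getcwd())
    print opf.title, opf.tags       # -> Old Title ['One']
    opf.title = 'New Title'         # MetadataField.__set__ rewrites dc:title
    opf.tags = ['Two', 'Three']     # property fset replaces the dc:subject elements
    print opf.render()              # serialised back out by lxml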