diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index b948c1b7fa..8b9221410d 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' __docformat__ = 'restructuredtext en' - +from lxml import etree # Needed on OSX to ensure the correct libxml2 is loaded import sys, os, re, logging, time, subprocess, mechanize, atexit from htmlentitydefs import name2codepoint from math import floor diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 3fd76f21d9..2f3b8de9fc 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' __appname__ = 'calibre' -__version__ = '0.4.84b9' +__version__ = '0.4.84b10' __author__ = "Kovid Goyal " ''' Various run time constants. diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py index 7b2b89a5fa..26fd84bee2 100644 --- a/src/calibre/ebooks/chardet/__init__.py +++ b/src/calibre/ebooks/chardet/__init__.py @@ -29,7 +29,13 @@ def detect(aBuf): return u.result # Added by Kovid -def xml_to_unicode(raw, verbose=False): +ENCODING_PATS = [ + re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE), + re.compile(r'', re.IGNORECASE) + ] +ENTITY_PATTERN = re.compile(r'&(\S+?);') + +def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entities=False): ''' Force conversion of byte string to unicode. 
Tries to look for XML/HTML encoding declaration first, if not found uses the chardet library and @@ -41,11 +47,14 @@ def xml_to_unicode(raw, verbose=False): return u'', encoding if isinstance(raw, unicode): return raw, encoding - match = re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE).search(raw) - if match is None: - match = re.compile(r']+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE), - re.compile(r'', re.IGNORECASE)] - - def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'): LoggingInterface.__init__(self, logging.getLogger(name)) self.setup_cli_handler(opts.verbose) @@ -332,7 +328,7 @@ class Parser(PreProcessor, LoggingInterface): src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace') src = self.preprocess(src) # lxml chokes on unicode input when it contains encoding declarations - for pat in self.ENCODING_PATS: + for pat in ENCODING_PATS: src = pat.sub('', src) try: self.root = html.document_fromstring(src) diff --git a/src/calibre/ebooks/lrf/rtf/convert_from.py b/src/calibre/ebooks/lrf/rtf/convert_from.py index 1fd8314c96..2b7693babc 100644 --- a/src/calibre/ebooks/lrf/rtf/convert_from.py +++ b/src/calibre/ebooks/lrf/rtf/convert_from.py @@ -1,11 +1,12 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import os, sys, tempfile, subprocess, shutil, logging, glob +import os, sys, tempfile, shutil, logging, glob + +from lxml import etree from calibre.ebooks.lrf import option_parser as lrf_option_parser from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre.ebooks import ConversionError from calibre import isosx, setup_cli_handlers, __appname__ from calibre.libwand import convert, WandException from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup @@ -37,32 +38,6 @@ def convert_images(html, logger): continue return html -def generate_html(rtfpath, logger): - tdir = 
tempfile.mkdtemp(prefix=__appname__+'_') - cwd = os.path.abspath(os.getcwd()) - os.chdir(tdir) - try: - logger.info('Converting to HTML...') - sys.stdout.flush() - handle, path = tempfile.mkstemp(dir=tdir, suffix='.html') - file = os.fdopen(handle, 'wb') - cmd = ' '.join([UNRTF, '"'+rtfpath+'"']) - p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - raw = p.stdout.read() - ret = p.wait() - if ret != 0: - if len(raw) > 1000: #unrtf crashes occassionally on OSX and windows but still convert correctly - raw += '\n' - else: - logger.critical(p.stderr.read()) - raise ConversionError, 'unrtf failed with error code: %d'%(ret,) - file.write(convert_images(raw, logger)) - file.close() - return path - finally: - os.chdir(cwd) - def process_file(path, options, logger=None): if logger is None: level = logging.DEBUG if options.verbose else logging.INFO @@ -72,7 +47,7 @@ def process_file(path, options, logger=None): f = open(rtf, 'rb') mi = get_metadata(f, 'rtf') f.close() - html = generate_html2(rtf, logger) + html = generate_html(rtf, logger) tdir = os.path.dirname(html) cwd = os.getcwdu() try: @@ -162,8 +137,7 @@ def generate_xml(rtfpath): return ofile -def generate_html2(rtfpath, logger): - from lxml import etree +def generate_html(rtfpath, logger): logger.info('Converting RTF to XML...') xml = generate_xml(rtfpath) tdir = os.path.dirname(xml) diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index c9468c812e..332d056124 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -171,14 +171,13 @@ class MetaInformation(object): if hasattr(mi, attr): setattr(ans, attr, getattr(mi, attr)) - def __init__(self, title, authors=[_('Unknown')]): ''' @param title: title or "Unknown" or a MetaInformation object @param authors: List of strings or [] ''' mi = None - if isinstance(title, MetaInformation): + if hasattr(title, 'title') and hasattr(title, 
'authors'): mi = title title = mi.title authors = mi.authors @@ -186,26 +185,15 @@ class MetaInformation(object): self.author = authors # Needed for backward compatibility #: List of strings or [] self.authors = authors - #: Sort text for author - self.author_sort = None if not mi else mi.author_sort - self.title_sort = None if not mi else mi.title_sort - self.comments = None if not mi else mi.comments - self.category = None if not mi else mi.category - self.publisher = None if not mi else mi.publisher - self.series = None if not mi else mi.series - self.series_index = None if not mi else mi.series_index - self.rating = None if not mi else mi.rating - self.isbn = None if not mi else mi.isbn - self.tags = [] if not mi else mi.tags - self.language = None if not mi else mi.language # Typically a string describing the language + self.tags = getattr(mi, 'tags', []) #: mi.cover_data = (ext, data) - self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None) - self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None - self.manifest = getattr(mi, 'manifest', None) - self.toc = getattr(mi, 'toc', None) - self.spine = getattr(mi, 'spine', None) - self.guide = getattr(mi, 'guide', None) - self.cover = getattr(mi, 'cover', None) + self.cover_data = getattr(mi, 'cover_data', (None, None)) + + for x in ('author_sort', 'title_sort', 'comments', 'category', 'publisher', + 'series', 'series_index', 'rating', 'isbn', 'language', + 'application_id', 'manifest', 'toc', 'spine', 'guide', 'cover' + ): + setattr(self, x, getattr(mi, x, None)) def smart_update(self, mi): ''' diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py index d63719868a..d0de9cbdcd 100644 --- a/src/calibre/ebooks/metadata/epub.py +++ b/src/calibre/ebooks/metadata/epub.py @@ -6,14 +6,14 @@ __copyright__ = '2008, Kovid Goyal ' '''Read meta information from epub files''' import sys, os - -from calibre.utils.zipfile import 
ZipFile, BadZipfile, safe_replace from cStringIO import StringIO from contextlib import closing + +from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup -from calibre.ebooks.metadata.opf import OPF, OPFReader, OPFCreator from calibre.ebooks.metadata import get_parser, MetaInformation +from calibre.ebooks.metadata.opf2 import OPF class EPubException(Exception): pass @@ -49,14 +49,15 @@ class OCF(object): def __init__(self): raise NotImplementedError('Abstract base class') + class OCFReader(OCF): def __init__(self): try: mimetype = self.open('mimetype').read().rstrip() if mimetype != OCF.MIMETYPE: - raise EPubException - except (KeyError, EPubException): - raise EPubException("not an .epub OCF container") + print 'WARNING: Invalid mimetype declaration', mimetype + except: + print 'WARNING: Epub doesn\'t contain a mimetype declaration' try: with closing(self.open(OCF.CONTAINER_PATH)) as f: @@ -66,37 +67,26 @@ class OCFReader(OCF): try: with closing(self.open(self.container[OPF.MIMETYPE])) as f: - self.opf = OPFReader(f, self.root) + self.opf = OPF(f, self.root) except KeyError: raise EPubException("missing OPF package file") class OCFZipReader(OCFReader): - def __init__(self, stream, mode='r'): + def __init__(self, stream, mode='r', root=None): try: self.archive = ZipFile(stream, mode=mode) except BadZipfile: raise EPubException("not a ZIP .epub OCF container") - self.root = getattr(stream, 'name', os.getcwd()) + self.root = root + if self.root is None: + self.root = os.getcwdu() + if hasattr(stream, 'name'): + self.root = os.path.abspath(os.path.dirname(stream.name)) super(OCFZipReader, self).__init__() def open(self, name, mode='r'): return StringIO(self.archive.read(name)) -class OCFZipWriter(object): - - def __init__(self, stream): - reader = OCFZipReader(stream) - self.opf = reader.container[OPF.MIMETYPE] - self.stream = stream - self.root = getattr(stream, 'name', os.getcwd()) - - def 
set_metadata(self, mi): - stream = StringIO() - opf = OPFCreator(self.root, mi) - opf.render(stream) - stream.seek(0) - safe_replace(self.stream, self.opf, stream) - class OCFDirReader(OCFReader): def __init__(self, path): self.root = path @@ -111,12 +101,17 @@ def get_metadata(stream): return OCFZipReader(stream).opf def set_metadata(stream, mi): - OCFZipWriter(stream).set_metadata(mi) - + reader = OCFZipReader(stream, root=os.getcwdu()) + reader.opf.smart_update(mi) + newopf = StringIO(reader.opf.render()) + safe_replace(stream, reader.container[OPF.MIMETYPE], newopf) + print newopf.getvalue() + def option_parser(): parser = get_parser('epub') parser.remove_option('--category') - parser.add_option('--tags', default=None, help=_('A comma separated list of tags to set')) + parser.add_option('--tags', default=None, + help=_('A comma separated list of tags to set')) return parser def main(args=sys.argv): @@ -126,7 +121,7 @@ def main(args=sys.argv): parser.print_help() return 1 stream = open(args[1], 'r+b') - mi = MetaInformation(OCFZipReader(stream).opf) + mi = MetaInformation(OCFZipReader(stream, root=os.getcwdu()).opf) if opts.title: mi.title = opts.title if opts.authors: @@ -136,8 +131,10 @@ def main(args=sys.argv): if opts.comment: mi.comments = opts.comment + stream.seek(0) set_metadata(stream, mi) print unicode(mi) + stream.close() return 0 if __name__ == '__main__': diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py new file mode 100644 index 0000000000..5a3a74fd89 --- /dev/null +++ b/src/calibre/ebooks/metadata/opf2.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +lxml based OPF parser. 
import sys, unittest, functools, os, re

from lxml import etree

from calibre.ebooks.chardet import xml_to_unicode


class MetadataField(object):
    '''
    Descriptor that maps an attribute of an :class:`OPF` object onto the text
    of a single child element of its ``<metadata>`` element.

    The owning object must provide ``get_metadata_element``, ``get_text`` and
    ``create_metadata_element``.

    @param name:  Local name of the metadata element (e.g. ``'title'``)
    @param is_dc: If True a missing element is created in the Dublin Core
                  namespace, otherwise in the OPF namespace
    '''

    def __init__(self, name, is_dc=True):
        self.name = name
        self.is_dc = is_dc

    def __get__(self, obj, type=None):
        # Accessed on the class rather than an instance: there is no tree to
        # query, so hand back the descriptor itself instead of crashing.
        if obj is None:
            return self
        ans = obj.get_metadata_element(self.name)
        if ans is None:
            return u''
        return obj.get_text(ans)

    def __set__(self, obj, val):
        elem = obj.get_metadata_element(self.name)
        if elem is None:
            # Element does not exist yet, create it in the right namespace
            elem = obj.create_metadata_element(self.name,
                                    ns='dc' if self.is_dc else 'opf')
        elem.text = unicode(val)


class OPF(object):
    '''
    Read/write access to the metadata section of an OPF package document,
    backed by an lxml element tree.

    Simple fields are exposed through :class:`MetadataField` descriptors,
    multi-valued or attribute based fields (``authors``, ``author_sort``,
    ``tags``, ``isbn``) through properties.
    '''
    MIMETYPE   = 'application/oebps-package+xml'
    # recover=True lets lxml parse slightly broken documents
    PARSER     = etree.XMLParser(recover=True)
    NAMESPACES = {
                  None  : "http://www.idpf.org/2007/opf",
                  'dc'  : "http://purl.org/dc/elements/1.1/",
                  'opf' : "http://www.idpf.org/2007/opf",
                 }
    # XPath cannot use a default (None) prefix, so build a separate prefix map
    xpn = NAMESPACES.copy()
    xpn.pop(None)
    xpn['re'] = 'http://exslt.org/regular-expressions'
    XPath = functools.partial(etree.XPath, namespaces=xpn)
    TEXT  = XPath('string()')

    # lxml addresses namespaced attributes in Clark notation ({uri}name),
    # a 'prefix:name' string is never resolved by Element.get()/set()
    FILE_AS = '{%s}file-as'%NAMESPACES['opf']

    metadata_path      = XPath('/opf:package/opf:metadata')
    metadata_elem_path = XPath('/opf:package/opf:metadata/*[re:match(name(), $name, "i")]')
    authors_path       = XPath('/opf:package/opf:metadata/*' + \
        '[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut")]')
    tags_path          = XPath('/opf:package/opf:metadata/*[re:match(name(), "subject", "i")]')
    isbn_path          = XPath('/opf:package/opf:metadata/*[re:match(name(), "identifier", "i") and '+
        '(re:match(@scheme, "isbn", "i") or re:match(@opf:scheme, "isbn", "i"))]')

    title        = MetadataField('title')
    publisher    = MetadataField('publisher')
    language     = MetadataField('language')
    comments     = MetadataField('description')
    category     = MetadataField('category')
    series       = MetadataField('series', is_dc=False)
    series_index = MetadataField('series_index', is_dc=False)
    rating       = MetadataField('rating', is_dc=False)

    def __init__(self, stream, basedir):
        '''
        @param stream:  File like object from which the raw OPF is read
        @param basedir: Stored as ``self.basedir``; not otherwise used here
        @raise ValueError: If the document has no root or no metadata element
        '''
        self.basedir = basedir
        raw, self.encoding = xml_to_unicode(stream.read(),
                        strip_encoding_pats=True, resolve_entities=True)

        self.tree = etree.fromstring(raw, self.PARSER)
        if self.tree is None:
            # A recovering parser can give up without producing a root
            raise ValueError('Malformed OPF file: Could not parse document')
        matches = self.metadata_path(self.tree)
        if not matches:
            raise ValueError('Malformed OPF file: No <metadata> element')
        self.metadata = matches[0]

    def get_text(self, elem):
        '''Return the concatenated text content of `elem` as unicode.'''
        return u''.join(self.TEXT(elem))

    # authors ------------------------------------------------------------------
    def _get_authors(self):
        ans = []
        for elem in self.authors_path(self.tree):
            # A single creator element may hold a comma separated list
            ans.extend([x.strip() for x in self.get_text(elem).split(',')])
        return ans

    def _set_authors(self, val):
        # Replace all existing author elements with one element per author
        for elem in list(self.authors_path(self.tree)):
            self.metadata.remove(elem)
        for author in val:
            elem = self.create_metadata_element('creator', ns='dc',
                      attrib={'{%s}role'%self.NAMESPACES['opf']:'aut'})
            elem.text = author

    #: List of author names
    authors = property(fget=_get_authors, fset=_set_authors)

    # author_sort --------------------------------------------------------------
    def _get_author_sort(self):
        matches = self.authors_path(self.tree)
        if matches:
            # Prefer the namespaced attribute, fall back to the plain one
            ans = matches[0].get(self.FILE_AS, None)
            return ans if ans else matches[0].get('file-as', None)

    def _set_author_sort(self, val):
        matches = self.authors_path(self.tree)
        if matches:
            # Write to the attribute the getter reads first, otherwise a
            # pre-existing opf:file-as would shadow the new value
            key = self.FILE_AS if self.FILE_AS in matches[0].attrib else 'file-as'
            matches[0].set(key, unicode(val))

    #: file-as value of the first author element, None if unavailable.
    #: Setting it is a no-op when the document has no author element.
    author_sort = property(fget=_get_author_sort, fset=_set_author_sort)

    # tags ---------------------------------------------------------------------
    def _get_tags(self):
        return [self.get_text(tag) for tag in self.tags_path(self.tree)]

    def _set_tags(self, val):
        for tag in list(self.tags_path(self.tree)):
            self.metadata.remove(tag)
        for tag in val:
            elem = self.create_metadata_element('subject', ns='dc')
            elem.text = unicode(tag)

    #: List of the text of every subject element
    tags = property(fget=_get_tags, fset=_set_tags)

    # isbn ---------------------------------------------------------------------
    def _get_isbn(self):
        # Only the first identifier with an ISBN scheme is considered
        for match in self.isbn_path(self.tree):
            return match.text if match.text else None
        return None

    def _set_isbn(self, val):
        matches = self.isbn_path(self.tree)
        if not matches:
            # The local name and the namespace must be passed separately,
            # '{uri}dc:identifier' is not a valid tag name for lxml
            matches = [self.create_metadata_element('identifier', ns='dc',
                        attrib={'{%s}scheme'%self.NAMESPACES['opf']:'ISBN'})]
        matches[0].text = unicode(val)

    #: Text of the first ISBN identifier element or None
    isbn = property(fget=_get_isbn, fset=_set_isbn)

    def get_metadata_element(self, name):
        '''
        Return the first child of the metadata element whose local name is
        `name` (case insensitive), or None if there is no such element.
        '''
        # name() yields the qualified name (e.g. dc:title). Anchor the pattern
        # so that 'series' cannot match a 'series_index' element.
        pat = '(?:^|:)%s$'%re.escape(name)
        matches = self.metadata_elem_path(self.tree, name=pat)
        if matches:
            return matches[0]
        return None

    def create_metadata_element(self, name, attrib=None, ns='opf'):
        '''
        Append a new, empty element to the metadata element and return it.

        @param name:   Local name of the element (no prefix)
        @param attrib: Optional dictionary of attributes
        @param ns:     Key into `NAMESPACES` selecting the element namespace
        '''
        elem = etree.SubElement(self.metadata, '{%s}%s'%(self.NAMESPACES[ns], name),
                                attrib=attrib, nsmap=self.NAMESPACES)
        elem.tail = '\n'
        return elem

    def render(self, encoding='utf-8'):
        '''Serialize the whole tree, pretty printed, as a `encoding` byte string.'''
        return etree.tostring(self.tree, encoding=encoding, pretty_print=True)

    def smart_update(self, mi):
        '''
        Copy every non empty field of `mi` onto this object. An empty list is
        copied as well, so it clears the corresponding field.
        '''
        # NOTE(review): title and authors are not in this list, so changes to
        # them on `mi` are not written to the OPF -- confirm this is intended.
        for attr in ('author_sort', 'title_sort', 'comments', 'category',
                     'publisher', 'series', 'series_index', 'rating',
                     'isbn', 'language', 'tags'):
            val = getattr(mi, attr, None)
            if val or val == []:
                setattr(self, attr, val)


class OPFTest(unittest.TestCase):
    '''Round trip tests for reading and writing metadata through OPF.'''

    def setUp(self):
        import cStringIO
        self.stream = cStringIO.StringIO(
'''\
<?xml version="1.0"  encoding="UTF-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" >
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
    <dc:title>A Cool &amp; &copy; &#223; Title</dc:title>
    <creator opf:role="aut" file-as="Monkey">Monkey Kitchen, Next</creator>
    <dc:subject>One</dc:subject><dc:subject>Two</dc:subject>
    <dc:identifier scheme="ISBN">123456789</dc:identifier>
</metadata>
</package>
'''
        )
        self.opf = OPF(self.stream, os.getcwd())

    def testReading(self):
        opf = self.opf
        self.assertEqual(opf.title, u'A Cool & \xa9 \xdf Title')
        self.assertEqual(opf.authors, u'Monkey Kitchen,Next'.split(','))
        self.assertEqual(opf.author_sort, 'Monkey')
        self.assertEqual(opf.tags, ['One', 'Two'])
        self.assertEqual(opf.isbn, '123456789')

    def testWriting(self):
        # Every value written must be read back unchanged
        for test in [('title', 'New & Title'), ('authors', ['One', 'Two']),
                     ('author_sort', "Kitchen"), ('tags', ['Three']),
                     ('isbn', 'a'), ('rating', '3')]:
            setattr(self.opf, *test)
            self.assertEqual(getattr(self.opf, test[0]), test[1])

        self.opf.render()


def suite():
    '''Return the test suite for this module.'''
    return unittest.TestLoader().loadTestsFromTestCase(OPFTest)


def test():
    '''Run the test suite; return 0 if it passed and 1 otherwise.'''
    result = unittest.TextTestRunner(verbosity=2).run(suite())
    return 0 if result.wasSuccessful() else 1


def main(args=sys.argv):
    return 0


if __name__ == '__main__':
    sys.exit(test())