From 1546c828070a3824debd9bbd8fec2b29774c48b0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 7 Jan 2008 18:04:44 +0000 Subject: [PATCH] Initial implementation of epub2lrf and epub-meta (thanks llasram) --- setup.py | 4 +- src/libprs500/ebooks/lrf/epub/__init__.py | 15 +++ src/libprs500/ebooks/lrf/epub/convert_from.py | 85 +++++++++++++ src/libprs500/ebooks/metadata/epub.py | 120 ++++++++++++++++++ src/libprs500/ebooks/metadata/opf.py | 1 + src/libprs500/linux.py | 2 + 6 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 src/libprs500/ebooks/lrf/epub/__init__.py create mode 100644 src/libprs500/ebooks/lrf/epub/convert_from.py create mode 100644 src/libprs500/ebooks/metadata/epub.py diff --git a/setup.py b/setup.py index f95b43de32..008f1c301f 100644 --- a/setup.py +++ b/setup.py @@ -25,10 +25,12 @@ entry_points = { 'rtf-meta = libprs500.ebooks.metadata.rtf:main', 'pdf-meta = libprs500.ebooks.metadata.pdf:main', 'lit-meta = libprs500.ebooks.metadata.lit:main', + 'epub-meta = libprs500.ebooks.metadata.epub:main', 'txt2lrf = libprs500.ebooks.lrf.txt.convert_from:main', 'html2lrf = libprs500.ebooks.lrf.html.convert_from:main', 'markdown-libprs500 = libprs500.ebooks.markdown.markdown:main', 'lit2lrf = libprs500.ebooks.lrf.lit.convert_from:main', + 'epub2lrf = libprs500.ebooks.lrf.epub.convert_from:main', 'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main', 'web2disk = libprs500.web.fetch.simple:main', 'web2lrf = libprs500.ebooks.lrf.web.convert_from:main', @@ -131,4 +133,4 @@ if __name__ == '__main__': ) if 'develop' in ' '.join(sys.argv) and islinux: - subprocess.check_call('libprs500_postinstall', shell=True) \ No newline at end of file + subprocess.check_call('libprs500_postinstall', shell=True) diff --git a/src/libprs500/ebooks/lrf/epub/__init__.py b/src/libprs500/ebooks/lrf/epub/__init__.py new file mode 100644 index 0000000000..97ad144cc4 --- /dev/null +++ b/src/libprs500/ebooks/lrf/epub/__init__.py @@ -0,0 +1,15 @@ +## Copyright (C) 2007 
Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + diff --git a/src/libprs500/ebooks/lrf/epub/convert_from.py b/src/libprs500/ebooks/lrf/epub/convert_from.py new file mode 100644 index 0000000000..708e0d2a40 --- /dev/null +++ b/src/libprs500/ebooks/lrf/epub/convert_from.py @@ -0,0 +1,85 @@ +## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +import os, sys, shutil, glob, logging +from tempfile import mkdtemp +from subprocess import Popen, PIPE +from libprs500.ebooks.lrf import option_parser as lrf_option_parser +from libprs500.ebooks import ConversionError +from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file +from libprs500.ebooks.metadata.opf import OPFReader, OPF +from libprs500.ebooks.metadata.epub import OCFDirReader +from libprs500.libunzip import extract as zip_extract +from libprs500 import isosx, __appname__, setup_cli_handlers, iswindows + + +def option_parser(): + return lrf_option_parser( + '''Usage: %prog [options] mybook.epub\n\n''' + '''%prog converts mybook.epub to mybook.lrf''' + ) + +def generate_html(pathtoepub, logger): + if not os.access(pathtoepub, os.R_OK): + raise ConversionError, 'Cannot read from ' + pathtoepub + tdir = mkdtemp(prefix=__appname__+'_') + os.rmdir(tdir) + try: + zip_extract(pathtoepub, tdir) + except: + if os.path.exists(tdir) and os.path.isdir(tdir): + shutil.rmtree(tdir) + raise ConversionError, '.epub extraction failed' + return tdir + +def process_file(path, options, logger=None): + if logger is None: + level = logging.DEBUG if options.verbose else logging.INFO + logger = logging.getLogger('epub2lrf') + setup_cli_handlers(logger, level) + epub = os.path.abspath(os.path.expanduser(path)) + tdir = generate_html(epub, logger) + try: + ocf = OCFDirReader(tdir) + htmlfile = ocf.opf.spine.items().next().href + options.opf = os.path.join(tdir, ocf.container[OPF.MIMETYPE]) + if not options.output: + ext = '.lrs' if options.lrs else '.lrf' + options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) + options.output = os.path.abspath(os.path.expanduser(options.output)) + options.use_spine = True + + html_process_file(htmlfile, options, logger=logger) + finally: + try: + shutil.rmtree(tdir) + except: + logger.warning('Failed to delete temporary directory '+tdir) + + +def main(args=sys.argv, logger=None): + 
parser = option_parser() + options, args = parser.parse_args(args) + if len(args) != 2: + parser.print_help() + print + print 'No epub file specified' + return 1 + process_file(args[1], options, logger) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/libprs500/ebooks/metadata/epub.py b/src/libprs500/ebooks/metadata/epub.py new file mode 100644 index 0000000000..4727ae5ca1 --- /dev/null +++ b/src/libprs500/ebooks/metadata/epub.py @@ -0,0 +1,120 @@ +## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+'''Read meta information from epub files''' + +from __future__ import with_statement + +import sys, os + +from zipfile import ZipFile, BadZipfile +from cStringIO import StringIO +from contextlib import closing + +from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup +from libprs500.ebooks.metadata import MetaInformation +from libprs500.ebooks.metadata.opf import OPF, OPFReader + + +class EPubException(Exception): + pass + +class OCFException(EPubException): + pass + +class ContainerException(OCFException): + pass + +class Container(dict): + def __init__(self, stream=None): + if not stream: return + soup = BeautifulStoneSoup(stream.read()) + container = soup.find('container') + if not container: + raise OCFException("<container/> element missing") + if container.get('version', None) != '1.0': + raise EPubException("unsupported version of OCF") + rootfiles = container.find('rootfiles') + if not rootfiles: + raise EPubException("<rootfiles/> element missing") + for rootfile in rootfiles.findAll('rootfile'): + try: + self[rootfile['media-type']] = rootfile['full-path'] + except KeyError: + raise EPubException("<rootfile/> element malformed") + +class OCF(object): + MIMETYPE = 'application/epub+zip' + CONTAINER_PATH = 'META-INF/container.xml' + + def __init__(self): + raise NotImplementedError('Abstract base class') + +class OCFReader(OCF): + def __init__(self): + try: + mimetype = self.open('mimetype').read().rstrip() + if mimetype != OCF.MIMETYPE: + raise EPubException + except (KeyError, EPubException): + raise EPubException("not an .epub OCF container") + + try: + with closing(self.open(OCF.CONTAINER_PATH)) as f: + self.container = Container(f) + except KeyError: + raise EPubException("missing OCF container.xml file") + + try: + with closing(self.open(self.container[OPF.MIMETYPE])) as f: + self.opf = OPFReader(f, self.root) + except KeyError: + raise EPubException("missing OPF package file") + +class OCFZipReader(OCFReader): + def __init__(self, stream): + try: + self.archive =
ZipFile(stream, 'r') + except BadZipfile: + raise EPubException("not a ZIP .epub OCF container") + self.root = getattr(stream, 'name', os.getcwd()) + super(OCFZipReader, self).__init__() + + def open(self, name, mode='r'): + return StringIO(self.archive.read(name)) + +class OCFDirReader(OCFReader): + def __init__(self, path): + self.root = path + super(OCFDirReader, self).__init__() + + def open(self, path, *args, **kwargs): + return open(os.path.join(self.root, path), *args, **kwargs) + + +def get_metadata(stream): + """ Return metadata as a L{MetaInfo} object """ + return OCFZipReader(stream).opf + +def main(args=sys.argv): + if len(args) != 2 or '--help' in args or '-help' in args: + print >>sys.stderr, 'Usage: epub-meta FILE' + return 1 + + path = os.path.abspath(os.path.expanduser(args[1])) + print unicode(get_metadata(open(path, 'rb'))) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py index f87c6cdda0..7f0eee751a 100644 --- a/src/libprs500/ebooks/metadata/opf.py +++ b/src/libprs500/ebooks/metadata/opf.py @@ -120,6 +120,7 @@ class standard_field(object): class OPF(MetaInformation): + MIMETYPE = 'application/oebps-package+xml' ENTITY_PATTERN = re.compile(r'&(\S+?);') libprs_id = standard_field('libprs_id') diff --git a/src/libprs500/linux.py b/src/libprs500/linux.py index 95336bd07c..af8790a9e8 100644 --- a/src/libprs500/linux.py +++ b/src/libprs500/linux.py @@ -88,6 +88,7 @@ def setup_completion(): ['htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', 'php'])) f.write(opts_and_exts('txt2lrf', txtop, ['txt'])) f.write(opts_and_exts('lit2lrf', htmlop, ['lit'])) + f.write(opts_and_exts('epub2lrf', htmlop, ['epub'])) f.write(opts_and_exts('rtf2lrf', htmlop, ['rtf'])) f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf'])) f.write(opts_and_exts('any2lrf', htmlop, @@ -97,6 +98,7 @@ def setup_completion(): f.write(opts_and_exts('rtf-meta', metaop, ['rtf'])) 
f.write(opts_and_exts('pdf-meta', metaop, ['pdf'])) f.write(opts_and_exts('lit-meta', metaop, ['lit'])) + f.write(opts_and_exts('epub-meta', metaop, ['epub'])) f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf'])) f.write(''' _prs500_ls()