From 1546c828070a3824debd9bbd8fec2b29774c48b0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 7 Jan 2008 18:04:44 +0000
Subject: [PATCH] Initial implementation of epub2lrf and epub-meta (thanks
 llasram)

---
 setup.py                                      |   4 +-
 src/libprs500/ebooks/lrf/epub/__init__.py     |  15 +++
 src/libprs500/ebooks/lrf/epub/convert_from.py |  85 +++++++++++++
 src/libprs500/ebooks/metadata/epub.py         | 120 ++++++++++++++++++
 src/libprs500/ebooks/metadata/opf.py          |   1 +
 src/libprs500/linux.py                        |   2 +
 6 files changed, 226 insertions(+), 1 deletion(-)
 create mode 100644 src/libprs500/ebooks/lrf/epub/__init__.py
 create mode 100644 src/libprs500/ebooks/lrf/epub/convert_from.py
 create mode 100644 src/libprs500/ebooks/metadata/epub.py

diff --git a/setup.py b/setup.py
index f95b43de32..008f1c301f 100644
--- a/setup.py
+++ b/setup.py
@@ -25,10 +25,12 @@ entry_points = {
                              'rtf-meta  = libprs500.ebooks.metadata.rtf:main', 
                              'pdf-meta  = libprs500.ebooks.metadata.pdf:main', 
                              'lit-meta  = libprs500.ebooks.metadata.lit:main',
+                             'epub-meta = libprs500.ebooks.metadata.epub:main',
                              'txt2lrf   = libprs500.ebooks.lrf.txt.convert_from:main', 
                              'html2lrf  = libprs500.ebooks.lrf.html.convert_from:main',
                              'markdown-libprs500  = libprs500.ebooks.markdown.markdown:main',
                              'lit2lrf   = libprs500.ebooks.lrf.lit.convert_from:main',
+                             'epub2lrf  = libprs500.ebooks.lrf.epub.convert_from:main',
                              'rtf2lrf   = libprs500.ebooks.lrf.rtf.convert_from:main',
                              'web2disk  = libprs500.web.fetch.simple:main',
                              'web2lrf   = libprs500.ebooks.lrf.web.convert_from:main',
@@ -131,4 +133,4 @@ if __name__ == '__main__':
          )
     
     if 'develop' in ' '.join(sys.argv) and islinux:
-        subprocess.check_call('libprs500_postinstall', shell=True)
\ No newline at end of file
+        subprocess.check_call('libprs500_postinstall', shell=True)
diff --git a/src/libprs500/ebooks/lrf/epub/__init__.py b/src/libprs500/ebooks/lrf/epub/__init__.py
new file mode 100644
index 0000000000..97ad144cc4
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/epub/__init__.py
@@ -0,0 +1,15 @@
+##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
diff --git a/src/libprs500/ebooks/lrf/epub/convert_from.py b/src/libprs500/ebooks/lrf/epub/convert_from.py
new file mode 100644
index 0000000000..708e0d2a40
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/epub/convert_from.py
@@ -0,0 +1,85 @@
+##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+import os, sys, shutil, glob, logging
+from tempfile import mkdtemp
+from subprocess import Popen, PIPE
+from libprs500.ebooks.lrf import option_parser as lrf_option_parser
+from libprs500.ebooks import ConversionError
+from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
+from libprs500.ebooks.metadata.opf import OPFReader, OPF
+from libprs500.ebooks.metadata.epub import OCFDirReader
+from libprs500.libunzip import extract as zip_extract
+from libprs500 import isosx, __appname__, setup_cli_handlers, iswindows
+
+
+def option_parser():
+    return lrf_option_parser(
+        '''Usage: %prog [options] mybook.epub\n\n'''
+        '''%prog converts mybook.epub to mybook.lrf'''
+        )
+
+def generate_html(pathtoepub, logger):
+    if not os.access(pathtoepub, os.R_OK):
+        raise ConversionError, 'Cannot read from ' + pathtoepub
+    tdir = mkdtemp(prefix=__appname__+'_')
+    os.rmdir(tdir)
+    try:
+        zip_extract(pathtoepub, tdir)
+    except:
+        if os.path.exists(tdir) and os.path.isdir(tdir):
+            shutil.rmtree(tdir)        
+        raise ConversionError, '.epub extraction failed'
+    return tdir
+
+def process_file(path, options, logger=None):
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('epub2lrf')
+        setup_cli_handlers(logger, level)
+    epub = os.path.abspath(os.path.expanduser(path))
+    tdir = generate_html(epub, logger)
+    try:
+        ocf = OCFDirReader(tdir)
+        htmlfile = ocf.opf.spine.items().next().href
+        options.opf = os.path.join(tdir, ocf.container[OPF.MIMETYPE])
+        if not options.output:
+            ext = '.lrs' if options.lrs else '.lrf'
+            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
+        options.output = os.path.abspath(os.path.expanduser(options.output))
+        options.use_spine = True
+        
+        html_process_file(htmlfile, options, logger=logger)
+    finally:
+        try:
+            shutil.rmtree(tdir)
+        except:
+            logger.warning('Failed to delete temporary directory '+tdir)
+
+
+def main(args=sys.argv, logger=None):
+    parser = option_parser()
+    options, args = parser.parse_args(args)
+    if len(args) != 2:            
+        parser.print_help()
+        print
+        print 'No epub file specified'
+        return 1
+    process_file(args[1], options, logger)
+    return 0        
+        
+            
+if __name__ == '__main__':  # run as a script: main()'s return value is the exit status
+    sys.exit(main())
diff --git a/src/libprs500/ebooks/metadata/epub.py b/src/libprs500/ebooks/metadata/epub.py
new file mode 100644
index 0000000000..4727ae5ca1
--- /dev/null
+++ b/src/libprs500/ebooks/metadata/epub.py
@@ -0,0 +1,120 @@
+##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''Read meta information from EPUB files'''
+
+from __future__ import with_statement
+
+import sys, os
+
+from zipfile import ZipFile, BadZipfile
+from cStringIO import StringIO
+from contextlib import closing
+
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+from libprs500.ebooks.metadata import MetaInformation
+from libprs500.ebooks.metadata.opf import OPF, OPFReader
+
+
+class EPubException(Exception):
+    pass
+
+class OCFException(EPubException):
+    pass
+
+class ContainerException(OCFException):
+    pass
+
+class Container(dict):
+    def __init__(self, stream=None):
+        if not stream: return
+        soup = BeautifulStoneSoup(stream.read())
+        container = soup.find('container')
+        if not container:
+            raise OCFException("<container/> element missing")
+        if container.get('version', None) != '1.0':
+            raise EPubException("unsupported version of OCF")
+        rootfiles = container.find('rootfiles')
+        if not rootfiles:
+            raise EPubException("<rootfiles/> element missing")
+        for rootfile in rootfiles.findAll('rootfile'):
+            try:
+                self[rootfile['media-type']] = rootfile['full-path']
+            except KeyError:
+                raise EPubException("<rootfile/> element malformed")
+
+class OCF(object):
+    MIMETYPE = 'application/epub+zip'
+    CONTAINER_PATH = 'META-INF/container.xml'
+    
+    def __init__(self):
+        raise NotImplementedError('Abstract base class')
+
+class OCFReader(OCF):
+    def __init__(self):
+        try:
+            mimetype = self.open('mimetype').read().rstrip()
+            if mimetype != OCF.MIMETYPE:
+                raise EPubException
+        except (KeyError, EPubException):
+            raise EPubException("not an .epub OCF container")
+
+        try:
+            with closing(self.open(OCF.CONTAINER_PATH)) as f:
+                self.container = Container(f)
+        except KeyError:
+            raise EPubException("missing OCF container.xml file")
+
+        try:
+            with closing(self.open(self.container[OPF.MIMETYPE])) as f:
+                self.opf = OPFReader(f, self.root)
+        except KeyError:
+            raise EPubException("missing OPF package file")
+
+class OCFZipReader(OCFReader):
+    def __init__(self, stream):
+        try:
+            self.archive = ZipFile(stream, 'r')
+        except BadZipfile:
+            raise EPubException("not a ZIP .epub OCF container")
+        self.root = getattr(stream, 'name', os.getcwd())
+        super(OCFZipReader, self).__init__()
+
+    def open(self, name, mode='r'):
+        return StringIO(self.archive.read(name))
+
+class OCFDirReader(OCFReader):
+    def __init__(self, path):
+        self.root = path
+        super(OCFDirReader, self).__init__()
+        
+    def open(self, path, *args, **kwargs):
+        return open(os.path.join(self.root, path), *args, **kwargs)
+    
+    
+def get_metadata(stream):
+    """ Return the L{OPFReader} parsed from the epub ZIP readable via C{stream} """
+    return OCFZipReader(stream).opf
+
+def main(args=sys.argv):
+    if len(args) != 2 or '--help' in args or '-help' in args:
+        print >>sys.stderr, 'Usage: epub-meta FILE'
+        return 1
+    
+    path = os.path.abspath(os.path.expanduser(args[1]))
+    print unicode(get_metadata(open(path, 'rb')))
+    return 0
+
+if __name__ == '__main__':  # run as a script: main()'s return value is the exit status
+    sys.exit(main())
diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py
index f87c6cdda0..7f0eee751a 100644
--- a/src/libprs500/ebooks/metadata/opf.py
+++ b/src/libprs500/ebooks/metadata/opf.py
@@ -120,6 +120,7 @@ class standard_field(object):
         
 class OPF(MetaInformation):
     
+    MIMETYPE = 'application/oebps-package+xml'
     ENTITY_PATTERN = re.compile(r'&(\S+?);')
     
     libprs_id     = standard_field('libprs_id')
diff --git a/src/libprs500/linux.py b/src/libprs500/linux.py
index 95336bd07c..af8790a9e8 100644
--- a/src/libprs500/linux.py
+++ b/src/libprs500/linux.py
@@ -88,6 +88,7 @@ def setup_completion():
                               ['htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', 'php']))
         f.write(opts_and_exts('txt2lrf', txtop, ['txt']))
         f.write(opts_and_exts('lit2lrf', htmlop, ['lit']))
+        f.write(opts_and_exts('epub2lrf', htmlop, ['epub']))
         f.write(opts_and_exts('rtf2lrf', htmlop, ['rtf']))
         f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf']))
         f.write(opts_and_exts('any2lrf', htmlop, 
@@ -97,6 +98,7 @@ def setup_completion():
         f.write(opts_and_exts('rtf-meta', metaop, ['rtf']))
         f.write(opts_and_exts('pdf-meta', metaop, ['pdf']))
         f.write(opts_and_exts('lit-meta', metaop, ['lit']))
+        f.write(opts_and_exts('epub-meta', metaop, ['epub']))
         f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
         f.write('''
 _prs500_ls()