diff --git a/setup.py b/setup.py index 7cd2198079..a1b4499980 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ entry_points = { 'web2disk = libprs500.web.fetch.simple:main', 'web2lrf = libprs500.ebooks.lrf.web.convert_from:main', 'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main', + 'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main', 'any2lrf = libprs500.ebooks.lrf.any.convert_from:main', 'lrf2lrs = libprs500.ebooks.lrf.parser:main', 'lrs2lrf = libprs500.ebooks.lrf.lrs.convert_from:main', diff --git a/src/libprs500/ebooks/lrf/any/convert_from.py b/src/libprs500/ebooks/lrf/any/convert_from.py index daea5c5e2f..d9761252dd 100644 --- a/src/libprs500/ebooks/lrf/any/convert_from.py +++ b/src/libprs500/ebooks/lrf/any/convert_from.py @@ -25,6 +25,7 @@ from libprs500.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf from libprs500.ebooks.lrf.txt.convert_from import process_file as txt2lrf from libprs500.ebooks.lrf.html.convert_from import process_file as html2lrf from libprs500.ebooks.lrf.epub.convert_from import process_file as epub2lrf +from libprs500.ebooks.lrf.mobi.convert_from import process_file as mobi2lrf def largest_file(files): maxsize, file = 0, None @@ -81,7 +82,7 @@ def handle_archive(path): files = [] cdir = traverse_subdirs(tdir) file = None - for ext in ('lit', 'rtf', 'pdf', 'txt', 'epub'): + for ext in ('lit', 'rtf', 'pdf', 'txt', 'epub', 'mobi', 'prc'): pat = os.path.join(cdir, '*.'+ext) files.extend(glob.glob(pat)) file = largest_file(files) @@ -135,6 +136,8 @@ def process_file(path, options, logger=None): convertor = txt2lrf elif 'epub' == ext: convertor = epub2lrf + elif ext in ['mobi', 'prc']: + convertor = mobi2lrf if not convertor: raise UnknownFormatError('Coverting from %s to LRF is not supported.') convertor(path, options, logger) @@ -150,7 +153,7 @@ def main(args=sys.argv, logger=None, gui_mode=False): any2lrf myfile Convert any ebook format into LRF. Supported formats are: -LIT, RTF, TXT, HTML, EPUB and PDF. any2lrf will also process a RAR or +LIT, RTF, TXT, HTML, EPUB, MOBI, PRC and PDF. any2lrf will also process a RAR or ZIP archive. ''', gui_mode=gui_mode) options, args = parser.parse_args(args) diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 9f9203e617..667dab0c21 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -1740,9 +1740,9 @@ def process_file(path, options, logger=None): re.compile(fpba[2], re.IGNORECASE)] if not hasattr(options, 'anchor_ids'): options.anchor_ids = True - files = options.spine if options.use_spine else [path] + files = options.spine if (options.use_spine and hasattr(options, 'spine')) else [path] conv = HTMLConverter(book, fonts, options, logger, files) - if options.use_spine: + if options.use_spine and hasattr(options, 'toc'): conv.create_toc(options.toc) oname = options.output if not oname: diff --git a/src/libprs500/ebooks/lrf/lrs/convert_from.py b/src/libprs500/ebooks/lrf/lrs/convert_from.py index 6fec9b508e..bafa18e540 100644 --- a/src/libprs500/ebooks/lrf/lrs/convert_from.py +++ b/src/libprs500/ebooks/lrf/lrs/convert_from.py @@ -20,7 +20,7 @@ import sys, os, logging from libprs500 import __author__, __appname__, __version__, setup_cli_handlers from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \ - CData, Tag + CData, Tag, UnicodeDammit from libprs500.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \ BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \ Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \ @@ -38,7 +38,8 @@ class LrsParser(object): def __init__(self, stream, logger): self.logger = logger src = stream.read() - self.soup = BeautifulStoneSoup(src, selfClosingTags=self.SELF_CLOSING_TAGS) + self.soup = BeautifulStoneSoup(UnicodeDammit(src).unicode, + selfClosingTags=self.SELF_CLOSING_TAGS) self.objects = {} for obj in self.soup.findAll(objid=True): self.objects[obj['objid']] = obj diff --git a/src/libprs500/ebooks/lrf/mobi/__init__.py b/src/libprs500/ebooks/lrf/mobi/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/libprs500/ebooks/lrf/mobi/convert_from.py b/src/libprs500/ebooks/lrf/mobi/convert_from.py new file mode 100644 index 0000000000..f8e4468a94 --- /dev/null +++ b/src/libprs500/ebooks/lrf/mobi/convert_from.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +'''''' + +import sys, tempfile, os, logging, shutil + +from libprs500 import setup_cli_handlers, __appname__ +from libprs500.ebooks.mobi.reader import MobiReader +from libprs500.ebooks.lrf import option_parser as lrf_option_parser +from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file + +def generate_html(mobifile, tdir): + mr = MobiReader(mobifile) + mr.extract_content(tdir) + return mr.htmlfile + +def process_file(path, options, logger=None): + if logger is None: + level = logging.DEBUG if options.verbose else logging.INFO + logger = logging.getLogger('lit2lrf') + setup_cli_handlers(logger, level) + mobi = os.path.abspath(os.path.expanduser(path)) + tdir = tempfile.mkdtemp('mobi2lrf', __appname__) + try: + htmlfile = generate_html(mobi, tdir) + if not options.output: + ext = '.lrs' if options.lrs else '.lrf' + options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) + options.output = os.path.abspath(os.path.expanduser(options.output)) + options.use_spine = True + html_process_file(htmlfile, options, logger=logger) + finally: + try: + shutil.rmtree(tdir) + except: + logger.warning('Failed to delete temporary directory '+tdir) + +def option_parser(): + return lrf_option_parser( + '''Usage: %prog [options] mybook.mobi|prc\n\n''' + '''%prog converts mybook.mobi to mybook.lrf''' + ) + + +def main(args=sys.argv, logger=None): + parser = option_parser() + options, args = parser.parse_args(args) + if len(args) != 2: + parser.print_help() + print + print 'No mobi file specified' + return 1 + process_file(args[1], options, logger) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/src/libprs500/ebooks/metadata/meta.py b/src/libprs500/ebooks/metadata/meta.py index aa9a355ab5..ebb4b0d527 100644 --- a/src/libprs500/ebooks/metadata/meta.py +++ b/src/libprs500/ebooks/metadata/meta.py @@ -21,6 +21,7 @@ from libprs500.ebooks.metadata.pdf import get_metadata as pdf_metadata from libprs500.ebooks.metadata.lit import get_metadata as lit_metadata from libprs500.ebooks.metadata.epub import get_metadata as epub_metadata from libprs500.ebooks.metadata.html import get_metadata as html_metadata +from libprs500.ebooks.mobi.reader import get_metadata as mobi_metadata from libprs500.ebooks.metadata.opf import OPFReader from libprs500.ebooks.metadata.rtf import set_metadata as set_rtf_metadata from libprs500.ebooks.lrf.meta import set_metadata as set_lrf_metadata @@ -31,6 +32,8 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False): if stream_type: stream_type = stream_type.lower() if stream_type in ('html', 'html', 'xhtml', 'xhtm'): stream_type = 'html' + if stream_type in ('mobi', 'prc'): + stream_type = 'mobi' if use_libprs_metadata and hasattr(stream, 'name'): mi = libprs_metadata(stream.name) if mi is not None: @@ -102,7 +105,10 @@ def metadata_from_filename(name, pat=None): return mi def libprs_metadata(name): + if os.path.basename(name) != 'metadata.opf': + name = os.path.join(os.path.dirname(name), 'metadata.opf') if os.access(name, os.R_OK): + print name name = os.path.abspath(name) f = open(name, 'rb') opf = OPFReader(f, os.path.dirname(name)) diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py index 69631a228d..05283b211a 100644 --- a/src/libprs500/ebooks/metadata/opf.py +++ b/src/libprs500/ebooks/metadata/opf.py @@ -558,6 +558,7 @@ class OPFCreator(OPF): self.tags = mi.tags if mi.isbn: self.isbn = mi.isbn + self.cover_data = mi.cover_data if hasattr(mi, 'libprs_id'): self.libprs_id = mi.libprs_id if hasattr(mi, 'uid'): diff --git a/src/libprs500/ebooks/mobi/reader.py b/src/libprs500/ebooks/mobi/reader.py index b889f39528..cd878a88bf 100644 --- a/src/libprs500/ebooks/mobi/reader.py +++ b/src/libprs500/ebooks/mobi/reader.py @@ -1,6 +1,5 @@ #!/usr/bin/env python # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai - ## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -19,13 +18,14 @@ Read data from .mobi files ''' -import sys, struct, os, cStringIO, re +import sys, struct, os, cStringIO, re, atexit, shutil, tempfile try: from PIL import Image as PILImage except ImportError: import Image as PILImage +from libprs500 import __appname__ from libprs500.ebooks.mobi import MobiError from libprs500.ebooks.mobi.huffcdic import HuffReader from libprs500.ebooks.mobi.palmdoc import decompress_doc @@ -57,9 +57,9 @@ class EXTHHeader(object): elif id == 202: self.thumbnail_offset, = struct.unpack('>L', content) pos += 3 - stop = raw.find('\x00') + stop = raw[pos:].find('\x00') if stop > -1: - self.mi.title = raw[pos:stop].decode(codec, 'ignore') + self.mi.title = raw[pos:pos+stop].decode(codec, 'ignore') def process_metadata(self, id, content, codec): @@ -161,8 +161,42 @@ class MobiReader(object): def extract_content(self, output_dir=os.getcwdu()): + output_dir = os.path.abspath(output_dir) if self.book_header.encryption_type != 0: - raise MobiError('Cannot extract content from DRM protected ebook') + raise MobiError('Cannot extract content from a DRM protected ebook') + + processed_records = self.extract_text() + self.add_anchors() + self.extract_images(processed_records, output_dir) + self.replace_page_breaks() + + self.processed_html = re.compile('
', re.IGNORECASE).sub( + '\n\n', + self.processed_html) + + htmlfile = os.path.join(output_dir, self.name+'.html') + open(htmlfile, 'wb').write(self.processed_html.encode('utf8')) + self.htmlfile = htmlfile + + if self.book_header.exth is not None: + opf = self.create_opf(htmlfile) + opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb')) + + def create_opf(self, htmlfile): + mi = self.book_header.exth.mi + opf = OPFCreator(mi) + if hasattr(self.book_header.exth, 'cover_offset'): + opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) + manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')] + for i in self.image_names: + manifest.append(('images/'+i, 'image/jpg')) + + opf.create_manifest(manifest) + opf.create_spine([os.path.basename(htmlfile)]) + return opf + + + def extract_text(self): text_sections = [self.sections[i][0] for i in range(1, self.book_header.records+1)] processed_records = list(range(0, self.book_header.records+1)) @@ -189,29 +223,7 @@ class MobiReader(object): else: raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type)) - self.add_anchors() - self.extract_images(processed_records, output_dir) - self.replace_page_breaks() - - self.processed_html = re.compile('', re.IGNORECASE).sub( - '\n\n', - self.processed_html) - - htmlfile = os.path.join(output_dir, self.name+'.html') - open(htmlfile, 'wb').write(self.processed_html.encode('utf8')) - - if self.book_header.exth is not None: - mi = self.book_header.exth.mi - opf = OPFCreator(mi) - if hasattr(self.book_header.exth, 'cover_offset'): - opf.cover = 'images/%d.jpg'%(self.book_header.exth.cover_offset+1) - manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')] - for i in self.image_names: - manifest.append(('images/'+i, 'image/jpg')) - - opf.create_manifest(manifest) - opf.create_spine([os.path.basename(htmlfile)]) - opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb')) + return processed_records def replace_page_breaks(self): @@ -264,10 +276,26 @@ class MobiReader(object): one = re.compile(r'src=["\']{0,1}[^\'"]+["\']{0,1}', re.IGNORECASE).sub('', match.group(1)).strip() return '