Implement getting metadata from MOBI and PRC files

This commit is contained in:
Kovid Goyal 2008-02-15 08:07:00 +00:00
parent 731811e871
commit 4185f4734a
11 changed files with 152 additions and 47 deletions

View File

@ -38,6 +38,7 @@ entry_points = {
'web2disk = libprs500.web.fetch.simple:main',
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',
'any2lrf = libprs500.ebooks.lrf.any.convert_from:main',
'lrf2lrs = libprs500.ebooks.lrf.parser:main',
'lrs2lrf = libprs500.ebooks.lrf.lrs.convert_from:main',

View File

@ -25,6 +25,7 @@ from libprs500.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf
from libprs500.ebooks.lrf.txt.convert_from import process_file as txt2lrf
from libprs500.ebooks.lrf.html.convert_from import process_file as html2lrf
from libprs500.ebooks.lrf.epub.convert_from import process_file as epub2lrf
from libprs500.ebooks.lrf.mobi.convert_from import process_file as mobi2lrf
def largest_file(files):
maxsize, file = 0, None
@ -81,7 +82,7 @@ def handle_archive(path):
files = []
cdir = traverse_subdirs(tdir)
file = None
for ext in ('lit', 'rtf', 'pdf', 'txt', 'epub'):
for ext in ('lit', 'rtf', 'pdf', 'txt', 'epub', 'mobi', 'prc'):
pat = os.path.join(cdir, '*.'+ext)
files.extend(glob.glob(pat))
file = largest_file(files)
@ -135,6 +136,8 @@ def process_file(path, options, logger=None):
convertor = txt2lrf
elif 'epub' == ext:
convertor = epub2lrf
elif ext in ['mobi', 'prc']:
convertor = mobi2lrf
if not convertor:
raise UnknownFormatError('Coverting from %s to LRF is not supported.')
convertor(path, options, logger)
@ -150,7 +153,7 @@ def main(args=sys.argv, logger=None, gui_mode=False):
any2lrf myfile
Convert any ebook format into LRF. Supported formats are:
LIT, RTF, TXT, HTML, EPUB and PDF. any2lrf will also process a RAR or
LIT, RTF, TXT, HTML, EPUB, MOBI, PRC and PDF. any2lrf will also process a RAR or
ZIP archive.
''', gui_mode=gui_mode)
options, args = parser.parse_args(args)

View File

@ -1740,9 +1740,9 @@ def process_file(path, options, logger=None):
re.compile(fpba[2], re.IGNORECASE)]
if not hasattr(options, 'anchor_ids'):
options.anchor_ids = True
files = options.spine if options.use_spine else [path]
files = options.spine if (options.use_spine and hasattr(options, 'spine')) else [path]
conv = HTMLConverter(book, fonts, options, logger, files)
if options.use_spine:
if options.use_spine and hasattr(options, 'toc'):
conv.create_toc(options.toc)
oname = options.output
if not oname:

View File

@ -20,7 +20,7 @@ import sys, os, logging
from libprs500 import __author__, __appname__, __version__, setup_cli_handlers
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \
CData, Tag
CData, Tag, UnicodeDammit
from libprs500.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
@ -38,7 +38,8 @@ class LrsParser(object):
def __init__(self, stream, logger):
self.logger = logger
src = stream.read()
self.soup = BeautifulStoneSoup(src, selfClosingTags=self.SELF_CLOSING_TAGS)
self.soup = BeautifulStoneSoup(UnicodeDammit(src).unicode,
selfClosingTags=self.SELF_CLOSING_TAGS)
self.objects = {}
for obj in self.soup.findAll(objid=True):
self.objects[obj['objid']] = obj

View File

@ -0,0 +1,73 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''''''
import sys, tempfile, os, logging, shutil
from libprs500 import setup_cli_handlers, __appname__
from libprs500.ebooks.mobi.reader import MobiReader
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
def generate_html(mobifile, tdir):
mr = MobiReader(mobifile)
mr.extract_content(tdir)
return mr.htmlfile
def process_file(path, options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('lit2lrf')
setup_cli_handlers(logger, level)
mobi = os.path.abspath(os.path.expanduser(path))
tdir = tempfile.mkdtemp('mobi2lrf', __appname__)
try:
htmlfile = generate_html(mobi, tdir)
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
options.output = os.path.abspath(os.path.expanduser(options.output))
options.use_spine = True
html_process_file(htmlfile, options, logger=logger)
finally:
try:
shutil.rmtree(tdir)
except:
logger.warning('Failed to delete temporary directory '+tdir)
def option_parser():
return lrf_option_parser(
'''Usage: %prog [options] mybook.mobi|prc\n\n'''
'''%prog converts mybook.mobi to mybook.lrf'''
)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No mobi file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -21,6 +21,7 @@ from libprs500.ebooks.metadata.pdf import get_metadata as pdf_metadata
from libprs500.ebooks.metadata.lit import get_metadata as lit_metadata
from libprs500.ebooks.metadata.epub import get_metadata as epub_metadata
from libprs500.ebooks.metadata.html import get_metadata as html_metadata
from libprs500.ebooks.mobi.reader import get_metadata as mobi_metadata
from libprs500.ebooks.metadata.opf import OPFReader
from libprs500.ebooks.metadata.rtf import set_metadata as set_rtf_metadata
from libprs500.ebooks.lrf.meta import set_metadata as set_lrf_metadata
@ -31,6 +32,8 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
if stream_type: stream_type = stream_type.lower()
if stream_type in ('html', 'html', 'xhtml', 'xhtm'):
stream_type = 'html'
if stream_type in ('mobi', 'prc'):
stream_type = 'mobi'
if use_libprs_metadata and hasattr(stream, 'name'):
mi = libprs_metadata(stream.name)
if mi is not None:
@ -102,7 +105,10 @@ def metadata_from_filename(name, pat=None):
return mi
def libprs_metadata(name):
if os.path.basename(name) != 'metadata.opf':
name = os.path.join(os.path.dirname(name), 'metadata.opf')
if os.access(name, os.R_OK):
print name
name = os.path.abspath(name)
f = open(name, 'rb')
opf = OPFReader(f, os.path.dirname(name))

View File

@ -558,6 +558,7 @@ class OPFCreator(OPF):
self.tags = mi.tags
if mi.isbn:
self.isbn = mi.isbn
self.cover_data = mi.cover_data
if hasattr(mi, 'libprs_id'):
self.libprs_id = mi.libprs_id
if hasattr(mi, 'uid'):

View File

@ -1,6 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@ -19,13 +18,14 @@
Read data from .mobi files
'''
import sys, struct, os, cStringIO, re
import sys, struct, os, cStringIO, re, atexit, shutil, tempfile
try:
from PIL import Image as PILImage
except ImportError:
import Image as PILImage
from libprs500 import __appname__
from libprs500.ebooks.mobi import MobiError
from libprs500.ebooks.mobi.huffcdic import HuffReader
from libprs500.ebooks.mobi.palmdoc import decompress_doc
@ -57,9 +57,9 @@ class EXTHHeader(object):
elif id == 202:
self.thumbnail_offset, = struct.unpack('>L', content)
pos += 3
stop = raw.find('\x00')
stop = raw[pos:].find('\x00')
if stop > -1:
self.mi.title = raw[pos:stop].decode(codec, 'ignore')
self.mi.title = raw[pos:pos+stop].decode(codec, 'ignore')
def process_metadata(self, id, content, codec):
@ -161,8 +161,42 @@ class MobiReader(object):
def extract_content(self, output_dir=os.getcwdu()):
output_dir = os.path.abspath(output_dir)
if self.book_header.encryption_type != 0:
raise MobiError('Cannot extract content from DRM protected ebook')
raise MobiError('Cannot extract content from a DRM protected ebook')
processed_records = self.extract_text()
self.add_anchors()
self.extract_images(processed_records, output_dir)
self.replace_page_breaks()
self.processed_html = re.compile('<head>', re.IGNORECASE).sub(
'<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n',
self.processed_html)
htmlfile = os.path.join(output_dir, self.name+'.html')
open(htmlfile, 'wb').write(self.processed_html.encode('utf8'))
self.htmlfile = htmlfile
if self.book_header.exth is not None:
opf = self.create_opf(htmlfile)
opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
def create_opf(self, htmlfile):
mi = self.book_header.exth.mi
opf = OPFCreator(mi)
if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
for i in self.image_names:
manifest.append(('images/'+i, 'image/jpg'))
opf.create_manifest(manifest)
opf.create_spine([os.path.basename(htmlfile)])
return opf
def extract_text(self):
text_sections = [self.sections[i][0] for i in range(1, self.book_header.records+1)]
processed_records = list(range(0, self.book_header.records+1))
@ -189,29 +223,7 @@ class MobiReader(object):
else:
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
self.add_anchors()
self.extract_images(processed_records, output_dir)
self.replace_page_breaks()
self.processed_html = re.compile('<head>', re.IGNORECASE).sub(
'<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n',
self.processed_html)
htmlfile = os.path.join(output_dir, self.name+'.html')
open(htmlfile, 'wb').write(self.processed_html.encode('utf8'))
if self.book_header.exth is not None:
mi = self.book_header.exth.mi
opf = OPFCreator(mi)
if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%d.jpg'%(self.book_header.exth.cover_offset+1)
manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
for i in self.image_names:
manifest.append(('images/'+i, 'image/jpg'))
opf.create_manifest(manifest)
opf.create_spine([os.path.basename(htmlfile)])
opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
return processed_records
def replace_page_breaks(self):
@ -264,10 +276,26 @@ class MobiReader(object):
one = re.compile(r'src=["\']{0,1}[^\'"]+["\']{0,1}', re.IGNORECASE).sub('', match.group(1)).strip()
return '<img'+one+' src="images/%s.jpg"'%match.group(2)
self.processed_html = \
if hasattr(self, 'processed_html'):
self.processed_html = \
re.compile(r'<img(.+?)recindex=[\'"]{0,1}(\d+)[\'"]{0,1}', re.IGNORECASE|re.DOTALL)\
.sub(fix_images, self.processed_html)
def get_metadata(stream):
mr = MobiReader(stream)
if mr.book_header.exth is None:
mi = MetaInformation(mr.name, ['Unknown'])
else:
tdir = tempfile.mkdtemp('mobi-meta', __appname__)
atexit.register(shutil.rmtree, tdir)
mr.extract_images([], tdir)
mi = mr.create_opf('dummy.html')
if mi.cover:
cover = os.path.join(tdir, mi.cover)
print cover
if os.access(cover, os.R_OK):
mi.cover_data = ('JPEG', open(os.path.join(tdir, mi.cover), 'rb').read())
return mi
def option_parser():
from libprs500 import OptionParser

View File

@ -25,7 +25,7 @@ from libprs500.gui2 import qstring_to_unicode, error_dialog, \
from libprs500.gui2.widgets import FontFamilyModel
from libprs500.ebooks.lrf import option_parser
from libprs500.ptempfile import PersistentTemporaryFile
from libprs500 import __appname__, osx_version
from libprs500 import __appname__
font_family_model = None
@ -101,16 +101,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
if self.selected_format:
self.setWindowTitle(_('Convert %s to LRF')%(self.selected_format,))
if self.selected_format == 'RTF':
try:
major, minor = osx_version()[:2]
if False and (major == 10 and minor > 4) or major > 10:
self.selected_format = None
d = error_dialog(self, _('RTF conversion not supported'),
_('Conversion of RTF files is not supported on OS X Leopard and higher. This is because unrtf, the underlying program does not work. If you are willing to port unrtf to Leopard, contact me.'))
d.exec_()
except:
pass
else:
self.setWindowTitle(_('Set conversion defaults'))

View File

@ -109,6 +109,7 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('lit2lrf', htmlop, ['lit']))
f.write(opts_and_exts('epub2lrf', htmlop, ['epub']))
f.write(opts_and_exts('rtf2lrf', htmlop, ['rtf']))
f.write(opts_and_exts('mobi2lrf', htmlop, ['mobi', 'prc']))
f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf']))
f.write(opts_and_exts('any2lrf', htmlop,
['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', 'txt', 'lit', 'rtf', 'pdf']))