mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Implement getting metadata from MOBI and PRC files
This commit is contained in:
parent
731811e871
commit
4185f4734a
1
setup.py
1
setup.py
@ -38,6 +38,7 @@ entry_points = {
|
||||
'web2disk = libprs500.web.fetch.simple:main',
|
||||
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
|
||||
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
|
||||
'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',
|
||||
'any2lrf = libprs500.ebooks.lrf.any.convert_from:main',
|
||||
'lrf2lrs = libprs500.ebooks.lrf.parser:main',
|
||||
'lrs2lrf = libprs500.ebooks.lrf.lrs.convert_from:main',
|
||||
|
@ -25,6 +25,7 @@ from libprs500.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf
|
||||
from libprs500.ebooks.lrf.txt.convert_from import process_file as txt2lrf
|
||||
from libprs500.ebooks.lrf.html.convert_from import process_file as html2lrf
|
||||
from libprs500.ebooks.lrf.epub.convert_from import process_file as epub2lrf
|
||||
from libprs500.ebooks.lrf.mobi.convert_from import process_file as mobi2lrf
|
||||
|
||||
def largest_file(files):
|
||||
maxsize, file = 0, None
|
||||
@ -81,7 +82,7 @@ def handle_archive(path):
|
||||
files = []
|
||||
cdir = traverse_subdirs(tdir)
|
||||
file = None
|
||||
for ext in ('lit', 'rtf', 'pdf', 'txt', 'epub'):
|
||||
for ext in ('lit', 'rtf', 'pdf', 'txt', 'epub', 'mobi', 'prc'):
|
||||
pat = os.path.join(cdir, '*.'+ext)
|
||||
files.extend(glob.glob(pat))
|
||||
file = largest_file(files)
|
||||
@ -135,6 +136,8 @@ def process_file(path, options, logger=None):
|
||||
convertor = txt2lrf
|
||||
elif 'epub' == ext:
|
||||
convertor = epub2lrf
|
||||
elif ext in ['mobi', 'prc']:
|
||||
convertor = mobi2lrf
|
||||
if not convertor:
|
||||
raise UnknownFormatError('Coverting from %s to LRF is not supported.')
|
||||
convertor(path, options, logger)
|
||||
@ -150,7 +153,7 @@ def main(args=sys.argv, logger=None, gui_mode=False):
|
||||
any2lrf myfile
|
||||
|
||||
Convert any ebook format into LRF. Supported formats are:
|
||||
LIT, RTF, TXT, HTML, EPUB and PDF. any2lrf will also process a RAR or
|
||||
LIT, RTF, TXT, HTML, EPUB, MOBI, PRC and PDF. any2lrf will also process a RAR or
|
||||
ZIP archive.
|
||||
''', gui_mode=gui_mode)
|
||||
options, args = parser.parse_args(args)
|
||||
|
@ -1740,9 +1740,9 @@ def process_file(path, options, logger=None):
|
||||
re.compile(fpba[2], re.IGNORECASE)]
|
||||
if not hasattr(options, 'anchor_ids'):
|
||||
options.anchor_ids = True
|
||||
files = options.spine if options.use_spine else [path]
|
||||
files = options.spine if (options.use_spine and hasattr(options, 'spine')) else [path]
|
||||
conv = HTMLConverter(book, fonts, options, logger, files)
|
||||
if options.use_spine:
|
||||
if options.use_spine and hasattr(options, 'toc'):
|
||||
conv.create_toc(options.toc)
|
||||
oname = options.output
|
||||
if not oname:
|
||||
|
@ -20,7 +20,7 @@ import sys, os, logging
|
||||
|
||||
from libprs500 import __author__, __appname__, __version__, setup_cli_handlers
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \
|
||||
CData, Tag
|
||||
CData, Tag, UnicodeDammit
|
||||
from libprs500.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
|
||||
BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
|
||||
Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
|
||||
@ -38,7 +38,8 @@ class LrsParser(object):
|
||||
def __init__(self, stream, logger):
|
||||
self.logger = logger
|
||||
src = stream.read()
|
||||
self.soup = BeautifulStoneSoup(src, selfClosingTags=self.SELF_CLOSING_TAGS)
|
||||
self.soup = BeautifulStoneSoup(UnicodeDammit(src).unicode,
|
||||
selfClosingTags=self.SELF_CLOSING_TAGS)
|
||||
self.objects = {}
|
||||
for obj in self.soup.findAll(objid=True):
|
||||
self.objects[obj['objid']] = obj
|
||||
|
0
src/libprs500/ebooks/lrf/mobi/__init__.py
Normal file
0
src/libprs500/ebooks/lrf/mobi/__init__.py
Normal file
73
src/libprs500/ebooks/lrf/mobi/convert_from.py
Normal file
73
src/libprs500/ebooks/lrf/mobi/convert_from.py
Normal file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## You should have received a copy of the GNU General Public License along
|
||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
''''''
|
||||
|
||||
import sys, tempfile, os, logging, shutil
|
||||
|
||||
from libprs500 import setup_cli_handlers, __appname__
|
||||
from libprs500.ebooks.mobi.reader import MobiReader
|
||||
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
|
||||
def generate_html(mobifile, tdir):
    '''
    Extract the contents of the MOBI book `mobifile` into the directory
    `tdir` and return the path of the HTML file recorded by the reader.
    '''
    reader = MobiReader(mobifile)
    reader.extract_content(tdir)
    html_path = reader.htmlfile
    return html_path
|
||||
|
||||
def process_file(path, options, logger=None):
    '''
    Convert the MOBI/PRC file at `path` to LRF by extracting it to HTML in a
    temporary directory and handing that HTML to the HTML converter.

    :param path: Path to the input file. A leading ``~`` is expanded.
    :param options: Conversion options object. It is modified in place:
                    `options.output` is derived from the input file name
                    (placed in the current directory) when it is empty and is
                    always made absolute, and `options.use_spine` is set to
                    True.
    :param logger: Logger to report to. When None, a logger named
                   'mobi2lrf' is created and passed to `setup_cli_handlers`.
    '''
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        # This is the mobi2lrf front end; do not log under lit2lrf's name.
        logger = logging.getLogger('mobi2lrf')
        setup_cli_handlers(logger, level)
    mobi = os.path.abspath(os.path.expanduser(path))
    tdir = tempfile.mkdtemp('mobi2lrf', __appname__)
    try:
        htmlfile = generate_html(mobi, tdir)
        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))
        options.use_spine = True
        html_process_file(htmlfile, options, logger=logger)
    finally:
        # Cleanup is best effort: a failure to remove the temporary directory
        # must not mask the outcome of the conversion. Only ordinary errors
        # are swallowed, so Ctrl-C and sys.exit() still propagate.
        try:
            shutil.rmtree(tdir)
        except Exception:
            logger.warning('Failed to delete temporary directory '+tdir)
|
||||
|
||||
def option_parser():
    '''
    Return the command line option parser for mobi2lrf, built on top of the
    generic LRF option parser with a mobi specific usage message.
    '''
    usage = (
        '''Usage: %prog [options] mybook.mobi|prc\n\n'''
        '''%prog converts mybook.mobi to mybook.lrf'''
    )
    return lrf_option_parser(usage)
|
||||
|
||||
|
||||
def main(args=sys.argv, logger=None):
    '''
    Command line entry point for mobi2lrf.

    Parses `args` and converts the single input file named on the command
    line. Returns 0 after a conversion and 1 (after printing the help text)
    when the command line does not name exactly one input file.
    '''
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) == 2:
        process_file(args[1], options, logger)
        return 0

    # Wrong number of positional arguments: show usage and signal failure.
    parser.print_help()
    print('')
    print('No mobi file specified')
    return 1
|
||||
|
||||
# Run the converter when this module is executed as a script and use the
# value returned by main() as the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
|
@ -21,6 +21,7 @@ from libprs500.ebooks.metadata.pdf import get_metadata as pdf_metadata
|
||||
from libprs500.ebooks.metadata.lit import get_metadata as lit_metadata
|
||||
from libprs500.ebooks.metadata.epub import get_metadata as epub_metadata
|
||||
from libprs500.ebooks.metadata.html import get_metadata as html_metadata
|
||||
from libprs500.ebooks.mobi.reader import get_metadata as mobi_metadata
|
||||
from libprs500.ebooks.metadata.opf import OPFReader
|
||||
from libprs500.ebooks.metadata.rtf import set_metadata as set_rtf_metadata
|
||||
from libprs500.ebooks.lrf.meta import set_metadata as set_lrf_metadata
|
||||
@ -31,6 +32,8 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
|
||||
if stream_type: stream_type = stream_type.lower()
|
||||
if stream_type in ('html', 'html', 'xhtml', 'xhtm'):
|
||||
stream_type = 'html'
|
||||
if stream_type in ('mobi', 'prc'):
|
||||
stream_type = 'mobi'
|
||||
if use_libprs_metadata and hasattr(stream, 'name'):
|
||||
mi = libprs_metadata(stream.name)
|
||||
if mi is not None:
|
||||
@ -102,7 +105,10 @@ def metadata_from_filename(name, pat=None):
|
||||
return mi
|
||||
|
||||
def libprs_metadata(name):
|
||||
if os.path.basename(name) != 'metadata.opf':
|
||||
name = os.path.join(os.path.dirname(name), 'metadata.opf')
|
||||
if os.access(name, os.R_OK):
|
||||
print name
|
||||
name = os.path.abspath(name)
|
||||
f = open(name, 'rb')
|
||||
opf = OPFReader(f, os.path.dirname(name))
|
||||
|
@ -558,6 +558,7 @@ class OPFCreator(OPF):
|
||||
self.tags = mi.tags
|
||||
if mi.isbn:
|
||||
self.isbn = mi.isbn
|
||||
self.cover_data = mi.cover_data
|
||||
if hasattr(mi, 'libprs_id'):
|
||||
self.libprs_id = mi.libprs_id
|
||||
if hasattr(mi, 'uid'):
|
||||
|
@ -1,6 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
@ -19,13 +18,14 @@
|
||||
Read data from .mobi files
|
||||
'''
|
||||
|
||||
import sys, struct, os, cStringIO, re
|
||||
import sys, struct, os, cStringIO, re, atexit, shutil, tempfile
|
||||
|
||||
try:
|
||||
from PIL import Image as PILImage
|
||||
except ImportError:
|
||||
import Image as PILImage
|
||||
|
||||
from libprs500 import __appname__
|
||||
from libprs500.ebooks.mobi import MobiError
|
||||
from libprs500.ebooks.mobi.huffcdic import HuffReader
|
||||
from libprs500.ebooks.mobi.palmdoc import decompress_doc
|
||||
@ -57,9 +57,9 @@ class EXTHHeader(object):
|
||||
elif id == 202:
|
||||
self.thumbnail_offset, = struct.unpack('>L', content)
|
||||
pos += 3
|
||||
stop = raw.find('\x00')
|
||||
stop = raw[pos:].find('\x00')
|
||||
if stop > -1:
|
||||
self.mi.title = raw[pos:stop].decode(codec, 'ignore')
|
||||
self.mi.title = raw[pos:pos+stop].decode(codec, 'ignore')
|
||||
|
||||
|
||||
def process_metadata(self, id, content, codec):
|
||||
@ -161,8 +161,42 @@ class MobiReader(object):
|
||||
|
||||
|
||||
def extract_content(self, output_dir=os.getcwdu()):
|
||||
output_dir = os.path.abspath(output_dir)
|
||||
if self.book_header.encryption_type != 0:
|
||||
raise MobiError('Cannot extract content from DRM protected ebook')
|
||||
raise MobiError('Cannot extract content from a DRM protected ebook')
|
||||
|
||||
processed_records = self.extract_text()
|
||||
self.add_anchors()
|
||||
self.extract_images(processed_records, output_dir)
|
||||
self.replace_page_breaks()
|
||||
|
||||
self.processed_html = re.compile('<head>', re.IGNORECASE).sub(
|
||||
'<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n',
|
||||
self.processed_html)
|
||||
|
||||
htmlfile = os.path.join(output_dir, self.name+'.html')
|
||||
open(htmlfile, 'wb').write(self.processed_html.encode('utf8'))
|
||||
self.htmlfile = htmlfile
|
||||
|
||||
if self.book_header.exth is not None:
|
||||
opf = self.create_opf(htmlfile)
|
||||
opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
|
||||
|
||||
def create_opf(self, htmlfile):
    '''
    Build an OPF object for the book from its EXTH metadata.

    The manifest lists the base name of `htmlfile` followed by every entry
    of `self.image_names` under ``images/``; the spine contains only the
    HTML file. The cover is set only when the EXTH header has a
    `cover_offset` attribute.
    '''
    exth = self.book_header.exth
    html_name = os.path.basename(htmlfile)
    opf = OPFCreator(exth.mi)
    if hasattr(exth, 'cover_offset'):
        opf.cover = 'images/%05d.jpg'%(exth.cover_offset+1)

    manifest = [(html_name, 'text/x-oeb1-document')]
    manifest.extend(('images/'+name, 'image/jpg') for name in self.image_names)

    opf.create_manifest(manifest)
    opf.create_spine([html_name])
    return opf
|
||||
|
||||
|
||||
def extract_text(self):
|
||||
text_sections = [self.sections[i][0] for i in range(1, self.book_header.records+1)]
|
||||
processed_records = list(range(0, self.book_header.records+1))
|
||||
|
||||
@ -189,29 +223,7 @@ class MobiReader(object):
|
||||
else:
|
||||
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
|
||||
|
||||
self.add_anchors()
|
||||
self.extract_images(processed_records, output_dir)
|
||||
self.replace_page_breaks()
|
||||
|
||||
self.processed_html = re.compile('<head>', re.IGNORECASE).sub(
|
||||
'<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n',
|
||||
self.processed_html)
|
||||
|
||||
htmlfile = os.path.join(output_dir, self.name+'.html')
|
||||
open(htmlfile, 'wb').write(self.processed_html.encode('utf8'))
|
||||
|
||||
if self.book_header.exth is not None:
|
||||
mi = self.book_header.exth.mi
|
||||
opf = OPFCreator(mi)
|
||||
if hasattr(self.book_header.exth, 'cover_offset'):
|
||||
opf.cover = 'images/%d.jpg'%(self.book_header.exth.cover_offset+1)
|
||||
manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
|
||||
for i in self.image_names:
|
||||
manifest.append(('images/'+i, 'image/jpg'))
|
||||
|
||||
opf.create_manifest(manifest)
|
||||
opf.create_spine([os.path.basename(htmlfile)])
|
||||
opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
|
||||
return processed_records
|
||||
|
||||
|
||||
def replace_page_breaks(self):
|
||||
@ -264,10 +276,26 @@ class MobiReader(object):
|
||||
one = re.compile(r'src=["\']{0,1}[^\'"]+["\']{0,1}', re.IGNORECASE).sub('', match.group(1)).strip()
|
||||
return '<img'+one+' src="images/%s.jpg"'%match.group(2)
|
||||
|
||||
self.processed_html = \
|
||||
if hasattr(self, 'processed_html'):
|
||||
self.processed_html = \
|
||||
re.compile(r'<img(.+?)recindex=[\'"]{0,1}(\d+)[\'"]{0,1}', re.IGNORECASE|re.DOTALL)\
|
||||
.sub(fix_images, self.processed_html)
|
||||
|
||||
|
||||
def get_metadata(stream):
    '''
    Read metadata, and the cover image when it can be found, from a
    MOBI/PRC book.

    :param stream: Passed unchanged to `MobiReader`.
    :return: When the book has no EXTH header, a `MetaInformation` built from
             the reader's name and ``['Unknown']`` (presumably the author
             list). Otherwise the object returned by
             `MobiReader.create_opf`, with `cover_data` set to
             ``('JPEG', <file contents>)`` if the cover file it names was
             extracted and is readable.
    '''
    mr = MobiReader(stream)
    if mr.book_header.exth is None:
        return MetaInformation(mr.name, ['Unknown'])

    # The images are extracted to a scratch directory only so that the cover
    # can be read back; the directory is removed when the interpreter exits.
    tdir = tempfile.mkdtemp('mobi-meta', __appname__)
    atexit.register(shutil.rmtree, tdir)
    mr.extract_images([], tdir)
    mi = mr.create_opf('dummy.html')
    if mi.cover:
        cover = os.path.join(tdir, mi.cover)
        if os.access(cover, os.R_OK):
            # Close the file explicitly instead of relying on garbage
            # collection to release the handle.
            f = open(cover, 'rb')
            try:
                mi.cover_data = ('JPEG', f.read())
            finally:
                f.close()
    return mi
|
||||
|
||||
def option_parser():
|
||||
from libprs500 import OptionParser
|
||||
|
@ -25,7 +25,7 @@ from libprs500.gui2 import qstring_to_unicode, error_dialog, \
|
||||
from libprs500.gui2.widgets import FontFamilyModel
|
||||
from libprs500.ebooks.lrf import option_parser
|
||||
from libprs500.ptempfile import PersistentTemporaryFile
|
||||
from libprs500 import __appname__, osx_version
|
||||
from libprs500 import __appname__
|
||||
|
||||
font_family_model = None
|
||||
|
||||
@ -101,16 +101,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
|
||||
|
||||
if self.selected_format:
|
||||
self.setWindowTitle(_('Convert %s to LRF')%(self.selected_format,))
|
||||
if self.selected_format == 'RTF':
|
||||
try:
|
||||
major, minor = osx_version()[:2]
|
||||
if False and (major == 10 and minor > 4) or major > 10:
|
||||
self.selected_format = None
|
||||
d = error_dialog(self, _('RTF conversion not supported'),
|
||||
_('Conversion of RTF files is not supported on OS X Leopard and higher. This is because unrtf, the underlying program does not work. If you are willing to port unrtf to Leopard, contact me.'))
|
||||
d.exec_()
|
||||
except:
|
||||
pass
|
||||
|
||||
else:
|
||||
self.setWindowTitle(_('Set conversion defaults'))
|
||||
|
||||
|
@ -109,6 +109,7 @@ def setup_completion(fatal_errors):
|
||||
f.write(opts_and_exts('lit2lrf', htmlop, ['lit']))
|
||||
f.write(opts_and_exts('epub2lrf', htmlop, ['epub']))
|
||||
f.write(opts_and_exts('rtf2lrf', htmlop, ['rtf']))
|
||||
f.write(opts_and_exts('mobi2lrf', htmlop, ['mobi', 'prc']))
|
||||
f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf']))
|
||||
f.write(opts_and_exts('any2lrf', htmlop,
|
||||
['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', 'txt', 'lit', 'rtf', 'pdf']))
|
||||
|
Loading…
x
Reference in New Issue
Block a user