Implement conversion of non-DRMed MOBI/PRC files to OEBPS ebooks. Handles all compression methods.

This commit is contained in:
Kovid Goyal 2008-02-15 06:19:58 +00:00
parent 97fdac5ef9
commit 782b195531
7 changed files with 549 additions and 2 deletions

View File

@ -44,6 +44,7 @@ entry_points = {
'pdfreflow = libprs500.ebooks.lrf.pdf.reflow:main',
'isbndb = libprs500.ebooks.metadata.isbndb:main',
'librarything = libprs500.ebooks.metadata.library_thing:main',
'mobi2oeb = libprs500.ebooks.mobi.reader:main',
'lrf2html = libprs500.ebooks.lrf.html.convert_to:main',
],
'gui_scripts' : [

View File

@ -177,6 +177,7 @@ class OPF(MetaInformation):
MIMETYPE = 'application/oebps-package+xml'
ENTITY_PATTERN = re.compile(r'&(\S+?);')
uid = standard_field('uid')
libprs_id = standard_field('libprs_id')
title = standard_field('title')
authors = standard_field('authors')
@ -243,6 +244,7 @@ class OPF(MetaInformation):
else:
dcm = doc.createElement(type)
metadata.appendChild(dcm)
metadata.appendChild(doc.createTextNode('\n'))
tags = dcm.getElementsByTagName(name)
if tags and not replace:
for tag in tags:
@ -260,6 +262,7 @@ class OPF(MetaInformation):
for attr, vattr in vattrs:
el.setAttribute(attr, vattr)
dcm.appendChild(el)
dcm.appendChild(doc.createTextNode('\n'))
self._commit(doc)
@ -350,6 +353,15 @@ class OPF(MetaInformation):
comments = ''
self._set_metadata_element('dc:Description', comments)
def get_uid(self):
    '''
    Return the value of the ``unique-identifier`` attribute of the
    <package> element, or None when the attribute is not set.
    '''
    root = self.soup.find('package')
    if not root.has_key('unique-identifier'):
        return None
    return root['unique-identifier']
def set_uid(self, uid):
    '''
    Store str(uid) in the ``unique-identifier`` attribute of the
    <package> element.
    '''
    root = self.soup.find('package')
    value = str(uid)
    root['unique-identifier'] = value
def get_category(self):
category = self.soup.find('dc:type')
if category:
@ -500,7 +512,12 @@ class OPF(MetaInformation):
self._set_metadata_element('dc:Subject', tags)
def write(self, stream):
    '''
    Serialize the OPF document to `stream` as UTF-8 encoded bytes.

    The markup is lightly reformatted: empty <item>/<itemref> elements are
    collapsed to self-closing tags, and the children of <manifest> and
    <spine> are put on their own, indented, lines.

    The document is written exactly once. The earlier
    ``stream.write(self.soup.prettify('utf-8'))`` call is gone: keeping it
    next to the write below emitted two concatenated XML documents.
    '''
    src = unicode(self.soup)
    src = re.sub(r'>\s*</item(ref)*>', ' />\n', src)
    src = re.sub(r'<manifest><', '<manifest>\n<', src)
    src = re.sub(r'<spine><', '<spine>\n<', src)
    # (?m) makes ^ match at the start of every line. Without it the pattern
    # can only match at the very beginning of the document, so no <item>
    # line was ever indented.
    src = re.sub(r'(?m)^<item', ' <item', src)
    stream.write((src + '\n').encode('utf-8'))
class OPFReader(OPF):
@ -543,6 +560,44 @@ class OPFCreator(OPF):
self.isbn = mi.isbn
if hasattr(mi, 'libprs_id'):
self.libprs_id = mi.libprs_id
if hasattr(mi, 'uid'):
self.uid = mi.uid
def create_manifest(self, entries):
    '''
    Append a <manifest> element to the package and fill it with one
    <item> per entry.

    :param entries: iterable of ``(href, media_type)`` pairs.

    Side effects: resets ``self.href_map`` to a dict mapping each href to
    the id of its manifest item (used by create_spine) and commits the
    modified document via ``self._commit``.
    '''
    doc = dom.parseString(self.soup.__str__('UTF-8').strip())
    package = doc.documentElement
    manifest = doc.createElement('manifest')
    package.appendChild(manifest)
    package.appendChild(doc.createTextNode('\n'))
    self.href_map = {}
    for index, (href, media_type) in enumerate(entries):
        # Sequential ids always start with a letter and are unique.
        # str(hash(href)) could start with '-' or a digit (not a valid
        # XML name), differed between platforms and could collide.
        item_id = 'item%d' % (index + 1)
        # If an href is listed twice the spine refers to its first item.
        self.href_map.setdefault(href, item_id)
        item = doc.createElement('item')
        item.setAttribute('href', href)
        item.setAttribute('media-type', media_type)
        item.setAttribute('id', item_id)
        manifest.appendChild(item)
        manifest.appendChild(doc.createTextNode('\n'))
    self._commit(doc)
def create_spine(self, entries):
    '''
    Append a <spine> element to the package containing one <itemref> per
    href in `entries`.

    Every href is looked up in ``self.href_map``, so it must have been
    registered by a previous call to create_manifest (a missing href
    raises KeyError). The modified document is committed via
    ``self._commit``.
    '''
    doc = dom.parseString(self.soup.__str__('UTF-8').strip())
    root = doc.documentElement
    spine = doc.createElement('spine')
    for node in (spine, doc.createTextNode('\n')):
        root.appendChild(node)
    for href in entries:
        ref = doc.createElement('itemref')
        ref.setAttribute('idref', self.href_map[href])
        spine.appendChild(ref)
        spine.appendChild(doc.createTextNode('\n'))
    self._commit(doc)
def main(args=sys.argv):
parser = get_parser('opf')

View File

@ -0,0 +1,20 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
class MobiError(Exception):
    '''Error raised by the MOBI reading code in this package.'''

View File

@ -0,0 +1,127 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Decompress MOBI files compressed with the Huff/cdic algorithm. Code thanks to darkninja
and igorsk.
'''
import struct
from libprs500.ebooks.mobi import MobiError
class BitReader(object):
    '''
    Cursor over a byte string that is read most-significant-bit first.
    '''

    def __init__(self, data):
        # Four zero bytes are appended so that peek() can always read a
        # full 32 bits, even at the very end of the real data.
        self.data = data + "\x00\x00\x00\x00"
        self.pos = 0
        self.nbits = len(data) * 8

    def peek(self, n):
        '''Return the next `n` bits as an integer without advancing.'''
        value, gathered = 0, 0
        while gathered < n:
            cursor = self.pos + gathered
            value = (value << 8) | ord(self.data[cursor >> 3])
            # Only the bits from `cursor` to the end of its byte are new.
            gathered += 8 - (cursor & 7)
        return (value >> (gathered - n)) & ((1 << n) - 1)

    def eat(self, n):
        '''
        Advance by `n` bits. Return False once the position has moved past
        the end of the real data, True otherwise.
        '''
        self.pos += n
        return self.pos <= self.nbits

    def left(self):
        '''Return the number of real data bits not yet consumed.'''
        return self.nbits - self.pos
class HuffReader(object):
    '''
    Decompressor for MOBI text records packed with the Huff/cdic scheme.

    :param huffs: list of raw records; the first must be the HUFF record,
                  all following ones are CDIC dictionary records.
    :param extra_flags: bit field describing the trailing entries appended
                  to every text record (see sizeof_trailing_entries).
    :param codec: name of the codec used to decode the unpacked text.

    Raises MobiError when the records are missing or have invalid headers.
    '''

    def __init__(self, huffs, extra_flags, codec='cp1252'):
        self.huffs, self.extra_flags, self.codec = huffs, extra_flags, codec

        if len(huffs) < 2:
            raise MobiError('Missing HUFF/CDIC records')
        if huffs[0][0:4] != 'HUFF' or huffs[0][4:8] != '\x00\x00\x00\x18':
            raise MobiError('Invalid HUFF header')
        # Both header checks report the same error type so that callers
        # handling MobiError see every "invalid file" condition.
        if huffs[1][0:4] != 'CDIC' or huffs[1][4:8] != '\x00\x00\x00\x10':
            raise MobiError('Invalid CDIC header')

        self.entry_bits, = struct.unpack('>L', huffs[1][12:16])
        off1, off2 = struct.unpack('>LL', huffs[0][16:24])
        self.dict1 = struct.unpack('<256L', huffs[0][off1:off1+256*4])
        self.dict2 = struct.unpack('<64L', huffs[0][off2:off2+64*4])
        self.dicts = huffs[1:]
        self.r = ''

    def _unpack(self, bits, depth = 0):
        '''
        Decode every code in the BitReader `bits`, appending the decoded
        bytes to ``self.r``. Dictionary entries that are themselves
        compressed are decoded recursively; `depth` bounds that recursion.
        '''
        if depth > 32:
            raise MobiError('Corrupt file')
        while bits.left():
            dw = bits.peek(32)
            v = self.dict1[dw >> 24]
            codelen = v & 0x1F
            # This is validation of file data, so it must not be an assert:
            # asserts vanish under -O and a zero length would then never
            # advance the reader.
            if codelen == 0:
                raise MobiError('Corrupt file: invalid Huffman code length')
            code = dw >> (32 - codelen)
            r = (v >> 8)
            if not (v & 0x80):
                while code < self.dict2[(codelen-1)*2]:
                    codelen += 1
                    if codelen > 32:
                        # Only 32 bits were peeked; a longer code cannot
                        # exist in a valid file.
                        raise MobiError('Corrupt file: invalid Huffman code length')
                    code = dw >> (32 - codelen)
                r = self.dict2[(codelen-1)*2+1]
            r -= code
            if not bits.eat(codelen):
                return
            dicno = r >> self.entry_bits
            off1 = 16 + (r - (dicno << self.entry_bits)) * 2
            dic = self.dicts[dicno]
            off2 = 16 + ord(dic[off1]) * 256 + ord(dic[off1+1])
            blen = ord(dic[off2]) * 256 + ord(dic[off2+1])
            slice = dic[off2+2:off2+2+(blen&0x7fff)]
            if blen & 0x8000:
                # High bit set: the entry is literal text.
                self.r += slice
            else:
                self._unpack(BitReader(slice), depth + 1)

    def unpack(self, data):
        '''Return the decoded bytes for one compressed chunk `data`.'''
        self.r = ''
        self._unpack(BitReader(data))
        return self.r

    def sizeof_trailing_entries(self, data):
        '''
        Return the total number of bytes taken up by the trailing entries
        at the end of the record `data`.

        One entry is expected for every bit set in ``self.extra_flags``
        above bit 0. The size of each entry is stored, 7 bits per byte, in
        the last bytes of the entry and is read backwards.
        '''
        def sizeof_trailing_entry(ptr, psize):
            bitpos, result = 0, 0
            while True:
                v = ord(ptr[psize-1])
                result |= (v & 0x7F) << bitpos
                bitpos += 7
                psize -= 1
                # Stop at the byte with the high bit set, after four bytes
                # or when the record is exhausted.
                if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
                    return result

        num = 0
        size = len(data)
        flags = self.extra_flags >> 1
        while flags:
            if flags & 1:
                num += sizeof_trailing_entry(data, size - num)
            flags >>= 1
        return num

    def decompress(self, sections):
        '''
        Unpack all text records in `sections` (trailing entries removed)
        and return the concatenated text decoded with ``self.codec``. A
        single trailing '#' is dropped from the result.
        '''
        parts = []
        for data in sections:
            trail_size = self.sizeof_trailing_entries(data)
            parts.append(self.unpack(data[:len(data)-trail_size]))
        r = ''.join(parts)
        if r.endswith('#'):
            r = r[:-1]
        return r.decode(self.codec)

View File

@ -0,0 +1,46 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# Number of low bits of a back-reference pair that hold its length.
COUNT_BITS = 3

def decompress_doc(data, codec='cp1252'):
    '''
    Decompress one PalmDoc compressed record and return it as unicode.

    :param data: the compressed record as a byte string.
    :param codec: codec used to decode the decompressed bytes.

    Byte values are interpreted as follows:
      * 1..8: copy that many following bytes unchanged
      * 0 and 9..0x7f: the byte itself
      * 0xc0..0xff: a space followed by the byte XOR 0x80
      * 0x80..0xbf: together with the next byte a back-reference made of a
        distance and a length (length - 3 is stored in the low COUNT_BITS
        bits)
    '''
    source = [ord(ch) for ch in data]
    output = []
    length_mask = (1 << COUNT_BITS) - 1
    pos = 0
    while pos < len(source):
        byte = source[pos]
        pos += 1
        if 1 <= byte <= 8:
            output.extend(source[pos:pos+byte])
            pos += byte
        elif byte <= 0x7f:
            output.append(byte)
        elif byte >= 0xc0:
            output.append(ord(' '))
            output.append(byte ^ 0x80)
        else:
            pair = (byte << 8) + source[pos]
            pos += 1
            distance = (pair & 0x3fff) >> COUNT_BITS
            start = len(output) - distance
            # Copy one byte at a time: the copied run may overlap the
            # bytes that are being appended.
            for offset in range((pair & length_mask) + 3):
                output.append(output[start + offset])
    raw = ''.join(map(chr, output))
    return unicode(raw, codec)

View File

@ -0,0 +1,296 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Read data from .mobi files
'''
import sys, struct, os, cStringIO, re
try:
from PIL import Image as PILImage
except ImportError:
import Image as PILImage
from libprs500.ebooks.mobi import MobiError
from libprs500.ebooks.mobi.huffcdic import HuffReader
from libprs500.ebooks.mobi.palmdoc import decompress_doc
from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.metadata.opf import OPFCreator
class EXTHHeader(object):
    '''
    Parser for the EXTH block of a MOBI header.

    :param raw: byte string starting at the EXTH block.
    :param codec: codec used to decode text fields.

    The metadata found is collected in ``self.mi``. ``cover_offset`` and
    ``thumbnail_offset`` are only set when the corresponding records are
    present.
    '''

    def __init__(self, raw, codec):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0

        self.mi = MetaInformation('Unknown', ['Unknown'])
        self.has_fake_cover = True

        for i in range(self.num_items):
            # Every record is: 4 byte id, 4 byte total size, then content.
            rec_id, size = struct.unpack('>LL', raw[pos:pos+8])
            content = raw[pos+8:pos+size]
            pos += size
            if rec_id >= 100 and rec_id < 200:
                self.process_metadata(rec_id, content, codec)
            elif rec_id == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif rec_id == 201:
                self.cover_offset, = struct.unpack('>L', content)
            elif rec_id == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)

        # NOTE(review): the title is assumed to start 3 bytes after the
        # last record - confirm against the padding rules of the format.
        pos += 3
        # The search must start at pos: searching from 0 always hits the
        # zero bytes of the first record id, which left the title empty.
        stop = raw.find('\x00', pos)
        if stop > -1:
            title = raw[pos:stop].decode(codec, 'ignore')
            if title:
                # Keep the 'Unknown' placeholder rather than an empty title
                self.mi.title = title

    def process_metadata(self, id, content, codec):
        '''
        Store the metadata record `id` (100: authors, 101: publisher,
        103: comments, 104: ISBN, 105: category) in ``self.mi``. Records
        with any other id are ignored.
        '''
        if id == 100:
            authors = []
            for chunk in content.split(','):
                authors.extend(chunk.split('&'))
            # Names are separated by ',' or '&', usually followed by a
            # space: drop the surrounding whitespace and empty names.
            authors = [a.decode(codec, 'ignore').strip() for a in authors]
            self.mi.authors = [a for a in authors if a]
        elif id == 101:
            self.mi.publisher = content.decode(codec, 'ignore')
        elif id == 103:
            self.mi.comments = content.decode(codec, 'ignore')
        elif id == 104:
            self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
        elif id == 105:
            self.mi.category = content.decode(codec, 'ignore')
class BookHeader(object):
    '''
    Parser for the first record of a MOBI/PRC file.

    :param raw: contents of record 0.
    :param ident: database identifier, 'BOOKMOBI' or 'TEXTREAD'.

    Raises MobiError if the text encoding declared in the header is not
    supported.
    '''

    def __init__(self, raw, ident):
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])

        self.doctype = raw[16:20]
        self.length, self.type, self.codepage, self.unique_id, self.version = \
            struct.unpack('>LLLLL', raw[20:40])

        if ident == 'TEXTREAD':
            self.codepage = 1252

        try:
            self.codec = {
                1252  : 'cp1252',
                65001 : 'utf-8',
            }[self.codepage]
        # A failed dict lookup raises KeyError. The previous
        # "except IndexError, KeyError:" only caught IndexError (binding it
        # to the name KeyError), so unknown codepages leaked a raw KeyError.
        except KeyError:
            raise MobiError('Unknown codepage: %d'%self.codepage)

        if ident == 'TEXTREAD':
            self.extra_flags = 0
        else:
            self.extra_flags, = struct.unpack('>L', raw[0xF0:0xF4])

        if self.compression_type == 'DH':
            self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])

        self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
        self.exth = None
        # Bit 0x40 of the flag word signals that an EXTH block follows the
        # header.
        if self.exth_flag & 0x40:
            self.exth = EXTHHeader(raw[16+self.length:], self.codec)
            self.exth.mi.uid = self.unique_id
class MobiReader(object):
    '''
    Read a non-DRMed MOBI/PRC file and convert it to an OEB ebook (an HTML
    file, an images directory and, when metadata is available, an OPF
    file).

    :param filename_or_stream: path to the file or an object with a
                               ``read`` method.

    Raises MobiError if the file is not of a known type.
    '''

    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    # Single source of truth for the names of extracted images; used both
    # when saving them and when referring to them from the HTML and OPF.
    _IMAGE_NAME = '%05d.jpg'

    def __init__(self, filename_or_stream):
        if hasattr(filename_or_stream, 'read'):
            filename_or_stream.seek(0)
            raw = filename_or_stream.read()
        else:
            # We opened the file, so we are responsible for closing it.
            stream = open(filename_or_stream, 'rb')
            try:
                raw = stream.read()
            finally:
                stream.close()

        self.header = raw[0:72]
        self.name = self.header[:32].replace('\x00', '')
        self.num_sections, = struct.unpack('>H', raw[76:78])

        self.ident = self.header[0x3C:0x3C+8].upper()
        if self.ident not in ['BOOKMOBI', 'TEXTREAD']:
            raise MobiError('Unknown book type: %s'%self.ident)

        self.sections = []
        self.section_headers = []
        for i in range(self.num_sections):
            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8])
            flags, val = a1, a2<<16 | a3<<8 | a4
            self.section_headers.append((offset, flags, val))

        def section(section_number):
            # A section runs up to the start of the next one; the last
            # section runs to the end of the file.
            if section_number == self.num_sections - 1:
                end_off = len(raw)
            else:
                end_off = self.section_headers[section_number + 1][0]
            off = self.section_headers[section_number][0]
            return raw[off:end_off]

        for i in range(self.num_sections):
            self.sections.append((section(i), self.section_headers[i]))

        self.book_header = BookHeader(self.sections[0][0], self.ident)

    def extract_content(self, output_dir=None):
        '''
        Write the book as ``<name>.html`` (plus ``images/`` and, if the file
        has an EXTH block, ``<name>.opf``) into `output_dir`.

        :param output_dir: target directory; defaults to the current
                           working directory at the time of the call.

        Raises MobiError for DRM protected books and unknown compression
        types.
        '''
        if output_dir is None:
            # Resolved per call: a default of os.getcwdu() in the signature
            # is evaluated only once, when the module is imported.
            output_dir = os.getcwdu()
        if self.book_header.encryption_type != 0:
            raise MobiError('Cannot extract content from DRM protected ebook')

        text_sections = [self.sections[i][0] for i in range(1, self.book_header.records+1)]
        processed_records = list(range(0, self.book_header.records+1))

        self.mobi_html = u''
        codec = self.book_header.codec

        if self.book_header.compression_type == 'DH':
            huff_start = self.book_header.huff_offset
            huff_stop = huff_start + self.book_header.huff_number
            huffs = [self.sections[i][0] for i in range(huff_start, huff_stop)]
            processed_records += list(range(huff_start, huff_stop))
            huff = HuffReader(huffs, self.book_header.extra_flags, codec)
            self.mobi_html = huff.decompress(text_sections)
        elif self.book_header.compression_type == '\x00\x02':
            self.mobi_html = u''.join([decompress_doc(s, codec) for s in text_sections])
        elif self.book_header.compression_type == '\x00\x01':
            t = [i.decode(codec) for i in text_sections]
            self.mobi_html = ''.join(t)
        else:
            raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))

        self.add_anchors()
        self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()

        self.processed_html = re.compile('<head>', re.IGNORECASE).sub(
            '<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n',
            self.processed_html)
        htmlfile = os.path.join(output_dir, self.name+'.html')
        out = open(htmlfile, 'wb')
        try:
            out.write(self.processed_html.encode('utf8'))
        finally:
            out.close()

        if self.book_header.exth is not None:
            mi = self.book_header.exth.mi
            opf = OPFCreator(mi)
            if hasattr(self.book_header.exth, 'cover_offset'):
                # Must use the same zero padded name the image was saved
                # under ('images/1.jpg' never exists).
                opf.cover = 'images/' + self._IMAGE_NAME % (self.book_header.exth.cover_offset+1)
            manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
            for i in self.image_names:
                # 'image/jpg' is not a registered MIME type
                manifest.append(('images/'+i, 'image/jpeg'))
            opf.create_manifest(manifest)
            opf.create_spine([os.path.basename(htmlfile)])
            out = open(os.path.splitext(htmlfile)[0]+'.opf', 'wb')
            try:
                opf.write(out)
            finally:
                out.close()

    def replace_page_breaks(self):
        '''Replace runs of <mbp:pagebreak> tags by a page breaking <br>.'''
        self.processed_html = self.PAGE_BREAK_PAT.sub('<br style="page-break-after:always" />',
                                                      self.processed_html)

    def add_anchors(self):
        '''
        Build ``self.processed_html`` from ``self.mobi_html``: insert an
        ``<a name="N" />`` anchor at every offset N that is the target of an
        ``<a filepos=N>`` link and turn those links into ``<a href="#N">``.
        '''
        link_pattern = re.compile(r'<a\s+filepos=(\d+)', re.IGNORECASE)
        positions = set()
        for match in link_pattern.finditer(self.mobi_html):
            positions.add(int(match.group(1)))
        pos = 0
        chunks = []
        # Targets must be visited in ascending order, each only once:
        # walking backwards made the text between two targets appear twice
        # in the output.
        for end in sorted(positions):
            oend = end
            l = self.mobi_html.find('<', end)
            r = self.mobi_html.find('>', end)
            if r > -1 and r < l: # Move out of tag
                end = r+1
            chunks.append(self.mobi_html[pos:end] + '<a name="%d" />'%oend)
            pos = end
        chunks.append(self.mobi_html[pos:])
        self.processed_html = ''.join(chunks)
        self.processed_html = link_pattern.sub(lambda match: '<a href="#%d"'%int(match.group(1)),
                                               self.processed_html)

    def extract_images(self, processed_records, output_dir):
        '''
        Save every not yet processed record that PIL can open as a JPEG in
        ``<output_dir>/images`` and point the <img> tags of
        ``self.processed_html`` at the saved files.

        :param processed_records: list of record indices to skip; the
                                  indices handled here are appended to it.

        The names of the saved files are stored in ``self.image_names``.
        '''
        output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        image_index = 0
        self.image_names = []
        # Indices appended below are never looked up again, so a snapshot
        # is enough for the membership test.
        skip = set(processed_records)
        for i in range(self.num_sections):
            if i in skip:
                continue
            processed_records.append(i)
            data = self.sections[i][0]
            buf = cStringIO.StringIO(data)
            try:
                im = PILImage.open(buf)
            except IOError:
                # Not an image record
                continue
            image_index += 1
            path = os.path.join(output_dir, self._IMAGE_NAME % image_index)
            self.image_names.append(os.path.basename(path))
            out = open(path, 'wb')
            try:
                im.convert('RGB').save(out, format='JPEG')
            finally:
                out.close()

        self._rewrite_image_tags()

    def _rewrite_image_tags(self):
        '''Replace the recindex attribute of <img> tags by a src attribute.'''
        src_pat = re.compile(r'src=["\']{0,1}[^\'"]+["\']{0,1}', re.IGNORECASE)
        img_pat = re.compile(r'<img(.+?)recindex=[\'"]{0,1}(\d+)[\'"]{0,1}',
                             re.IGNORECASE|re.DOTALL)

        def fix_images(match):
            attrs = src_pat.sub('', match.group(1)).strip()
            tag = '<img'
            if attrs:
                # The separating space was stripped above; without it the
                # result is '<imgwidth=...'.
                tag += ' ' + attrs
            # Normalise the index so that it matches the saved file name
            # even if it is not zero padded in the markup.
            name = self._IMAGE_NAME % int(match.group(2))
            return tag + ' src="images/%s"' % name

        self.processed_html = img_pat.sub(fix_images, self.processed_html)
def option_parser():
    '''
    Create the command line parser of mobi2oeb. It accepts a single
    option, -o/--output-dir, which defaults to '.'.
    '''
    from libprs500 import OptionParser
    usage = '%prog [options] myebook.mobi'
    help_text = 'Output directory. Defaults to current directory.'
    result = OptionParser(usage=usage)
    result.add_option('-o', '--output-dir', help=help_text, default='.')
    return result
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
return 1
mr = MobiReader(args[1])
opts.output_dir = os.path.abspath(opts.output_dir)
mr.extract_content(opts.output_dir)
print 'OEB ebook created in', opts.output_dir
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -98,6 +98,7 @@ def setup_completion(fatal_errors):
from libprs500.ebooks.lrf.parser import option_parser as lrf2lrsop
from libprs500.gui2.lrf_renderer.main import option_parser as lrfviewerop
from libprs500.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
from libprs500.ebooks.mobi.reader import option_parser as mobioeb
f = open_file('/etc/bash_completion.d/libprs500')
@ -120,6 +121,7 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('epub-meta', metaop, ['epub']))
f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
f.write('''
_prs500_ls()
{