Fix #2129 (.prc to MOBI causes TOC links to break)

This commit is contained in:
Kovid Goyal 2009-03-22 10:57:14 -07:00
parent 802385787a
commit af9f8a47a5
2 changed files with 90 additions and 88 deletions

View File

@ -27,7 +27,7 @@ from calibre.ebooks.metadata.toc import TOC
from calibre import sanitize_file_name from calibre import sanitize_file_name
class EXTHHeader(object): class EXTHHeader(object):
def __init__(self, raw, codec, title): def __init__(self, raw, codec, title):
self.doctype = raw[:4] self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12]) self.length, self.num_items = struct.unpack('>LL', raw[4:12])
@ -35,7 +35,7 @@ class EXTHHeader(object):
pos = 0 pos = 0
self.mi = MetaInformation(_('Unknown'), [_('Unknown')]) self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
self.has_fake_cover = True self.has_fake_cover = True
for i in range(self.num_items): for i in range(self.num_items):
id, size = struct.unpack('>LL', raw[pos:pos+8]) id, size = struct.unpack('>LL', raw[pos:pos+8])
content = raw[pos+8:pos+size] content = raw[pos+8:pos+size]
@ -43,18 +43,18 @@ class EXTHHeader(object):
if id >= 100 and id < 200: if id >= 100 and id < 200:
self.process_metadata(id, content, codec) self.process_metadata(id, content, codec)
elif id == 203: elif id == 203:
self.has_fake_cover = bool(struct.unpack('>L', content)[0]) self.has_fake_cover = bool(struct.unpack('>L', content)[0])
elif id == 201: elif id == 201:
co, = struct.unpack('>L', content) co, = struct.unpack('>L', content)
if co < 1e7: if co < 1e7:
self.cover_offset = co self.cover_offset = co
elif id == 202: elif id == 202:
self.thumbnail_offset, = struct.unpack('>L', content) self.thumbnail_offset, = struct.unpack('>L', content)
#else: #else:
# print 'unknown record', id, repr(content) # print 'unknown record', id, repr(content)
if title: if title:
self.mi.title = title self.mi.title = title
def process_metadata(self, id, content, codec): def process_metadata(self, id, content, codec):
if id == 100: if id == 100:
if self.mi.authors == [_('Unknown')]: if self.mi.authors == [_('Unknown')]:
@ -71,11 +71,11 @@ class EXTHHeader(object):
self.mi.tags = [] self.mi.tags = []
self.mi.tags.append(content.decode(codec, 'ignore')) self.mi.tags.append(content.decode(codec, 'ignore'))
#else: #else:
# print 'unhandled metadata record', id, repr(content), codec # print 'unhandled metadata record', id, repr(content), codec
class BookHeader(object): class BookHeader(object):
def __init__(self, raw, ident): def __init__(self, raw, ident):
self.compression_type = raw[:2] self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12]) self.records, self.records_size = struct.unpack('>HH', raw[8:12])
@ -96,8 +96,8 @@ class BookHeader(object):
self.doctype = raw[16:20] self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, self.version = \ self.length, self.type, self.codepage, self.unique_id, self.version = \
struct.unpack('>LLLLL', raw[20:40]) struct.unpack('>LLLLL', raw[20:40])
try: try:
self.codec = { self.codec = {
1252 : 'cp1252', 1252 : 'cp1252',
@ -106,15 +106,15 @@ class BookHeader(object):
except (IndexError, KeyError): except (IndexError, KeyError):
print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage
self.codec = 'cp1252' self.codec = 'cp1252'
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
self.extra_flags = 0 self.extra_flags = 0
else: else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4]) self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
if self.compression_type == 'DH': if self.compression_type == 'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78]) self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])
toff, tlen = struct.unpack('>II', raw[0x54:0x5c]) toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown') self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
@ -125,7 +125,7 @@ class BookHeader(object):
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0] self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0] self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84]) self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None self.exth = None
if not isinstance(self.title, unicode): if not isinstance(self.title, unicode):
@ -134,39 +134,39 @@ class BookHeader(object):
self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title) self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title)
self.exth.mi.uid = self.unique_id self.exth.mi.uid = self.unique_id
self.exth.mi.language = self.language self.exth.mi.language = self.language
class MobiReader(object): class MobiReader(object):
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, verbose=False): def __init__(self, filename_or_stream, verbose=False):
self.verbose = verbose self.verbose = verbose
self.embedded_mi = None self.embedded_mi = None
if hasattr(filename_or_stream, 'read'): if hasattr(filename_or_stream, 'read'):
stream = filename_or_stream stream = filename_or_stream
stream.seek(0) stream.seek(0)
else: else:
stream = open(filename_or_stream, 'rb') stream = open(filename_or_stream, 'rb')
raw = stream.read() raw = stream.read()
self.header = raw[0:72] self.header = raw[0:72]
self.name = self.header[:32].replace('\x00', '') self.name = self.header[:32].replace('\x00', '')
self.num_sections, = struct.unpack('>H', raw[76:78]) self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C+8].upper() self.ident = self.header[0x3C:0x3C+8].upper()
if self.ident not in ['BOOKMOBI', 'TEXTREAD']: if self.ident not in ['BOOKMOBI', 'TEXTREAD']:
raise MobiError('Unknown book type: %s'%self.ident) raise MobiError('Unknown book type: %s'%self.ident)
self.sections = [] self.sections = []
self.section_headers = [] self.section_headers = []
for i in range(self.num_sections): for i in range(self.num_sections):
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8]) offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8])
flags, val = a1, a2<<16 | a3<<8 | a4 flags, val = a1, a2<<16 | a3<<8 | a4
self.section_headers.append((offset, flags, val)) self.section_headers.append((offset, flags, val))
def section(section_number): def section(section_number):
if section_number == self.num_sections - 1: if section_number == self.num_sections - 1:
end_off = len(raw) end_off = len(raw)
@ -174,32 +174,32 @@ class MobiReader(object):
end_off = self.section_headers[section_number + 1][0] end_off = self.section_headers[section_number + 1][0]
off = self.section_headers[section_number][0] off = self.section_headers[section_number][0]
return raw[off:end_off] return raw[off:end_off]
for i in range(self.num_sections): for i in range(self.num_sections):
self.sections.append((section(i), self.section_headers[i])) self.sections.append((section(i), self.section_headers[i]))
self.book_header = BookHeader(self.sections[0][0], self.ident) self.book_header = BookHeader(self.sections[0][0], self.ident)
self.name = self.name.decode(self.book_header.codec, 'replace') self.name = self.name.decode(self.book_header.codec, 'replace')
def extract_content(self, output_dir=os.getcwdu()): def extract_content(self, output_dir=os.getcwdu()):
output_dir = os.path.abspath(output_dir) output_dir = os.path.abspath(output_dir)
if self.book_header.encryption_type != 0: if self.book_header.encryption_type != 0:
raise DRMError(self.name) raise DRMError(self.name)
processed_records = self.extract_text() processed_records = self.extract_text()
self.add_anchors() self.add_anchors()
self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore') self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
for pat in ENCODING_PATS: for pat in ENCODING_PATS:
self.processed_html = pat.sub('', self.processed_html) self.processed_html = pat.sub('', self.processed_html)
e2u = functools.partial(entity_to_unicode, e2u = functools.partial(entity_to_unicode,
exceptions=['lt', 'gt', 'amp', 'apos', 'quot']) exceptions=['lt', 'gt', 'amp', 'apos', 'quot'])
self.processed_html = re.sub(r'&(\S+?);', e2u, self.processed_html = re.sub(r'&(\S+?);', e2u,
self.processed_html) self.processed_html)
self.extract_images(processed_records, output_dir) self.extract_images(processed_records, output_dir)
self.replace_page_breaks() self.replace_page_breaks()
self.cleanup_html() self.cleanup_html()
if self.processed_html.startswith('<body'): if self.processed_html.startswith('<body'):
self.processed_html = '<html><head></head>'+self.processed_html+'</html>' self.processed_html = '<html><head></head>'+self.processed_html+'</html>'
self.processed_html = \ self.processed_html = \
@ -212,7 +212,7 @@ class MobiReader(object):
'.italic { font-style: italic; }\n' '.italic { font-style: italic; }\n'
'</style>\n', '</style>\n',
self.processed_html) self.processed_html)
if self.verbose: if self.verbose:
print 'Parsing HTML...' print 'Parsing HTML...'
root = html.fromstring(self.processed_html) root = html.fromstring(self.processed_html)
@ -224,7 +224,7 @@ class MobiReader(object):
self.read_embedded_metadata(root, metadata_elems[0], guide) self.read_embedded_metadata(root, metadata_elems[0], guide)
for elem in guides + metadata_elems: for elem in guides + metadata_elems:
elem.getparent().remove(elem) elem.getparent().remove(elem)
htmlfile = os.path.join(output_dir, htmlfile = os.path.join(output_dir,
sanitize_file_name(self.name)+'.html') sanitize_file_name(self.name)+'.html')
try: try:
for ref in guide.xpath('descendant::reference'): for ref in guide.xpath('descendant::reference'):
@ -235,13 +235,13 @@ class MobiReader(object):
if self.verbose: if self.verbose:
print 'Serializing...' print 'Serializing...'
with open(htmlfile, 'wb') as f: with open(htmlfile, 'wb') as f:
raw = html.tostring(root, encoding='utf-8', method='xml', raw = html.tostring(root, encoding='utf-8', method='xml',
include_meta_content_type=True, pretty_print=True) include_meta_content_type=True, pretty_print=True)
raw = raw.replace('<head>', raw = raw.replace('<head>',
'<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n') '<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n')
f.write(raw) f.write(raw)
self.htmlfile = htmlfile self.htmlfile = htmlfile
if self.book_header.exth is not None or self.embedded_mi is not None: if self.book_header.exth is not None or self.embedded_mi is not None:
if self.verbose: if self.verbose:
print 'Creating OPF...' print 'Creating OPF...'
@ -251,7 +251,7 @@ class MobiReader(object):
ncx = ncx.getvalue() ncx = ncx.getvalue()
if ncx: if ncx:
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
def read_embedded_metadata(self, root, elem, guide): def read_embedded_metadata(self, root, elem, guide):
raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>' raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
stream = cStringIO.StringIO(raw) stream = cStringIO.StringIO(raw)
@ -276,8 +276,8 @@ class MobiReader(object):
elem.getparent().remove(elem) elem.getparent().remove(elem)
break break
break break
def cleanup_html(self): def cleanup_html(self):
if self.verbose: if self.verbose:
print 'Cleaning up HTML...' print 'Cleaning up HTML...'
@ -286,7 +286,7 @@ class MobiReader(object):
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>' self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
self.processed_html = self.processed_html.replace('\r\n', '\n') self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<') self.processed_html = self.processed_html.replace('> <', '>\n<')
def upshift_markup(self, root): def upshift_markup(self, root):
if self.verbose: if self.verbose:
print 'Converting style information to CSS...' print 'Converting style information to CSS...'
@ -358,13 +358,15 @@ class MobiReader(object):
attrib['style'] = '; '.join(styles) attrib['style'] = '; '.join(styles)
if 'filepos-id' in attrib: if 'filepos-id' in attrib:
attrib['id'] = attrib.pop('filepos-id') attrib['id'] = attrib.pop('filepos-id')
if 'name' in attrib and attrib['name'] != attrib['id']:
attrib['name'] = attrib['id']
if 'filepos' in attrib: if 'filepos' in attrib:
filepos = attrib.pop('filepos') filepos = attrib.pop('filepos')
try: try:
attrib['href'] = "#filepos%d" % int(filepos) attrib['href'] = "#filepos%d" % int(filepos)
except ValueError: except ValueError:
pass pass
def create_opf(self, htmlfile, guide=None, root=None): def create_opf(self, htmlfile, guide=None, root=None):
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
opf = OPFCreator(os.path.dirname(htmlfile), mi) opf = OPFCreator(os.path.dirname(htmlfile), mi)
@ -374,14 +376,14 @@ class MobiReader(object):
opf.cover = mi.cover opf.cover = mi.cover
else: else:
opf.cover = 'images/%05d.jpg'%1 opf.cover = 'images/%05d.jpg'%1
if not os.path.exists(os.path.join(os.path.dirname(htmlfile), if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
*opf.cover.split('/'))): *opf.cover.split('/'))):
opf.cover = None opf.cover = None
manifest = [(htmlfile, 'text/x-oeb1-document')] manifest = [(htmlfile, 'text/x-oeb1-document')]
bp = os.path.dirname(htmlfile) bp = os.path.dirname(htmlfile)
for i in getattr(self, 'image_names', []): for i in getattr(self, 'image_names', []):
manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg')) manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))
opf.create_manifest(manifest) opf.create_manifest(manifest)
opf.create_spine([os.path.basename(htmlfile)]) opf.create_spine([os.path.basename(htmlfile)])
toc = None toc = None
@ -410,16 +412,16 @@ class MobiReader(object):
except: except:
text = '' text = ''
text = ent_pat.sub(entity_to_unicode, text) text = ent_pat.sub(entity_to_unicode, text)
tocobj.add_item(toc.partition('#')[0], href[1:], tocobj.add_item(toc.partition('#')[0], href[1:],
text) text)
if reached and x.get('class', None) == 'mbp_pagebreak': if reached and x.get('class', None) == 'mbp_pagebreak':
break break
if tocobj is not None: if tocobj is not None:
opf.set_toc(tocobj) opf.set_toc(tocobj)
return opf return opf
def sizeof_trailing_entries(self, data): def sizeof_trailing_entries(self, data):
def sizeof_trailing_entry(ptr, psize): def sizeof_trailing_entry(ptr, psize):
bitpos, result = 0, 0 bitpos, result = 0, 0
@ -430,7 +432,7 @@ class MobiReader(object):
psize -= 1 psize -= 1
if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0): if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
return result return result
num = 0 num = 0
size = len(data) size = len(data)
flags = self.book_header.extra_flags >> 1 flags = self.book_header.extra_flags >> 1
@ -446,28 +448,28 @@ class MobiReader(object):
data = self.sections[index][0] data = self.sections[index][0]
trail_size = self.sizeof_trailing_entries(data) trail_size = self.sizeof_trailing_entries(data)
return data[:len(data)-trail_size] return data[:len(data)-trail_size]
def extract_text(self): def extract_text(self):
if self.verbose: if self.verbose:
print 'Extracting text...' print 'Extracting text...'
text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
processed_records = list(range(0, self.book_header.records+1)) processed_records = list(range(0, self.book_header.records+1))
self.mobi_html = '' self.mobi_html = ''
if self.book_header.compression_type == 'DH': if self.book_header.compression_type == 'DH':
huffs = [self.sections[i][0] for i in huffs = [self.sections[i][0] for i in
range(self.book_header.huff_offset, range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number)] self.book_header.huff_offset+self.book_header.huff_number)]
processed_records += list(range(self.book_header.huff_offset, processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number)) self.book_header.huff_offset+self.book_header.huff_number))
huff = HuffReader(huffs) huff = HuffReader(huffs)
self.mobi_html = huff.decompress(text_sections) self.mobi_html = huff.decompress(text_sections)
elif self.book_header.compression_type == '\x00\x02': elif self.book_header.compression_type == '\x00\x02':
for section in text_sections: for section in text_sections:
self.mobi_html += decompress_doc(section) self.mobi_html += decompress_doc(section)
elif self.book_header.compression_type == '\x00\x01': elif self.book_header.compression_type == '\x00\x01':
self.mobi_html = ''.join(text_sections) self.mobi_html = ''.join(text_sections)
else: else:
@ -475,13 +477,13 @@ class MobiReader(object):
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower(): if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ') self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
return processed_records return processed_records
def replace_page_breaks(self):
    """Replace Mobipocket page-break tags in ``self.processed_html``.

    Every match of ``self.PAGE_BREAK_PAT`` is substituted with a single
    empty ``div`` of class ``mbp_pagebreak`` whose inline style requests a
    page break after it. The result is stored back on
    ``self.processed_html``; nothing is returned.
    """
    self.processed_html = self.PAGE_BREAK_PAT.sub(
        '<div class="mbp_pagebreak" style="page-break-after: always; margin: 0; display: block" />',
        self.processed_html)
def add_anchors(self): def add_anchors(self):
if self.verbose: if self.verbose:
print 'Adding anchors...' print 'Adding anchors...'
@ -511,8 +513,8 @@ class MobiReader(object):
self.processed_html += self.mobi_html[pos:end] + (anchor % oend) self.processed_html += self.mobi_html[pos:end] + (anchor % oend)
pos = end pos = end
self.processed_html += self.mobi_html[pos:] self.processed_html += self.mobi_html[pos:]
def extract_images(self, processed_records, output_dir): def extract_images(self, processed_records, output_dir):
if self.verbose: if self.verbose:
print 'Extracting images...' print 'Extracting images...'
@ -523,7 +525,7 @@ class MobiReader(object):
self.image_names = [] self.image_names = []
start = getattr(self.book_header, 'first_image_index', -1) start = getattr(self.book_header, 'first_image_index', -1)
if start > self.num_sections or start < 0: if start > self.num_sections or start < 0:
# BAEN PRC files have bad headers # BAEN PRC files have bad headers
start=0 start=0
for i in range(start, self.num_sections): for i in range(start, self.num_sections):
if i in processed_records: if i in processed_records:
@ -533,10 +535,10 @@ class MobiReader(object):
buf = cStringIO.StringIO(data) buf = cStringIO.StringIO(data)
image_index += 1 image_index += 1
try: try:
im = PILImage.open(buf) im = PILImage.open(buf)
except IOError: except IOError:
continue continue
path = os.path.join(output_dir, '%05d.jpg'%image_index) path = os.path.join(output_dir, '%05d.jpg'%image_index)
self.image_names.append(os.path.basename(path)) self.image_names.append(os.path.basename(path))
im.convert('RGB').save(open(path, 'wb'), format='JPEG') im.convert('RGB').save(open(path, 'wb'), format='JPEG')
@ -562,17 +564,17 @@ def get_metadata(stream):
import traceback import traceback
traceback.print_exc() traceback.print_exc()
return mi return mi
def option_parser():
    """Build the command-line option parser for the MOBI reader.

    The parser accepts:

    * ``-o`` / ``--output-dir``: output directory, default ``'.'``.
    * ``-v`` / ``--verbose``: boolean flag, default ``False``.

    Returns the configured ``OptionParser`` instance.
    """
    # Imported inside the function, exactly as the original code does.
    from calibre.utils.config import OptionParser
    parser = OptionParser(usage=_('%prog [options] myebook.mobi'))
    parser.add_option('-o', '--output-dir', default='.',
                      help=_('Output directory. Defaults to current directory.'))
    parser.add_option('-v', '--verbose', default=False, action='store_true',
                      help='Useful for debugging.')
    return parser
def main(args=sys.argv): def main(args=sys.argv):
parser = option_parser() parser = option_parser()
@ -580,7 +582,7 @@ def main(args=sys.argv):
if len(args) != 2: if len(args) != 2:
parser.print_help() parser.print_help()
return 1 return 1
mr = MobiReader(args[1], verbose=opts.verbose) mr = MobiReader(args[1], verbose=opts.verbose)
opts.output_dir = os.path.abspath(opts.output_dir) opts.output_dir = os.path.abspath(opts.output_dir)
mr.extract_content(opts.output_dir) mr.extract_content(opts.output_dir)
@ -591,9 +593,9 @@ def main(args=sys.argv):
dat = dat.encode('utf-8') dat = dat.encode('utf-8')
open(oname, 'wb').write(dat) open(oname, 'wb').write(dat)
print _('Raw MOBI HTML saved in'), oname print _('Raw MOBI HTML saved in'), oname
print _('OEB ebook created in'), opts.output_dir print _('OEB ebook created in'), opts.output_dir
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -132,7 +132,7 @@ def rescale_image(data, maxsizeb, dimen=None):
class Serializer(object): class Serializer(object):
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'} NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
def __init__(self, oeb, images): def __init__(self, oeb, images):
self.oeb = oeb self.oeb = oeb
self.images = images self.images = images
@ -192,7 +192,7 @@ class Serializer(object):
self.href_offsets[href].append(buffer.tell()) self.href_offsets[href].append(buffer.tell())
buffer.write('0000000000') buffer.write('0000000000')
return True return True
def serialize_body(self): def serialize_body(self):
buffer = self.buffer buffer = self.buffer
self.anchor_offset = buffer.tell() self.anchor_offset = buffer.tell()
@ -290,10 +290,10 @@ class Serializer(object):
buffer.seek(hoff) buffer.seek(hoff)
buffer.write('%010d' % ioff) buffer.write('%010d' % ioff)
class MobiWriter(object): class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, compression=None, imagemax=None, def __init__(self, compression=None, imagemax=None,
prefer_author_sort=False): prefer_author_sort=False):
self._compression = compression or UNCOMPRESSED self._compression = compression or UNCOMPRESSED
@ -305,11 +305,11 @@ class MobiWriter(object):
return self._dump_stream(oeb, path) return self._dump_stream(oeb, path)
with open(path, 'w+b') as stream: with open(path, 'w+b') as stream:
return self._dump_stream(oeb, stream) return self._dump_stream(oeb, stream)
def _write(self, *data):
    """Write each item of ``data``, in order, to ``self._stream``."""
    for datum in data:
        self._stream.write(datum)
def _tell(self):
    """Return the current position reported by ``self._stream.tell()``."""
    return self._stream.tell()
@ -364,7 +364,7 @@ class MobiWriter(object):
overlap = text.read(extra) overlap = text.read(extra)
text.seek(npos) text.seek(npos)
return data, overlap return data, overlap
def _generate_text(self): def _generate_text(self):
self._oeb.logger.info('Serializing markup content...') self._oeb.logger.info('Serializing markup content...')
serializer = Serializer(self._oeb, self._images) serializer = Serializer(self._oeb, self._images)
@ -405,7 +405,7 @@ class MobiWriter(object):
offset += RECORD_SIZE offset += RECORD_SIZE
data, overlap = self._read_text_record(text) data, overlap = self._read_text_record(text)
self._text_nrecords = nrecords self._text_nrecords = nrecords
def _generate_images(self): def _generate_images(self):
self._oeb.logger.info('Serializing images...') self._oeb.logger.info('Serializing images...')
images = [(index, href) for href, index in self._images.items()] images = [(index, href) for href, index in self._images.items()]
@ -418,7 +418,7 @@ class MobiWriter(object):
self._oeb.logger.warn('Bad image file %r' % item.href) self._oeb.logger.warn('Bad image file %r' % item.href)
continue continue
self._records.append(data) self._records.append(data)
def _generate_record0(self): def _generate_record0(self):
metadata = self._oeb.metadata metadata = self._oeb.metadata
exth = self._build_exth() exth = self._build_exth()
@ -510,7 +510,7 @@ class MobiWriter(object):
self._images[href] = index self._images[href] = index
self._records.append(data) self._records.append(data)
return index return index
def _write_header(self): def _write_header(self):
title = str(self._oeb.metadata.title[0]) title = str(self._oeb.metadata.title[0])
title = re.sub('[^-A-Za-z0-9]+', '_', title)[:32] title = re.sub('[^-A-Za-z0-9]+', '_', title)[:32]
@ -537,14 +537,14 @@ def config(defaults=None):
c = Config('mobi', desc) c = Config('mobi', desc)
else: else:
c = StringConfig(defaults, desc) c = StringConfig(defaults, desc)
mobi = c.add_group('mobipocket', _('Mobipocket-specific options.')) mobi = c.add_group('mobipocket', _('Mobipocket-specific options.'))
mobi('compress', ['--compress'], default=False, mobi('compress', ['--compress'], default=False,
help=_('Compress file text using PalmDOC compression. ' help=_('Compress file text using PalmDOC compression. '
'Results in smaller files, but takes a long time to run.')) 'Results in smaller files, but takes a long time to run.'))
mobi('rescale_images', ['--rescale-images'], default=False, mobi('rescale_images', ['--rescale-images'], default=False,
help=_('Modify images to meet Palm device size limitations.')) help=_('Modify images to meet Palm device size limitations.'))
mobi('toc_title', ['--toc-title'], default=None, mobi('toc_title', ['--toc-title'], default=None,
help=_('Title for any generated in-line table of contents.')) help=_('Title for any generated in-line table of contents.'))
mobi('ignore_tables', ['--ignore-tables'], default=False, mobi('ignore_tables', ['--ignore-tables'], default=False,
help=_('Render HTML tables as blocks of text instead of actual ' help=_('Render HTML tables as blocks of text instead of actual '
@ -565,13 +565,13 @@ def config(defaults=None):
c.add_opt('encoding', ['--encoding'], default=None, c.add_opt('encoding', ['--encoding'], default=None,
help=_('Character encoding for HTML files. Default is to auto detect.')) help=_('Character encoding for HTML files. Default is to auto detect.'))
return c return c
def option_parser(): def option_parser():
c = config() c = config()
parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
parser.add_option( parser.add_option(
'-o', '--output', default=None, '-o', '--output', default=None,
help=_('Output file. Default is derived from input filename.')) help=_('Output file. Default is derived from input filename.'))
parser.add_option( parser.add_option(
'-v', '--verbose', default=0, action='count', '-v', '--verbose', default=0, action='count',
@ -617,7 +617,7 @@ def oeb2mobi(opts, inpath):
writer.dump(oeb, outpath) writer.dump(oeb, outpath)
run_plugins_on_postprocess(outpath, 'mobi') run_plugins_on_postprocess(outpath, 'mobi')
logger.info(_('Output written to ') + outpath) logger.info(_('Output written to ') + outpath)
def main(argv=sys.argv): def main(argv=sys.argv):
parser = option_parser() parser = option_parser()
opts, args = parser.parse_args(argv[1:]) opts, args = parser.parse_args(argv[1:])