Fix #2129 (.prc to MOBI causes TOC links to break)

This commit is contained in:
Kovid Goyal 2009-03-22 10:57:14 -07:00
parent 802385787a
commit af9f8a47a5
2 changed files with 90 additions and 88 deletions

View File

@ -27,7 +27,7 @@ from calibre.ebooks.metadata.toc import TOC
from calibre import sanitize_file_name
class EXTHHeader(object):
def __init__(self, raw, codec, title):
self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12])
@ -35,7 +35,7 @@ class EXTHHeader(object):
pos = 0
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
self.has_fake_cover = True
for i in range(self.num_items):
id, size = struct.unpack('>LL', raw[pos:pos+8])
content = raw[pos+8:pos+size]
@ -43,18 +43,18 @@ class EXTHHeader(object):
if id >= 100 and id < 200:
self.process_metadata(id, content, codec)
elif id == 203:
self.has_fake_cover = bool(struct.unpack('>L', content)[0])
self.has_fake_cover = bool(struct.unpack('>L', content)[0])
elif id == 201:
co, = struct.unpack('>L', content)
if co < 1e7:
self.cover_offset = co
self.cover_offset = co
elif id == 202:
self.thumbnail_offset, = struct.unpack('>L', content)
#else:
# print 'unknown record', id, repr(content)
if title:
self.mi.title = title
def process_metadata(self, id, content, codec):
if id == 100:
if self.mi.authors == [_('Unknown')]:
@ -71,11 +71,11 @@ class EXTHHeader(object):
self.mi.tags = []
self.mi.tags.append(content.decode(codec, 'ignore'))
#else:
# print 'unhandled metadata record', id, repr(content), codec
# print 'unhandled metadata record', id, repr(content), codec
class BookHeader(object):
def __init__(self, raw, ident):
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
@ -96,8 +96,8 @@ class BookHeader(object):
self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, self.version = \
struct.unpack('>LLLLL', raw[20:40])
try:
self.codec = {
1252 : 'cp1252',
@ -106,15 +106,15 @@ class BookHeader(object):
except (IndexError, KeyError):
print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage
self.codec = 'cp1252'
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
self.extra_flags = 0
else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
if self.compression_type == 'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])
self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
@ -125,7 +125,7 @@ class BookHeader(object):
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if not isinstance(self.title, unicode):
@ -134,39 +134,39 @@ class BookHeader(object):
self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title)
self.exth.mi.uid = self.unique_id
self.exth.mi.language = self.language
class MobiReader(object):
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, verbose=False):
self.verbose = verbose
self.embedded_mi = None
if hasattr(filename_or_stream, 'read'):
stream = filename_or_stream
stream.seek(0)
else:
stream = open(filename_or_stream, 'rb')
raw = stream.read()
self.header = raw[0:72]
self.name = self.header[:32].replace('\x00', '')
self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C+8].upper()
if self.ident not in ['BOOKMOBI', 'TEXTREAD']:
raise MobiError('Unknown book type: %s'%self.ident)
raise MobiError('Unknown book type: %s'%self.ident)
self.sections = []
self.section_headers = []
for i in range(self.num_sections):
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8])
flags, val = a1, a2<<16 | a3<<8 | a4
self.section_headers.append((offset, flags, val))
def section(section_number):
if section_number == self.num_sections - 1:
end_off = len(raw)
@ -174,32 +174,32 @@ class MobiReader(object):
end_off = self.section_headers[section_number + 1][0]
off = self.section_headers[section_number][0]
return raw[off:end_off]
for i in range(self.num_sections):
self.sections.append((section(i), self.section_headers[i]))
self.sections.append((section(i), self.section_headers[i]))
self.book_header = BookHeader(self.sections[0][0], self.ident)
self.name = self.name.decode(self.book_header.codec, 'replace')
def extract_content(self, output_dir=os.getcwdu()):
output_dir = os.path.abspath(output_dir)
if self.book_header.encryption_type != 0:
raise DRMError(self.name)
processed_records = self.extract_text()
self.add_anchors()
self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
for pat in ENCODING_PATS:
self.processed_html = pat.sub('', self.processed_html)
e2u = functools.partial(entity_to_unicode,
e2u = functools.partial(entity_to_unicode,
exceptions=['lt', 'gt', 'amp', 'apos', 'quot'])
self.processed_html = re.sub(r'&(\S+?);', e2u,
self.processed_html)
self.extract_images(processed_records, output_dir)
self.replace_page_breaks()
self.cleanup_html()
if self.processed_html.startswith('<body'):
self.processed_html = '<html><head></head>'+self.processed_html+'</html>'
self.processed_html = \
@ -212,7 +212,7 @@ class MobiReader(object):
'.italic { font-style: italic; }\n'
'</style>\n',
self.processed_html)
if self.verbose:
print 'Parsing HTML...'
root = html.fromstring(self.processed_html)
@ -224,7 +224,7 @@ class MobiReader(object):
self.read_embedded_metadata(root, metadata_elems[0], guide)
for elem in guides + metadata_elems:
elem.getparent().remove(elem)
htmlfile = os.path.join(output_dir,
htmlfile = os.path.join(output_dir,
sanitize_file_name(self.name)+'.html')
try:
for ref in guide.xpath('descendant::reference'):
@ -235,13 +235,13 @@ class MobiReader(object):
if self.verbose:
print 'Serializing...'
with open(htmlfile, 'wb') as f:
raw = html.tostring(root, encoding='utf-8', method='xml',
raw = html.tostring(root, encoding='utf-8', method='xml',
include_meta_content_type=True, pretty_print=True)
raw = raw.replace('<head>',
raw = raw.replace('<head>',
'<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n')
f.write(raw)
self.htmlfile = htmlfile
if self.book_header.exth is not None or self.embedded_mi is not None:
if self.verbose:
print 'Creating OPF...'
@ -251,7 +251,7 @@ class MobiReader(object):
ncx = ncx.getvalue()
if ncx:
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
def read_embedded_metadata(self, root, elem, guide):
raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
stream = cStringIO.StringIO(raw)
@ -276,8 +276,8 @@ class MobiReader(object):
elem.getparent().remove(elem)
break
break
def cleanup_html(self):
if self.verbose:
print 'Cleaning up HTML...'
@ -286,7 +286,7 @@ class MobiReader(object):
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<')
def upshift_markup(self, root):
if self.verbose:
print 'Converting style information to CSS...'
@ -358,13 +358,15 @@ class MobiReader(object):
attrib['style'] = '; '.join(styles)
if 'filepos-id' in attrib:
attrib['id'] = attrib.pop('filepos-id')
if 'name' in attrib and attrib['name'] != attrib['id']:
attrib['name'] = attrib['id']
if 'filepos' in attrib:
filepos = attrib.pop('filepos')
try:
attrib['href'] = "#filepos%d" % int(filepos)
except ValueError:
pass
def create_opf(self, htmlfile, guide=None, root=None):
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
opf = OPFCreator(os.path.dirname(htmlfile), mi)
@ -374,14 +376,14 @@ class MobiReader(object):
opf.cover = mi.cover
else:
opf.cover = 'images/%05d.jpg'%1
if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
*opf.cover.split('/'))):
opf.cover = None
manifest = [(htmlfile, 'text/x-oeb1-document')]
bp = os.path.dirname(htmlfile)
for i in getattr(self, 'image_names', []):
manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))
opf.create_manifest(manifest)
opf.create_spine([os.path.basename(htmlfile)])
toc = None
@ -410,16 +412,16 @@ class MobiReader(object):
except:
text = ''
text = ent_pat.sub(entity_to_unicode, text)
tocobj.add_item(toc.partition('#')[0], href[1:],
tocobj.add_item(toc.partition('#')[0], href[1:],
text)
if reached and x.get('class', None) == 'mbp_pagebreak':
break
if tocobj is not None:
opf.set_toc(tocobj)
return opf
def sizeof_trailing_entries(self, data):
def sizeof_trailing_entry(ptr, psize):
bitpos, result = 0, 0
@ -430,7 +432,7 @@ class MobiReader(object):
psize -= 1
if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
return result
num = 0
size = len(data)
flags = self.book_header.extra_flags >> 1
@ -446,28 +448,28 @@ class MobiReader(object):
data = self.sections[index][0]
trail_size = self.sizeof_trailing_entries(data)
return data[:len(data)-trail_size]
def extract_text(self):
if self.verbose:
print 'Extracting text...'
text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
processed_records = list(range(0, self.book_header.records+1))
self.mobi_html = ''
if self.book_header.compression_type == 'DH':
huffs = [self.sections[i][0] for i in
range(self.book_header.huff_offset,
huffs = [self.sections[i][0] for i in
range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number)]
processed_records += list(range(self.book_header.huff_offset,
processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number))
huff = HuffReader(huffs)
self.mobi_html = huff.decompress(text_sections)
elif self.book_header.compression_type == '\x00\x02':
for section in text_sections:
self.mobi_html += decompress_doc(section)
elif self.book_header.compression_type == '\x00\x01':
self.mobi_html = ''.join(text_sections)
else:
@ -475,13 +477,13 @@ class MobiReader(object):
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
return processed_records
def replace_page_breaks(self):
self.processed_html = self.PAGE_BREAK_PAT.sub(
'<div class="mbp_pagebreak" style="page-break-after: always; margin: 0; display: block" />',
self.processed_html)
def add_anchors(self):
if self.verbose:
print 'Adding anchors...'
@ -511,8 +513,8 @@ class MobiReader(object):
self.processed_html += self.mobi_html[pos:end] + (anchor % oend)
pos = end
self.processed_html += self.mobi_html[pos:]
def extract_images(self, processed_records, output_dir):
if self.verbose:
print 'Extracting images...'
@ -523,7 +525,7 @@ class MobiReader(object):
self.image_names = []
start = getattr(self.book_header, 'first_image_index', -1)
if start > self.num_sections or start < 0:
# BAEN PRC files have bad headers
# BAEN PRC files have bad headers
start=0
for i in range(start, self.num_sections):
if i in processed_records:
@ -533,10 +535,10 @@ class MobiReader(object):
buf = cStringIO.StringIO(data)
image_index += 1
try:
im = PILImage.open(buf)
im = PILImage.open(buf)
except IOError:
continue
path = os.path.join(output_dir, '%05d.jpg'%image_index)
self.image_names.append(os.path.basename(path))
im.convert('RGB').save(open(path, 'wb'), format='JPEG')
@ -562,17 +564,17 @@ def get_metadata(stream):
import traceback
traceback.print_exc()
return mi
def option_parser():
from calibre.utils.config import OptionParser
parser = OptionParser(usage=_('%prog [options] myebook.mobi'))
parser.add_option('-o', '--output-dir', default='.',
parser.add_option('-o', '--output-dir', default='.',
help=_('Output directory. Defaults to current directory.'))
parser.add_option('-v', '--verbose', default=False, action='store_true',
help='Useful for debugging.')
return parser
def main(args=sys.argv):
parser = option_parser()
@ -580,7 +582,7 @@ def main(args=sys.argv):
if len(args) != 2:
parser.print_help()
return 1
mr = MobiReader(args[1], verbose=opts.verbose)
opts.output_dir = os.path.abspath(opts.output_dir)
mr.extract_content(opts.output_dir)
@ -591,9 +593,9 @@ def main(args=sys.argv):
dat = dat.encode('utf-8')
open(oname, 'wb').write(dat)
print _('Raw MOBI HTML saved in'), oname
print _('OEB ebook created in'), opts.output_dir
return 0
if __name__ == '__main__':

View File

@ -132,7 +132,7 @@ def rescale_image(data, maxsizeb, dimen=None):
class Serializer(object):
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
def __init__(self, oeb, images):
self.oeb = oeb
self.images = images
@ -192,7 +192,7 @@ class Serializer(object):
self.href_offsets[href].append(buffer.tell())
buffer.write('0000000000')
return True
def serialize_body(self):
buffer = self.buffer
self.anchor_offset = buffer.tell()
@ -290,10 +290,10 @@ class Serializer(object):
buffer.seek(hoff)
buffer.write('%010d' % ioff)
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, compression=None, imagemax=None,
prefer_author_sort=False):
self._compression = compression or UNCOMPRESSED
@ -305,11 +305,11 @@ class MobiWriter(object):
return self._dump_stream(oeb, path)
with open(path, 'w+b') as stream:
return self._dump_stream(oeb, stream)
def _write(self, *data):
for datum in data:
self._stream.write(datum)
def _tell(self):
return self._stream.tell()
@ -364,7 +364,7 @@ class MobiWriter(object):
overlap = text.read(extra)
text.seek(npos)
return data, overlap
def _generate_text(self):
self._oeb.logger.info('Serializing markup content...')
serializer = Serializer(self._oeb, self._images)
@ -405,7 +405,7 @@ class MobiWriter(object):
offset += RECORD_SIZE
data, overlap = self._read_text_record(text)
self._text_nrecords = nrecords
def _generate_images(self):
self._oeb.logger.info('Serializing images...')
images = [(index, href) for href, index in self._images.items()]
@ -418,7 +418,7 @@ class MobiWriter(object):
self._oeb.logger.warn('Bad image file %r' % item.href)
continue
self._records.append(data)
def _generate_record0(self):
metadata = self._oeb.metadata
exth = self._build_exth()
@ -510,7 +510,7 @@ class MobiWriter(object):
self._images[href] = index
self._records.append(data)
return index
def _write_header(self):
title = str(self._oeb.metadata.title[0])
title = re.sub('[^-A-Za-z0-9]+', '_', title)[:32]
@ -537,14 +537,14 @@ def config(defaults=None):
c = Config('mobi', desc)
else:
c = StringConfig(defaults, desc)
mobi = c.add_group('mobipocket', _('Mobipocket-specific options.'))
mobi('compress', ['--compress'], default=False,
help=_('Compress file text using PalmDOC compression. '
'Results in smaller files, but takes a long time to run.'))
mobi('rescale_images', ['--rescale-images'], default=False,
mobi('rescale_images', ['--rescale-images'], default=False,
help=_('Modify images to meet Palm device size limitations.'))
mobi('toc_title', ['--toc-title'], default=None,
mobi('toc_title', ['--toc-title'], default=None,
help=_('Title for any generated in-line table of contents.'))
mobi('ignore_tables', ['--ignore-tables'], default=False,
help=_('Render HTML tables as blocks of text instead of actual '
@ -565,13 +565,13 @@ def config(defaults=None):
c.add_opt('encoding', ['--encoding'], default=None,
help=_('Character encoding for HTML files. Default is to auto detect.'))
return c
def option_parser():
c = config()
parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
parser.add_option(
'-o', '--output', default=None,
'-o', '--output', default=None,
help=_('Output file. Default is derived from input filename.'))
parser.add_option(
'-v', '--verbose', default=0, action='count',
@ -617,7 +617,7 @@ def oeb2mobi(opts, inpath):
writer.dump(oeb, outpath)
run_plugins_on_postprocess(outpath, 'mobi')
logger.info(_('Output written to ') + outpath)
def main(argv=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(argv[1:])