Fix #2129 (.prc to MOBI causes TOC links to break)
This commit is contained in:
parent 802385787a
commit af9f8a47a5
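The substantive change visible in the hunks below is in MobiReader.upshift_markup (the @@ -358,13 +358,15 @@ hunk): when an element's MOBI-specific filepos-id attribute is converted into a standard id, any name attribute on the same element is now rewritten to match that id, so the #fileposNNNN hrefs generated for TOC entries resolve again. A minimal sketch of that normalization, assuming attrib is an lxml element's attribute mapping (the helper name and surrounding loop are illustrative, not part of the commit):

# Illustrative sketch only; `elements` and `normalize_filepos_anchors` are
# assumed names. The attribute handling mirrors upshift_markup in the diff.
def normalize_filepos_anchors(elements):
    for el in elements:
        attrib = el.attrib
        if 'filepos-id' in attrib:
            # The MOBI-specific filepos-id becomes a standard HTML id...
            attrib['id'] = attrib.pop('filepos-id')
            # ...and a stale name attribute is kept in sync with it, so that
            # <a href="#filepos12345"> links in the generated TOC still resolve.
            if 'name' in attrib and attrib['name'] != attrib['id']:
                attrib['name'] = attrib['id']
        if 'filepos' in attrib:
            # Internal links carry a byte offset; rewrite them as anchors.
            filepos = attrib.pop('filepos')
            try:
                attrib['href'] = "#filepos%d" % int(filepos)
            except ValueError:
                pass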
@@ -27,7 +27,7 @@ from calibre.ebooks.metadata.toc import TOC
from calibre import sanitize_file_name

class EXTHHeader(object):

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
@@ -35,7 +35,7 @@ class EXTHHeader(object):
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True

        for i in range(self.num_items):
            id, size = struct.unpack('>LL', raw[pos:pos+8])
            content = raw[pos+8:pos+size]
@@ -43,18 +43,18 @@ class EXTHHeader(object):
            if id >= 100 and id < 200:
                self.process_metadata(id, content, codec)
            elif id == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif id == 201:
                co, = struct.unpack('>L', content)
                if co < 1e7:
                    self.cover_offset = co
            elif id == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            #else:
            #    print 'unknown record', id, repr(content)
        if title:
            self.mi.title = title

    def process_metadata(self, id, content, codec):
        if id == 100:
            if self.mi.authors == [_('Unknown')]:
@@ -71,11 +71,11 @@ class EXTHHeader(object):
                self.mi.tags = []
            self.mi.tags.append(content.decode(codec, 'ignore'))
        #else:
        #    print 'unhandled metadata record', id, repr(content), codec


class BookHeader(object):

    def __init__(self, raw, ident):
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
@@ -96,8 +96,8 @@ class BookHeader(object):
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, self.version = \
                struct.unpack('>LLLLL', raw[20:40])


            try:
                self.codec = {
                    1252 : 'cp1252',
@@ -106,15 +106,15 @@ class BookHeader(object):
            except (IndexError, KeyError):
                print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage
                self.codec = 'cp1252'

            if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == 'DH':
                self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])

            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
@@ -125,7 +125,7 @@ class BookHeader(object):
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
@@ -134,39 +134,39 @@ class BookHeader(object):
                self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title)
                self.exth.mi.uid = self.unique_id
                self.exth.mi.language = self.language


class MobiReader(object):
    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    def __init__(self, filename_or_stream, verbose=False):
        self.verbose = verbose
        self.embedded_mi = None

        if hasattr(filename_or_stream, 'read'):
            stream = filename_or_stream
            stream.seek(0)
        else:
            stream = open(filename_or_stream, 'rb')

        raw = stream.read()

        self.header = raw[0:72]
        self.name = self.header[:32].replace('\x00', '')
        self.num_sections, = struct.unpack('>H', raw[76:78])

        self.ident = self.header[0x3C:0x3C+8].upper()
        if self.ident not in ['BOOKMOBI', 'TEXTREAD']:
            raise MobiError('Unknown book type: %s'%self.ident)

        self.sections = []
        self.section_headers = []
        for i in range(self.num_sections):
            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8])
            flags, val = a1, a2<<16 | a3<<8 | a4
            self.section_headers.append((offset, flags, val))

        def section(section_number):
            if section_number == self.num_sections - 1:
                end_off = len(raw)
@@ -174,32 +174,32 @@ class MobiReader(object):
                end_off = self.section_headers[section_number + 1][0]
            off = self.section_headers[section_number][0]
            return raw[off:end_off]

        for i in range(self.num_sections):
            self.sections.append((section(i), self.section_headers[i]))


        self.book_header = BookHeader(self.sections[0][0], self.ident)
        self.name = self.name.decode(self.book_header.codec, 'replace')

    def extract_content(self, output_dir=os.getcwdu()):
        output_dir = os.path.abspath(output_dir)
        if self.book_header.encryption_type != 0:
            raise DRMError(self.name)

        processed_records = self.extract_text()
        self.add_anchors()
        self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
        for pat in ENCODING_PATS:
            self.processed_html = pat.sub('', self.processed_html)
        e2u = functools.partial(entity_to_unicode,
            exceptions=['lt', 'gt', 'amp', 'apos', 'quot'])
        self.processed_html = re.sub(r'&(\S+?);', e2u,
            self.processed_html)
        self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()

        if self.processed_html.startswith('<body'):
            self.processed_html = '<html><head></head>'+self.processed_html+'</html>'
        self.processed_html = \
@@ -212,7 +212,7 @@ class MobiReader(object):
            '.italic { font-style: italic; }\n'
            '</style>\n',
            self.processed_html)

        if self.verbose:
            print 'Parsing HTML...'
        root = html.fromstring(self.processed_html)
@@ -224,7 +224,7 @@ class MobiReader(object):
            self.read_embedded_metadata(root, metadata_elems[0], guide)
        for elem in guides + metadata_elems:
            elem.getparent().remove(elem)
        htmlfile = os.path.join(output_dir,
                                sanitize_file_name(self.name)+'.html')
        try:
            for ref in guide.xpath('descendant::reference'):
@@ -235,13 +235,13 @@ class MobiReader(object):
        if self.verbose:
            print 'Serializing...'
        with open(htmlfile, 'wb') as f:
            raw = html.tostring(root, encoding='utf-8', method='xml',
                                include_meta_content_type=True, pretty_print=True)
            raw = raw.replace('<head>',
                '<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n')
            f.write(raw)
        self.htmlfile = htmlfile

        if self.book_header.exth is not None or self.embedded_mi is not None:
            if self.verbose:
                print 'Creating OPF...'
@@ -251,7 +251,7 @@ class MobiReader(object):
            ncx = ncx.getvalue()
            if ncx:
                open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)

    def read_embedded_metadata(self, root, elem, guide):
        raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
        stream = cStringIO.StringIO(raw)
@@ -276,8 +276,8 @@ class MobiReader(object):
                    elem.getparent().remove(elem)
                    break
            break


    def cleanup_html(self):
        if self.verbose:
            print 'Cleaning up HTML...'
@@ -286,7 +286,7 @@ class MobiReader(object):
            self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
        self.processed_html = self.processed_html.replace('\r\n', '\n')
        self.processed_html = self.processed_html.replace('> <', '>\n<')

    def upshift_markup(self, root):
        if self.verbose:
            print 'Converting style information to CSS...'
@@ -358,13 +358,15 @@ class MobiReader(object):
                attrib['style'] = '; '.join(styles)
            if 'filepos-id' in attrib:
                attrib['id'] = attrib.pop('filepos-id')
+               if 'name' in attrib and attrib['name'] != attrib['id']:
+                   attrib['name'] = attrib['id']
            if 'filepos' in attrib:
                filepos = attrib.pop('filepos')
                try:
                    attrib['href'] = "#filepos%d" % int(filepos)
                except ValueError:
                    pass

    def create_opf(self, htmlfile, guide=None, root=None):
        mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
@@ -374,14 +376,14 @@ class MobiReader(object):
            opf.cover = mi.cover
        else:
            opf.cover = 'images/%05d.jpg'%1
            if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
                *opf.cover.split('/'))):
                opf.cover = None
        manifest = [(htmlfile, 'text/x-oeb1-document')]
        bp = os.path.dirname(htmlfile)
        for i in getattr(self, 'image_names', []):
            manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))

        opf.create_manifest(manifest)
        opf.create_spine([os.path.basename(htmlfile)])
        toc = None
@@ -410,16 +412,16 @@ class MobiReader(object):
                except:
                    text = ''
                text = ent_pat.sub(entity_to_unicode, text)
                tocobj.add_item(toc.partition('#')[0], href[1:],
                                text)
            if reached and x.get('class', None) == 'mbp_pagebreak':
                break
        if tocobj is not None:
            opf.set_toc(tocobj)

        return opf


    def sizeof_trailing_entries(self, data):
        def sizeof_trailing_entry(ptr, psize):
            bitpos, result = 0, 0
@@ -430,7 +432,7 @@ class MobiReader(object):
                psize -= 1
                if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
                    return result

        num = 0
        size = len(data)
        flags = self.book_header.extra_flags >> 1
@@ -446,28 +448,28 @@ class MobiReader(object):
        data = self.sections[index][0]
        trail_size = self.sizeof_trailing_entries(data)
        return data[:len(data)-trail_size]

    def extract_text(self):
        if self.verbose:
            print 'Extracting text...'
        text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
        processed_records = list(range(0, self.book_header.records+1))

        self.mobi_html = ''

        if self.book_header.compression_type == 'DH':
            huffs = [self.sections[i][0] for i in
                range(self.book_header.huff_offset,
                      self.book_header.huff_offset+self.book_header.huff_number)]
            processed_records += list(range(self.book_header.huff_offset,
                self.book_header.huff_offset+self.book_header.huff_number))
            huff = HuffReader(huffs)
            self.mobi_html = huff.decompress(text_sections)

        elif self.book_header.compression_type == '\x00\x02':
            for section in text_sections:
                self.mobi_html += decompress_doc(section)

        elif self.book_header.compression_type == '\x00\x01':
            self.mobi_html = ''.join(text_sections)
        else:
@@ -475,13 +477,13 @@ class MobiReader(object):
        if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
            self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
        return processed_records


    def replace_page_breaks(self):
        self.processed_html = self.PAGE_BREAK_PAT.sub(
            '<div class="mbp_pagebreak" style="page-break-after: always; margin: 0; display: block" />',
            self.processed_html)

    def add_anchors(self):
        if self.verbose:
            print 'Adding anchors...'
@@ -511,8 +513,8 @@ class MobiReader(object):
            self.processed_html += self.mobi_html[pos:end] + (anchor % oend)
            pos = end
        self.processed_html += self.mobi_html[pos:]


    def extract_images(self, processed_records, output_dir):
        if self.verbose:
            print 'Extracting images...'
@@ -523,7 +525,7 @@ class MobiReader(object):
        self.image_names = []
        start = getattr(self.book_header, 'first_image_index', -1)
        if start > self.num_sections or start < 0:
            # BAEN PRC files have bad headers
            start=0
        for i in range(start, self.num_sections):
            if i in processed_records:
@@ -533,10 +535,10 @@ class MobiReader(object):
            buf = cStringIO.StringIO(data)
            image_index += 1
            try:
                im = PILImage.open(buf)
            except IOError:
                continue

            path = os.path.join(output_dir, '%05d.jpg'%image_index)
            self.image_names.append(os.path.basename(path))
            im.convert('RGB').save(open(path, 'wb'), format='JPEG')
@@ -562,17 +564,17 @@ def get_metadata(stream):
        import traceback
        traceback.print_exc()
    return mi


def option_parser():
    from calibre.utils.config import OptionParser
    parser = OptionParser(usage=_('%prog [options] myebook.mobi'))
    parser.add_option('-o', '--output-dir', default='.',
                      help=_('Output directory. Defaults to current directory.'))
    parser.add_option('-v', '--verbose', default=False, action='store_true',
                      help='Useful for debugging.')
    return parser


def main(args=sys.argv):
    parser = option_parser()
@@ -580,7 +582,7 @@ def main(args=sys.argv):
    if len(args) != 2:
        parser.print_help()
        return 1

    mr = MobiReader(args[1], verbose=opts.verbose)
    opts.output_dir = os.path.abspath(opts.output_dir)
    mr.extract_content(opts.output_dir)
@@ -591,9 +593,9 @@ def main(args=sys.argv):
        dat = dat.encode('utf-8')
        open(oname, 'wb').write(dat)
        print _('Raw MOBI HTML saved in'), oname

    print _('OEB ebook created in'), opts.output_dir

    return 0

if __name__ == '__main__':
@@ -132,7 +132,7 @@ def rescale_image(data, maxsizeb, dimen=None):

class Serializer(object):
    NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}

    def __init__(self, oeb, images):
        self.oeb = oeb
        self.images = images
@@ -192,7 +192,7 @@ class Serializer(object):
            self.href_offsets[href].append(buffer.tell())
            buffer.write('0000000000')
        return True

    def serialize_body(self):
        buffer = self.buffer
        self.anchor_offset = buffer.tell()
@@ -290,10 +290,10 @@ class Serializer(object):
            buffer.seek(hoff)
            buffer.write('%010d' % ioff)


class MobiWriter(object):
    COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')

    def __init__(self, compression=None, imagemax=None,
                 prefer_author_sort=False):
        self._compression = compression or UNCOMPRESSED
@@ -305,11 +305,11 @@ class MobiWriter(object):
            return self._dump_stream(oeb, path)
        with open(path, 'w+b') as stream:
            return self._dump_stream(oeb, stream)

    def _write(self, *data):
        for datum in data:
            self._stream.write(datum)

    def _tell(self):
        return self._stream.tell()

@@ -364,7 +364,7 @@ class MobiWriter(object):
        overlap = text.read(extra)
        text.seek(npos)
        return data, overlap

    def _generate_text(self):
        self._oeb.logger.info('Serializing markup content...')
        serializer = Serializer(self._oeb, self._images)
@@ -405,7 +405,7 @@ class MobiWriter(object):
            offset += RECORD_SIZE
            data, overlap = self._read_text_record(text)
        self._text_nrecords = nrecords

    def _generate_images(self):
        self._oeb.logger.info('Serializing images...')
        images = [(index, href) for href, index in self._images.items()]
@@ -418,7 +418,7 @@ class MobiWriter(object):
                self._oeb.logger.warn('Bad image file %r' % item.href)
                continue
            self._records.append(data)

    def _generate_record0(self):
        metadata = self._oeb.metadata
        exth = self._build_exth()
@@ -510,7 +510,7 @@ class MobiWriter(object):
        self._images[href] = index
        self._records.append(data)
        return index

    def _write_header(self):
        title = str(self._oeb.metadata.title[0])
        title = re.sub('[^-A-Za-z0-9]+', '_', title)[:32]
@@ -537,14 +537,14 @@ def config(defaults=None):
        c = Config('mobi', desc)
    else:
        c = StringConfig(defaults, desc)

    mobi = c.add_group('mobipocket', _('Mobipocket-specific options.'))
    mobi('compress', ['--compress'], default=False,
         help=_('Compress file text using PalmDOC compression. '
                'Results in smaller files, but takes a long time to run.'))
    mobi('rescale_images', ['--rescale-images'], default=False,
         help=_('Modify images to meet Palm device size limitations.'))
    mobi('toc_title', ['--toc-title'], default=None,
         help=_('Title for any generated in-line table of contents.'))
    mobi('ignore_tables', ['--ignore-tables'], default=False,
         help=_('Render HTML tables as blocks of text instead of actual '
@@ -565,13 +565,13 @@ def config(defaults=None):
    c.add_opt('encoding', ['--encoding'], default=None,
              help=_('Character encoding for HTML files. Default is to auto detect.'))
    return c


def option_parser():
    c = config()
    parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
    parser.add_option(
        '-o', '--output', default=None,
        help=_('Output file. Default is derived from input filename.'))
    parser.add_option(
        '-v', '--verbose', default=0, action='count',
@@ -617,7 +617,7 @@ def oeb2mobi(opts, inpath):
    writer.dump(oeb, outpath)
    run_plugins_on_postprocess(outpath, 'mobi')
    logger.info(_('Output written to ') + outpath)

def main(argv=sys.argv):
    parser = option_parser()
    opts, args = parser.parse_args(argv[1:])
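For context, a hedged usage sketch of the reader entry point that this fix affects, based only on the calls visible in main() of the reader hunks above; the module path is an assumption and the code is Python 2, the same vintage as the commit:

# Assumed usage, mirroring main() in the reader hunks: extract a .prc/.mobi
# book to HTML plus OPF/NCX in an output directory, then inspect the TOC links.
import os
from calibre.ebooks.mobi.reader import MobiReader  # assumed module path

output_dir = os.path.abspath('out')
mr = MobiReader('myebook.prc', verbose=True)  # hypothetical input file
mr.extract_content(output_dir)
print 'OEB ebook created in', output_dir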