diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 7fab3ac2d8..a7ae80404e 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -27,7 +27,7 @@ from calibre.ebooks.metadata.toc import TOC from calibre import sanitize_file_name class EXTHHeader(object): - + def __init__(self, raw, codec, title): self.doctype = raw[:4] self.length, self.num_items = struct.unpack('>LL', raw[4:12]) @@ -35,7 +35,7 @@ class EXTHHeader(object): pos = 0 self.mi = MetaInformation(_('Unknown'), [_('Unknown')]) self.has_fake_cover = True - + for i in range(self.num_items): id, size = struct.unpack('>LL', raw[pos:pos+8]) content = raw[pos+8:pos+size] @@ -43,18 +43,18 @@ class EXTHHeader(object): if id >= 100 and id < 200: self.process_metadata(id, content, codec) elif id == 203: - self.has_fake_cover = bool(struct.unpack('>L', content)[0]) + self.has_fake_cover = bool(struct.unpack('>L', content)[0]) elif id == 201: co, = struct.unpack('>L', content) if co < 1e7: - self.cover_offset = co + self.cover_offset = co elif id == 202: self.thumbnail_offset, = struct.unpack('>L', content) #else: # print 'unknown record', id, repr(content) if title: self.mi.title = title - + def process_metadata(self, id, content, codec): if id == 100: if self.mi.authors == [_('Unknown')]: @@ -71,11 +71,11 @@ class EXTHHeader(object): self.mi.tags = [] self.mi.tags.append(content.decode(codec, 'ignore')) #else: - # print 'unhandled metadata record', id, repr(content), codec - + # print 'unhandled metadata record', id, repr(content), codec + class BookHeader(object): - + def __init__(self, raw, ident): self.compression_type = raw[:2] self.records, self.records_size = struct.unpack('>HH', raw[8:12]) @@ -96,8 +96,8 @@ class BookHeader(object): self.doctype = raw[16:20] self.length, self.type, self.codepage, self.unique_id, self.version = \ struct.unpack('>LLLLL', raw[20:40]) - - + + try: self.codec = { 1252 : 'cp1252', @@ -106,15 +106,15 @@ class BookHeader(object): except (IndexError, KeyError): print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage self.codec = 'cp1252' - + if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: self.extra_flags = 0 else: self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4]) - + if self.compression_type == 'DH': - self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78]) - + self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78]) + toff, tlen = struct.unpack('>II', raw[0x54:0x5c]) tend = toff + tlen self.title = raw[toff:tend] if tend < len(raw) else _('Unknown') @@ -125,7 +125,7 @@ class BookHeader(object): self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0] self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0] - + self.exth_flag, = struct.unpack('>L', raw[0x80:0x84]) self.exth = None if not isinstance(self.title, unicode): @@ -134,39 +134,39 @@ class BookHeader(object): self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title) self.exth.mi.uid = self.unique_id self.exth.mi.language = self.language - + class MobiReader(object): PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') - + def __init__(self, filename_or_stream, verbose=False): self.verbose = verbose self.embedded_mi = None - + if hasattr(filename_or_stream, 'read'): stream = filename_or_stream stream.seek(0) else: stream = open(filename_or_stream, 'rb') - + raw = stream.read() - + self.header = raw[0:72] self.name = self.header[:32].replace('\x00', '') self.num_sections, = struct.unpack('>H', raw[76:78]) - + self.ident = self.header[0x3C:0x3C+8].upper() if self.ident not in ['BOOKMOBI', 'TEXTREAD']: - raise MobiError('Unknown book type: %s'%self.ident) - + raise MobiError('Unknown book type: %s'%self.ident) + self.sections = [] self.section_headers = [] for i in range(self.num_sections): offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8]) flags, val = a1, a2<<16 | a3<<8 | a4 self.section_headers.append((offset, flags, val)) - + def section(section_number): if section_number == self.num_sections - 1: end_off = len(raw) @@ -174,32 +174,32 @@ class MobiReader(object): end_off = self.section_headers[section_number + 1][0] off = self.section_headers[section_number][0] return raw[off:end_off] - + for i in range(self.num_sections): - self.sections.append((section(i), self.section_headers[i])) - - + self.sections.append((section(i), self.section_headers[i])) + + self.book_header = BookHeader(self.sections[0][0], self.ident) self.name = self.name.decode(self.book_header.codec, 'replace') - + def extract_content(self, output_dir=os.getcwdu()): output_dir = os.path.abspath(output_dir) if self.book_header.encryption_type != 0: raise DRMError(self.name) - + processed_records = self.extract_text() self.add_anchors() self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore') for pat in ENCODING_PATS: self.processed_html = pat.sub('', self.processed_html) - e2u = functools.partial(entity_to_unicode, + e2u = functools.partial(entity_to_unicode, exceptions=['lt', 'gt', 'amp', 'apos', 'quot']) self.processed_html = re.sub(r'&(\S+?);', e2u, self.processed_html) self.extract_images(processed_records, output_dir) self.replace_page_breaks() self.cleanup_html() - + if self.processed_html.startswith('' self.processed_html = \ @@ -212,7 +212,7 @@ class MobiReader(object): '.italic { font-style: italic; }\n' '\n', self.processed_html) - + if self.verbose: print 'Parsing HTML...' root = html.fromstring(self.processed_html) @@ -224,7 +224,7 @@ class MobiReader(object): self.read_embedded_metadata(root, metadata_elems[0], guide) for elem in guides + metadata_elems: elem.getparent().remove(elem) - htmlfile = os.path.join(output_dir, + htmlfile = os.path.join(output_dir, sanitize_file_name(self.name)+'.html') try: for ref in guide.xpath('descendant::reference'): @@ -235,13 +235,13 @@ class MobiReader(object): if self.verbose: print 'Serializing...' with open(htmlfile, 'wb') as f: - raw = html.tostring(root, encoding='utf-8', method='xml', + raw = html.tostring(root, encoding='utf-8', method='xml', include_meta_content_type=True, pretty_print=True) - raw = raw.replace('', + raw = raw.replace('', '\n\n') f.write(raw) self.htmlfile = htmlfile - + if self.book_header.exth is not None or self.embedded_mi is not None: if self.verbose: print 'Creating OPF...' @@ -251,7 +251,7 @@ class MobiReader(object): ncx = ncx.getvalue() if ncx: open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) - + def read_embedded_metadata(self, root, elem, guide): raw = ''+html.tostring(elem, encoding='utf-8')+'' stream = cStringIO.StringIO(raw) @@ -276,8 +276,8 @@ class MobiReader(object): elem.getparent().remove(elem) break break - - + + def cleanup_html(self): if self.verbose: print 'Cleaning up HTML...' @@ -286,7 +286,7 @@ class MobiReader(object): self.processed_html = '

'+self.processed_html.replace('\n\n', '

')+'' self.processed_html = self.processed_html.replace('\r\n', '\n') self.processed_html = self.processed_html.replace('> <', '>\n<') - + def upshift_markup(self, root): if self.verbose: print 'Converting style information to CSS...' @@ -358,13 +358,15 @@ class MobiReader(object): attrib['style'] = '; '.join(styles) if 'filepos-id' in attrib: attrib['id'] = attrib.pop('filepos-id') + if 'name' in attrib and attrib['name'] != attrib['id']: + attrib['name'] = attrib['id'] if 'filepos' in attrib: filepos = attrib.pop('filepos') try: attrib['href'] = "#filepos%d" % int(filepos) except ValueError: pass - + def create_opf(self, htmlfile, guide=None, root=None): mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) opf = OPFCreator(os.path.dirname(htmlfile), mi) @@ -374,14 +376,14 @@ class MobiReader(object): opf.cover = mi.cover else: opf.cover = 'images/%05d.jpg'%1 - if not os.path.exists(os.path.join(os.path.dirname(htmlfile), + if not os.path.exists(os.path.join(os.path.dirname(htmlfile), *opf.cover.split('/'))): opf.cover = None manifest = [(htmlfile, 'text/x-oeb1-document')] bp = os.path.dirname(htmlfile) for i in getattr(self, 'image_names', []): manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg')) - + opf.create_manifest(manifest) opf.create_spine([os.path.basename(htmlfile)]) toc = None @@ -410,16 +412,16 @@ class MobiReader(object): except: text = '' text = ent_pat.sub(entity_to_unicode, text) - tocobj.add_item(toc.partition('#')[0], href[1:], + tocobj.add_item(toc.partition('#')[0], href[1:], text) if reached and x.get('class', None) == 'mbp_pagebreak': break if tocobj is not None: opf.set_toc(tocobj) - + return opf - - + + def sizeof_trailing_entries(self, data): def sizeof_trailing_entry(ptr, psize): bitpos, result = 0, 0 @@ -430,7 +432,7 @@ class MobiReader(object): psize -= 1 if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0): return result - + num = 0 size = len(data) flags = self.book_header.extra_flags >> 1 @@ -446,28 +448,28 @@ class MobiReader(object): data = self.sections[index][0] trail_size = self.sizeof_trailing_entries(data) return data[:len(data)-trail_size] - + def extract_text(self): if self.verbose: print 'Extracting text...' text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] processed_records = list(range(0, self.book_header.records+1)) - + self.mobi_html = '' - + if self.book_header.compression_type == 'DH': - huffs = [self.sections[i][0] for i in - range(self.book_header.huff_offset, + huffs = [self.sections[i][0] for i in + range(self.book_header.huff_offset, self.book_header.huff_offset+self.book_header.huff_number)] - processed_records += list(range(self.book_header.huff_offset, + processed_records += list(range(self.book_header.huff_offset, self.book_header.huff_offset+self.book_header.huff_number)) huff = HuffReader(huffs) self.mobi_html = huff.decompress(text_sections) - + elif self.book_header.compression_type == '\x00\x02': for section in text_sections: self.mobi_html += decompress_doc(section) - + elif self.book_header.compression_type == '\x00\x01': self.mobi_html = ''.join(text_sections) else: @@ -475,13 +477,13 @@ class MobiReader(object): if self.book_header.ancient and '', self.processed_html) - + def add_anchors(self): if self.verbose: print 'Adding anchors...' @@ -511,8 +513,8 @@ class MobiReader(object): self.processed_html += self.mobi_html[pos:end] + (anchor % oend) pos = end self.processed_html += self.mobi_html[pos:] - - + + def extract_images(self, processed_records, output_dir): if self.verbose: print 'Extracting images...' @@ -523,7 +525,7 @@ class MobiReader(object): self.image_names = [] start = getattr(self.book_header, 'first_image_index', -1) if start > self.num_sections or start < 0: - # BAEN PRC files have bad headers + # BAEN PRC files have bad headers start=0 for i in range(start, self.num_sections): if i in processed_records: @@ -533,10 +535,10 @@ class MobiReader(object): buf = cStringIO.StringIO(data) image_index += 1 try: - im = PILImage.open(buf) + im = PILImage.open(buf) except IOError: continue - + path = os.path.join(output_dir, '%05d.jpg'%image_index) self.image_names.append(os.path.basename(path)) im.convert('RGB').save(open(path, 'wb'), format='JPEG') @@ -562,17 +564,17 @@ def get_metadata(stream): import traceback traceback.print_exc() return mi - - + + def option_parser(): from calibre.utils.config import OptionParser parser = OptionParser(usage=_('%prog [options] myebook.mobi')) - parser.add_option('-o', '--output-dir', default='.', + parser.add_option('-o', '--output-dir', default='.', help=_('Output directory. Defaults to current directory.')) parser.add_option('-v', '--verbose', default=False, action='store_true', help='Useful for debugging.') return parser - + def main(args=sys.argv): parser = option_parser() @@ -580,7 +582,7 @@ def main(args=sys.argv): if len(args) != 2: parser.print_help() return 1 - + mr = MobiReader(args[1], verbose=opts.verbose) opts.output_dir = os.path.abspath(opts.output_dir) mr.extract_content(opts.output_dir) @@ -591,9 +593,9 @@ def main(args=sys.argv): dat = dat.encode('utf-8') open(oname, 'wb').write(dat) print _('Raw MOBI HTML saved in'), oname - + print _('OEB ebook created in'), opts.output_dir - + return 0 if __name__ == '__main__': diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index a38921d798..467e2c6dc7 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -132,7 +132,7 @@ def rescale_image(data, maxsizeb, dimen=None): class Serializer(object): NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'} - + def __init__(self, oeb, images): self.oeb = oeb self.images = images @@ -192,7 +192,7 @@ class Serializer(object): self.href_offsets[href].append(buffer.tell()) buffer.write('0000000000') return True - + def serialize_body(self): buffer = self.buffer self.anchor_offset = buffer.tell() @@ -290,10 +290,10 @@ class Serializer(object): buffer.seek(hoff) buffer.write('%010d' % ioff) - + class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') - + def __init__(self, compression=None, imagemax=None, prefer_author_sort=False): self._compression = compression or UNCOMPRESSED @@ -305,11 +305,11 @@ class MobiWriter(object): return self._dump_stream(oeb, path) with open(path, 'w+b') as stream: return self._dump_stream(oeb, stream) - + def _write(self, *data): for datum in data: self._stream.write(datum) - + def _tell(self): return self._stream.tell() @@ -364,7 +364,7 @@ class MobiWriter(object): overlap = text.read(extra) text.seek(npos) return data, overlap - + def _generate_text(self): self._oeb.logger.info('Serializing markup content...') serializer = Serializer(self._oeb, self._images) @@ -405,7 +405,7 @@ class MobiWriter(object): offset += RECORD_SIZE data, overlap = self._read_text_record(text) self._text_nrecords = nrecords - + def _generate_images(self): self._oeb.logger.info('Serializing images...') images = [(index, href) for href, index in self._images.items()] @@ -418,7 +418,7 @@ class MobiWriter(object): self._oeb.logger.warn('Bad image file %r' % item.href) continue self._records.append(data) - + def _generate_record0(self): metadata = self._oeb.metadata exth = self._build_exth() @@ -510,7 +510,7 @@ class MobiWriter(object): self._images[href] = index self._records.append(data) return index - + def _write_header(self): title = str(self._oeb.metadata.title[0]) title = re.sub('[^-A-Za-z0-9]+', '_', title)[:32] @@ -537,14 +537,14 @@ def config(defaults=None): c = Config('mobi', desc) else: c = StringConfig(defaults, desc) - + mobi = c.add_group('mobipocket', _('Mobipocket-specific options.')) mobi('compress', ['--compress'], default=False, help=_('Compress file text using PalmDOC compression. ' 'Results in smaller files, but takes a long time to run.')) - mobi('rescale_images', ['--rescale-images'], default=False, + mobi('rescale_images', ['--rescale-images'], default=False, help=_('Modify images to meet Palm device size limitations.')) - mobi('toc_title', ['--toc-title'], default=None, + mobi('toc_title', ['--toc-title'], default=None, help=_('Title for any generated in-line table of contents.')) mobi('ignore_tables', ['--ignore-tables'], default=False, help=_('Render HTML tables as blocks of text instead of actual ' @@ -565,13 +565,13 @@ def config(defaults=None): c.add_opt('encoding', ['--encoding'], default=None, help=_('Character encoding for HTML files. Default is to auto detect.')) return c - + def option_parser(): c = config() parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') parser.add_option( - '-o', '--output', default=None, + '-o', '--output', default=None, help=_('Output file. Default is derived from input filename.')) parser.add_option( '-v', '--verbose', default=0, action='count', @@ -617,7 +617,7 @@ def oeb2mobi(opts, inpath): writer.dump(oeb, outpath) run_plugins_on_postprocess(outpath, 'mobi') logger.info(_('Output written to ') + outpath) - + def main(argv=sys.argv): parser = option_parser() opts, args = parser.parse_args(argv[1:])