diff --git a/setup.py b/setup.py
index d2593888e1..7cd2198079 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,7 @@ entry_points = {
                 'pdfreflow = libprs500.ebooks.lrf.pdf.reflow:main',
                 'isbndb = libprs500.ebooks.metadata.isbndb:main',
                 'librarything = libprs500.ebooks.metadata.library_thing:main',
+                'mobi2oeb = libprs500.ebooks.mobi.reader:main',
                 'lrf2html = libprs500.ebooks.lrf.html.convert_to:main',
               ],
        'gui_scripts' : [
diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py
index caff3409e4..69631a228d 100644
--- a/src/libprs500/ebooks/metadata/opf.py
+++ b/src/libprs500/ebooks/metadata/opf.py
@@ -177,6 +177,7 @@ class OPF(MetaInformation):
     MIMETYPE = 'application/oebps-package+xml'
     ENTITY_PATTERN = re.compile(r'&(\S+?);')
+    uid = standard_field('uid')
     libprs_id = standard_field('libprs_id')
     title = standard_field('title')
     authors = standard_field('authors')
@@ -239,10 +240,11 @@ class OPF(MetaInformation):
         dcms = metadata.getElementsByTagName(type)
         if dcms:
-            dcm = dcms[0]
+            dcm = dcms[0]
         else:
             dcm = doc.createElement(type)
             metadata.appendChild(dcm)
+            metadata.appendChild(doc.createTextNode('\n'))
         tags = dcm.getElementsByTagName(name)
         if tags and not replace:
             for tag in tags:
@@ -260,6 +262,7 @@ class OPF(MetaInformation):
         for attr, vattr in vattrs:
             el.setAttribute(attr, vattr)
         dcm.appendChild(el)
+        dcm.appendChild(doc.createTextNode('\n'))
         self._commit(doc)
@@ -350,6 +353,15 @@ class OPF(MetaInformation):
             comments = ''
         self._set_metadata_element('dc:Description', comments)
 
+    def get_uid(self):
+        package = self.soup.find('package')
+        if package.has_key('unique-identifier'):
+            return package['unique-identifier']
+
+    def set_uid(self, uid):
+        package = self.soup.find('package')
+        package['unique-identifier'] = str(uid)
+
     def get_category(self):
         category = self.soup.find('dc:type')
         if category:
@@ -500,7 +512,12 @@ class OPF(MetaInformation):
     def write(self, stream):
-        stream.write(self.soup.prettify('utf-8'))
+        src = unicode(self.soup)
+        src = re.sub(r'>\s*', ' />\n', src)
+        src = re.sub(r'<', '\n<', src)
+        src = re.sub(r'<', '\n<', src)
+        src = re.sub(r'^
diff --git a/src/libprs500/ebooks/mobi/huffcdic.py b/src/libprs500/ebooks/mobi/huffcdic.py
new file mode 100644
--- /dev/null
+++ b/src/libprs500/ebooks/mobi/huffcdic.py
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+import struct
+
+from libprs500.ebooks.mobi import MobiError
+
+
+class BitReader(object):
+
+    def __init__(self, data):
+        self.data, self.pos, self.nbits = data, 0, len(data) * 8
+
+    def peek(self, n):
+        r, g = 0, 0
+        while g < n:
+            r, g = (r << 8) | ord(self.data[(self.pos+g)>>3]), g + 8 - ((self.pos+g) & 7)
+        return (r >> (g - n)) & ((1 << n) - 1)
+
+    def eat(self, n):
+        self.pos += n
+        return self.pos <= self.nbits
+
+    def left(self):
+        return self.nbits - self.pos
+
+class HuffReader(object):
+
+    def __init__(self, huffs, extra_flags, codec='cp1252'):
+        self.huffs, self.extra_flags, self.codec = huffs, extra_flags, codec
+
+        if huffs[0][0:4] != 'HUFF' or huffs[0][4:8] != '\x00\x00\x00\x18':
+            raise MobiError('Invalid HUFF header')
+
+        if huffs[1][0:4] != 'CDIC' or huffs[1][4:8] != '\x00\x00\x00\x10':
+            raise ValueError('Invalid CDIC header')
+
+        self.entry_bits, = struct.unpack('>L', huffs[1][12:16])
+        off1,off2 = struct.unpack('>LL', huffs[0][16:24])
+        self.dict1 = struct.unpack('<256L', huffs[0][off1:off1+256*4])
+        self.dict2 = struct.unpack('<64L', huffs[0][off2:off2+64*4])
+        self.dicts = huffs[1:]
+        self.r = ''
+
+    def _unpack(self, bits, depth = 0):
+        if depth > 32:
+            raise MobiError('Corrupt file')
+
+        while bits.left():
+            dw = bits.peek(32)
+            v = self.dict1[dw >> 24]
+            codelen = v & 0x1F
+            assert codelen != 0
+            code = dw >> (32 - codelen)
+            r = (v >> 8)
+            if not (v & 0x80):
+                while code < self.dict2[(codelen-1)*2]:
+                    codelen += 1
+                    code = dw >> (32 - codelen)
+                r = self.dict2[(codelen-1)*2+1]
+            r -= code
+            assert codelen != 0
+            if not bits.eat(codelen):
+                return
+            dicno = r >> self.entry_bits
+            off1 = 16 + (r - (dicno << self.entry_bits)) * 2
+            dic = self.dicts[dicno]
+            off2 = 16 + ord(dic[off1]) * 256 + ord(dic[off1+1])
+            blen = ord(dic[off2]) * 256 + ord(dic[off2+1])
+            slice = dic[off2+2:off2+2+(blen&0x7fff)]
+            if blen & 0x8000:
+                self.r += slice
+            else:
+                self._unpack(BitReader(slice), depth + 1)
+
+    def unpack(self, data):
+        self.r = ''
+        self._unpack(BitReader(data))
+        return self.r
+
+    def sizeof_trailing_entries(self, data):
+
+        def sizeof_trailing_entry(ptr, psize):
+            bitpos, result = 0, 0
+            while True:
+                v = ord(ptr[psize-1])
+                result |= (v & 0x7F) << bitpos
+                bitpos += 7
+                psize -= 1
+                if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
+                    return result
+
+        num = 0
+        size = len(data)
+        flags = self.extra_flags >> 1
+        while flags:
+            if flags & 1:
+                num += sizeof_trailing_entry(data, size - num)
+            flags >>= 1
+        return num
+
+    def decompress(self, sections):
+        r = ''
+        for data in sections:
+            trail_size = self.sizeof_trailing_entries(data)
+            r += self.unpack(data[:len(data)-trail_size])
+        if r.endswith('#'):
+            r = r[:-1]
+        return r.decode(self.codec)
diff --git a/src/libprs500/ebooks/mobi/palmdoc.py b/src/libprs500/ebooks/mobi/palmdoc.py
new file mode 100644
index 0000000000..2dba998b08
--- /dev/null
+++ b/src/libprs500/ebooks/mobi/palmdoc.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+COUNT_BITS = 3
+
+def decompress_doc(data, codec='cp1252'):
+    buffer = [ord(i) for i in data]
+    res = []
+    i = 0
+    while i < len(buffer):
+        c = buffer[i]
+        i += 1
+        if c >= 1 and c <= 8:
+            res.extend(buffer[i:i+c])
+            i += c
+        elif c <= 0x7f:
+            res.append(c)
+        elif c >= 0xc0:
+            res.extend( (ord(' '), c^0x80) )
+        else:
+            c = (c << 8) + buffer[i]
+            i += 1
+            di = (c & 0x3fff) >> COUNT_BITS
+            j = len(res)
+            num = (c & ((1 << COUNT_BITS) - 1)) + 3
+
+            for k in range( num ):
+                res.append(res[j - di+k])
+
+    return unicode(''.join([chr(i) for i in res]), codec)
+    
\ No newline at end of file
diff --git a/src/libprs500/ebooks/mobi/reader.py b/src/libprs500/ebooks/mobi/reader.py
new file mode 100644
index 0000000000..b889f39528
--- /dev/null
+++ b/src/libprs500/ebooks/mobi/reader.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Read data from .mobi files
+'''
+
+import sys, struct, os, cStringIO, re
+
+try:
+    from PIL import Image as PILImage
+except ImportError:
+    import Image as PILImage
+
+from libprs500.ebooks.mobi import MobiError
+from libprs500.ebooks.mobi.huffcdic import HuffReader
+from libprs500.ebooks.mobi.palmdoc import decompress_doc
+from libprs500.ebooks.metadata import MetaInformation
+from libprs500.ebooks.metadata.opf import OPFCreator
+
+
+class EXTHHeader(object):
+
+    def __init__(self, raw, codec):
+        self.doctype = raw[:4]
+        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
+        raw = raw[12:]
+        pos = 0
+
+        self.mi = MetaInformation('Unknown', ['Unknown'])
+        self.has_fake_cover = True
+
+        for i in range(self.num_items):
+            id, size = struct.unpack('>LL', raw[pos:pos+8])
+            content = raw[pos+8:pos+size]
+            pos += size
+            if id >= 100 and id < 200:
+                self.process_metadata(id, content, codec)
+            elif id == 203:
+                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
+            elif id == 201:
+                self.cover_offset, = struct.unpack('>L', content)
+            elif id == 202:
+                self.thumbnail_offset, = struct.unpack('>L', content)
+        pos += 3
+        stop = raw.find('\x00')
+        if stop > -1:
+            self.mi.title = raw[pos:stop].decode(codec, 'ignore')
+
+
+    def process_metadata(self, id, content, codec):
+        if id == 100:
+            aus = content.split(',')
+            authors = []
+            for a in aus:
+                authors.extend(a.split('&'))
+            self.mi.authors = [i.decode(codec, 'ignore') for i in authors]
+        elif id == 101:
+            self.mi.publisher = content.decode(codec, 'ignore')
+        elif id == 103:
+            self.mi.comments = content.decode(codec, 'ignore')
+        elif id == 104:
+            self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
+        elif id == 105:
+            self.mi.category = content.decode(codec, 'ignore')
+
+
+
+class BookHeader(object):
+
+    def __init__(self, raw, ident):
+        self.compression_type = raw[:2]
+        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
+        self.encryption_type, = struct.unpack('>H', raw[12:14])
+        self.doctype = raw[16:20]
+        self.length, self.type, self.codepage, self.unique_id, self.version = \
+            struct.unpack('>LLLLL', raw[20:40])
+
+        if ident == 'TEXTREAD':
+            self.codepage = 1252
+
+        try:
+            self.codec = {
+                1252  : 'cp1252',
+                65001 : 'utf-8',
+                }[self.codepage]
+        except IndexError, KeyError:
+            raise MobiError('Unknown codepage: %d'%self.codepage)
+
+        if ident == 'TEXTREAD':
+            self.extra_flags = 0
+        else:
+            self.extra_flags, = struct.unpack('>L', raw[0xF0:0xF4])
+
+        if self.compression_type == 'DH':
+            self.huff_offset, self.huff_number = struct.unpack('>LL', raw[0x70:0x78])
+
+        self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
+        self.exth = None
+        if self.exth_flag & 0x40:
+            self.exth = EXTHHeader(raw[16+self.length:], self.codec)
+            self.exth.mi.uid = self.unique_id
+
+
+class MobiReader(object):
+
+    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
+
+    def __init__(self, filename_or_stream):
+        if hasattr(filename_or_stream, 'read'):
+            stream = filename_or_stream
+            stream.seek(0)
+        else:
+            stream = open(filename_or_stream, 'rb')
+
+        raw = stream.read()
+
+        self.header = raw[0:72]
+        self.name = self.header[:32].replace('\x00', '')
+        self.num_sections, = struct.unpack('>H', raw[76:78])
+
+        self.ident = self.header[0x3C:0x3C+8].upper()
+        if self.ident not in ['BOOKMOBI', 'TEXTREAD']:
+            raise MobiError('Unknown book type: %s'%self.ident)
+
+        self.sections = []
+        self.section_headers = []
+        for i in range(self.num_sections):
+            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8])
+            flags, val = a1, a2<<16 | a3<<8 | a4
+            self.section_headers.append((offset, flags, val))
+
+        def section(section_number):
+            if section_number == self.num_sections - 1:
+                end_off = len(raw)
+            else:
+                end_off = self.section_headers[section_number + 1][0]
+            off = self.section_headers[section_number][0]
+
+            return raw[off:end_off]
+
+        for i in range(self.num_sections):
+            self.sections.append((section(i), self.section_headers[i]))
+
+
+        self.book_header = BookHeader(self.sections[0][0], self.ident)
+
+
+    def extract_content(self, output_dir=os.getcwdu()):
+        if self.book_header.encryption_type != 0:
+            raise MobiError('Cannot extract content from DRM protected ebook')
+        text_sections = [self.sections[i][0] for i in range(1, self.book_header.records+1)]
+        processed_records = list(range(0, self.book_header.records+1))
+
+        self.mobi_html = u''
+        codec = self.book_header.codec
+
+        if self.book_header.compression_type == 'DH':
+            huffs = [self.sections[i][0] for i in
+                     range(self.book_header.huff_offset,
+                           self.book_header.huff_offset+self.book_header.huff_number)]
+            processed_records += list(range(self.book_header.huff_offset,
+                                            self.book_header.huff_offset+self.book_header.huff_number))
+            huff = HuffReader(huffs, self.book_header.extra_flags, codec)
+            self.mobi_html = huff.decompress(text_sections)
+
+        elif self.book_header.compression_type == '\x00\x02':
+            for section in text_sections:
+                self.mobi_html += decompress_doc(section, codec)
+
+        elif self.book_header.compression_type == '\x00\x01':
+            t = [i.decode(codec) for i in text_sections]
+            self.mobi_html = ''.join(t)
+
+        else:
+            raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
+
+        self.add_anchors()
+        self.extract_images(processed_records, output_dir)
+        self.replace_page_breaks()
+
+        self.processed_html = re.compile('', re.IGNORECASE).sub(
+            '\n\n',
+            self.processed_html)
+
+        htmlfile = os.path.join(output_dir, self.name+'.html')
+        open(htmlfile, 'wb').write(self.processed_html.encode('utf8'))
+
+        if self.book_header.exth is not None:
+            mi = self.book_header.exth.mi
+            opf = OPFCreator(mi)
+            if hasattr(self.book_header.exth, 'cover_offset'):
+                opf.cover = 'images/%d.jpg'%(self.book_header.exth.cover_offset+1)
+            manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
+            for i in self.image_names:
+                manifest.append(('images/'+i, 'image/jpg'))
+
+            opf.create_manifest(manifest)
+            opf.create_spine([os.path.basename(htmlfile)])
+            opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
+
+
+    def replace_page_breaks(self):
+        self.processed_html = self.PAGE_BREAK_PAT.sub('',
+            self.processed_html)
+
+    def add_anchors(self):
+        positions = []
+        link_pattern = re.compile(r'', end)
+            if r > -1 and r < l: # Move out of tag
+                end = r+1
+            self.processed_html += self.mobi_html[pos:end] + ''%oend
+            pos = end
+
+        self.processed_html += self.mobi_html[pos:]
+        self.processed_html = link_pattern.sub(lambda match: '
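
For reference, a minimal usage sketch of the reader added by this patch (not part of the diff itself). It assumes only the MobiReader constructor and extract_content() shown above, a DRM-free .mobi file, and placeholder file/directory names.

# Hedged usage sketch; 'book.mobi' and 'extracted' are example names, not from the patch.
import os
from libprs500.ebooks.mobi.reader import MobiReader

out = os.path.abspath('extracted')
if not os.path.exists(out):
    os.makedirs(out)                  # extract_content() writes into an existing directory

mr = MobiReader('book.mobi')          # also accepts an open stream with a read() method
mr.extract_content(out)               # expected to write <name>.html, images and an .opf into out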