From 476e9371bb7937ac92e4d2eec1d1cde5f7f86e01 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 25 Jun 2008 16:05:01 -0700 Subject: [PATCH] Add support for extracting TOC from MOBI files to mobi2oeb --- src/calibre/ebooks/metadata/opf.py | 6 ++-- src/calibre/ebooks/metadata/opf.xml | 4 +++ src/calibre/ebooks/mobi/reader.py | 50 ++++++++++++++++++++++++----- 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/metadata/opf.py b/src/calibre/ebooks/metadata/opf.py index bff9ed5f8a..51b392c70f 100644 --- a/src/calibre/ebooks/metadata/opf.py +++ b/src/calibre/ebooks/metadata/opf.py @@ -353,7 +353,7 @@ class OPF(MetaInformation): return reference.get('href') return None - def set_cover(self, path): + def set_cover(self, path, type='cover'): self._initialize() doc = dom.parseString(self.soup.__str__('UTF-8')) package = doc.documentElement @@ -363,11 +363,11 @@ class OPF(MetaInformation): else: guide = doc.createElement('guide') package.appendChild(guide) - el = self._find_element(guide, 'reference', [('type', 'cover')]) + el = self._find_element(guide, 'reference', [('type', type)]) if not el: el = doc.createElement('reference') guide.appendChild(el) - el.setAttribute('type', 'cover') + el.setAttribute('type', type) el.setAttribute('href', path) self._commit(doc) diff --git a/src/calibre/ebooks/metadata/opf.xml b/src/calibre/ebooks/metadata/opf.xml index ed54c10a32..b74c67e085 100644 --- a/src/calibre/ebooks/metadata/opf.xml +++ b/src/calibre/ebooks/metadata/opf.xml @@ -25,6 +25,10 @@ + + + + diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 8f314026f8..22aa95b1e4 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -20,7 +20,7 @@ from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf import OPFCreator - +from calibre.ebooks.metadata.toc import TOC class EXTHHeader(object): @@ -172,22 +172,37 @@ class MobiReader(object): self.processed_html) soup = BeautifulSoup(self.processed_html.replace('> <', '>\n<')) + guide = soup.find('guide') for elem in soup.findAll(['metadata', 'guide']): elem.extract() - htmlfile = os.path.join(output_dir, self.name+'.html') + htmlfile = os.path.join(output_dir, self.name+'.html') + for ref in guide.findAll('reference', href=True): + ref['href'] = os.path.basename(htmlfile)+ref['href'] open(htmlfile, 'wb').write(unicode(soup).encode('utf8')) self.htmlfile = htmlfile if self.book_header.exth is not None: - opf = self.create_opf(htmlfile) - opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb')) + ncx = cStringIO.StringIO() + opf = self.create_opf(htmlfile, guide) + opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx) + ncx = ncx.getvalue() + if ncx: + open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) def cleanup(self): self.processed_html = re.sub(r'
', '', self.processed_html) - def create_opf(self, htmlfile): + def create_opf(self, htmlfile, guide=None): mi = self.book_header.exth.mi opf = OPFCreator(os.path.dirname(htmlfile), mi) + guide_elements, toc = [], None + if guide: + for elem in guide.findAll('reference'): + if elem['type'] == 'toc': + toc = elem['href'] + continue + guide_elements.append((elem['title'], elem['type'], elem['href'])) + opf.extra_mobi_guide_elements = guide_elements if hasattr(self.book_header.exth, 'cover_offset'): opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) manifest = [(htmlfile, 'text/x-oeb1-document')] @@ -197,6 +212,23 @@ class MobiReader(object): opf.create_manifest(manifest) opf.create_spine([os.path.basename(htmlfile)]) + + if toc: + index = self.processed_html.find(' -1: + raw = ''+self.processed_html[index:] + soup = BeautifulSoup(raw) + tocobj = TOC() + for a in soup.findAll('a', href=True): + try: + text = ''.join(a.findAll(text=True)) + except: + text = '' + tocobj.add_item(toc.partition('#')[0], a['href'][1:], text) + if tocobj is not None: + opf.set_toc(tocobj) + return opf @@ -221,7 +253,6 @@ class MobiReader(object): elif self.book_header.compression_type == '\x00\x01': self.mobi_html = ''.join(text_sections) - else: raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type)) @@ -234,7 +265,7 @@ class MobiReader(object): def add_anchors(self): positions = set([]) - link_pattern = re.compile(r']+filepos=[\'"]{0,1}(\d+)[^<>]*>', re.IGNORECASE) for match in link_pattern.finditer(self.mobi_html): positions.add(int(match.group(1))) positions = list(positions) @@ -251,7 +282,10 @@ class MobiReader(object): pos = end self.processed_html += self.mobi_html[pos:] - self.processed_html = link_pattern.sub(lambda match: '