Refactor oeb iterator and implement logic to collect indexing data

2025-07-09 03:04:10 -04:00 · 2012-05-07 22:39:59 +05:30 · 2012-05-07 22:39:59 +05:30 · d993fbc91f
commit d993fbc91f
parent 820ba7ec7d
7 changed files with 459 additions and 387 deletions
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@ -1,383 +0,0 @@
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008 Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 Iterate over the HTML files in an ebook. Useful for writing viewers.
 '''
 import re, os, math
 from cStringIO import StringIO
 from PyQt4.Qt import QFontDatabase
 from calibre.customize.ui import available_input_formats
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.zipfile import safe_replace
 from calibre.utils.config import DynamicConfig
 from calibre.utils.logging import Log
 from calibre import (guess_type, prints, prepare_string_for_xml,
        xml_replace_entities)
 from calibre.ebooks.oeb.transforms.cover import CoverManager
 from calibre.constants import filesystem_encoding
 TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\
        '__ar__', 'none').replace('__viewbox__', '0 0 600 800'
        ).replace('__width__', '600').replace('__height__', '800')
 BM_FIELD_SEP = u'*|!|?|*'
 BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'
 def character_count(html):
    '''
    Count the "significant" text characters in a HTML string: the text found
    between a ``>`` and the following ``<``, with every run of whitespace
    counted as one character.
    '''
    whitespace_run = re.compile(r'\s+')
    text_chunk = re.compile(r'>[^<]+<')
    # Every chunk still carries its delimiting '>' and '<', hence the -2
    return sum(len(whitespace_run.sub(' ', chunk.group())) - 2
            for chunk in text_chunk.finditer(html))
 class UnsupportedFormatError(Exception):
    ''' Error whose message says that books of the given format are not
    supported. '''
    def __init__(self, fmt):
        # _ is not defined in this file; presumably the translation function
        # installed as a builtin -- TODO confirm
        message = _('%s format books are not supported') % fmt.upper()
        Exception.__init__(self, message)
 class SpineItem(unicode):
    '''
    The path of a spine file as a unicode string, annotated with data read
    from the file: ``encoding`` (as reported by xml_to_unicode),
    ``character_count`` and ``mime_type``. ``start_page``, ``pages`` and
    ``max_page`` are initialised to -1.
    '''
    def __new__(cls, path, mime_type=None):
        '''
        :param path: Path to the file. If it does not exist but the part
            before the first ``#`` does, that part is used instead.
        :param mime_type: MIME type of the file; when None it is taken from
            guess_type().
        '''
        ppath = path.partition('#')[0]
        if not os.path.exists(path) and os.path.exists(ppath):
            path = ppath
        obj = super(SpineItem, cls).__new__(cls, path)
        # Close the file as soon as it is read instead of leaving the handle
        # to the garbage collector
        with open(path, 'rb') as f:
            raw = f.read()
        raw, obj.encoding = xml_to_unicode(raw)
        obj.character_count = character_count(raw)
        obj.start_page = -1
        obj.pages      = -1
        obj.max_page   = -1
        if mime_type is None:
            mime_type = guess_type(obj)[0]
        obj.mime_type = mime_type
        return obj
 class FakeOpts(object):
    ''' Plain holder of four option values, stored as class attributes. '''
    input_encoding = None
    max_levels = 5
    breadth_first = False
    verbose = 0
 def is_supported(path):
    '''
    Return True if the lower-cased extension of path is one of
    available_input_formats(). Any htm, html, xhtm or xhtml in the extension
    is first replaced by html.
    '''
    extension = os.path.splitext(path)[1]
    extension = extension.replace('.', '').lower()
    normalized = re.sub(r'(x{0,1})htm(l{0,1})', 'html', extension)
    return normalized in available_input_formats()
 def write_oebbook(oeb, path):
    '''
    Write oeb to path using OEBWriter and return the first file name ending
    in .opf yielded by walk(path), or None if there is none.
    '''
    from calibre import walk
    from calibre.ebooks.oeb.writer import OEBWriter
    writer = OEBWriter()
    writer(oeb, path)
    return next((name for name in walk(path) if name.endswith('.opf')), None)
 class EbookIterator(object):
    CHARACTERS_PER_PAGE = 1000
    def __init__(self, pathtoebook, log=None):
        '''
        :param pathtoebook: Path to the book; surrounding whitespace is
            stripped and the absolute path is stored as self.pathtoebook.
        :param log: Log object to use; a new Log() is created when None.
        '''
        self.log = Log() if log is None else log
        stripped = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(stripped)
        self.config = DynamicConfig(name='iterator')
        suffix = os.path.splitext(stripped)[1]
        # Normalise htm/xhtm/xhtml to html
        fmt = re.sub(r'(x{0,1})htm(l{0,1})', 'html',
                suffix.replace('.', '').lower())
        self.ebook_ext = fmt.replace('original_', '')
    def search(self, text, index, backwards=False):
        '''
        Case-insensitively look for text in the spine files after index (or
        before it, nearest first, when backwards is True).

        :return: The spine index of the first file containing text, or None.
        '''
        needle = prepare_string_for_xml(text.lower())
        candidates = list(enumerate(self.spine))
        if backwards:
            candidates.reverse()
        for pos, item in candidates:
            # Only files strictly beyond index in the search direction
            if backwards:
                if pos >= index:
                    continue
            elif pos <= index:
                continue
            with open(item, 'rb') as stream:
                haystack = stream.read().decode(item.encoding)
            try:
                haystack = xml_replace_entities(haystack)
            except:
                pass
            if needle in haystack.lower():
                return pos
    def find_missing_css_files(self):
        for x in os.walk(os.path.dirname(self.pathtoopf)):
            for f in x[-1]:
                if f.endswith('.css'):
                    yield os.path.join(x[0], f)
    def find_declared_css_files(self):
        for item in self.opf.manifest:
            if item.mime_type and 'css' in item.mime_type.lower():
                yield item.path
    def find_embedded_fonts(self):
        '''
        This will become unnecessary once Qt WebKit supports the @font-face rule.
        '''
        css_files = set(self.find_declared_css_files())
        if not css_files:
            css_files = set(self.find_missing_css_files())
        bad_map = {}
        font_family_pat = re.compile(r'font-family\s*:\s*([^;]+)')
        for csspath in css_files:
            try:
                css = open(csspath, 'rb').read().decode('utf-8', 'replace')
            except:
                continue
            for match in re.compile(r'@font-face\s*{([^}]+)}').finditer(css):
                block  = match.group(1)
                family = font_family_pat.search(block)
                url    = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL).search(block)
                if url:
                    path = url.group(1).split('/')
                    path = os.path.join(os.path.dirname(csspath), *path)
                    if not os.access(path, os.R_OK):
                        continue
                    id = QFontDatabase.addApplicationFont(path)
                    if id != -1:
                        families = [unicode(f) for f in QFontDatabase.applicationFontFamilies(id)]
                        if family:
                            family = family.group(1)
                            specified_families = [x.strip().replace('"',
                                '').replace("'", '') for x in family.split(',')]
                            aliasing_ok = False
                            for f in specified_families:
                                bad_map[f] = families[0]
                                if not aliasing_ok and f in families:
                                    aliasing_ok = True
                            if not aliasing_ok:
                                prints('WARNING: Family aliasing not fully supported.')
                                prints('\tDeclared family: %r not in actual families: %r'
                                        % (family, families))
                            else:
                                prints('Loaded embedded font:', repr(family))
        if bad_map:
            def prepend_embedded_font(match):
                for bad, good in bad_map.items():
                    if bad in match.group(1):
                        prints('Substituting font family: %s -> %s'%(bad, good))
                        return match.group().replace(bad, '"%s"'%good)
            from calibre.ebooks.chardet import force_encoding
            for csspath in css_files:
                with open(csspath, 'r+b') as f:
                    css = f.read()
                    enc = force_encoding(css, False)
                    css = css.decode(enc, 'replace')
                    ncss = font_family_pat.sub(prepend_embedded_font, css)
                    if ncss != css:
                        f.seek(0)
                        f.truncate()
                        f.write(ncss.encode(enc))
    def __enter__(self, processed=False, only_input_plugin=False):
        '''
        Run the input plugin for the book inside a new temporary directory
        and build self.opf, self.spine, self.pages, self.toc and the
        bookmarks from the result.

        :param processed: If True, also run create_oebbook() on the output
            of the input plugin. This is done regardless for pdb, pdf and rb
            input when the plugin output has no manifest attribute.
        :param only_input_plugin: If True, create_oebbook() is never run.
        :return: self
        '''
        self.delete_on_exit = []
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base  = self._tdir.__enter__()
        if not isinstance(self.base, unicode):
            self.base = self.base.decode(filesystem_encoding)
        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
        plumber = Plumber(self.pathtoebook, self.base, self.log)
        plumber.setup_options()
        if self.pathtoebook.lower().endswith('.opf'):
            plumber.opts.dont_package = True
        if hasattr(plumber.opts, 'no_process'):
            plumber.opts.no_process = True
        plumber.input_plugin.for_viewer = True
        with plumber.input_plugin:
            # Close the input stream once the plugin has run
            with open(plumber.input, 'rb') as inf:
                self.pathtoopf = plumber.input_plugin(inf,
                    plumber.opts, plumber.input_fmt, self.log,
                    {}, self.base)
        if not only_input_plugin:
            if processed or (
                    plumber.input_fmt.lower() in ('pdb', 'pdf', 'rb') and
                    not hasattr(self.pathtoopf, 'manifest')):
                # create_oebbook() is given a path, so a book object returned
                # by the plugin is written to disk first
                if hasattr(self.pathtoopf, 'manifest'):
                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
                self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
                        plumber.opts)
        if hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
        if getattr(plumber.input_plugin, 'is_kf8', False):
            self.book_format = 'KF8'
        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
        if self.opf is None:
            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.language = self.opf.language
        if self.language:
            self.language = self.language.lower()
        # Linear spine items first, non-linear ones after them
        ordered = [i for i in self.opf.spine if i.is_linear] + \
                  [i for i in self.opf.spine if not i.is_linear]
        self.spine = []
        for i in ordered:
            spath = i.path
            mt = None
            if i.idref is not None:
                mt = self.opf.manifest.type_for_id(i.idref)
            if mt is None:
                mt = guess_type(spath)[0]
            try:
                self.spine.append(SpineItem(spath, mime_type=mt))
            except Exception:
                self.log.warn('Missing spine item:', repr(spath))
        cover = self.opf.cover
        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover:
            # Generate a title page showing the cover and put it first
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [SpineItem(cfile,
                mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)
        if self.opf.path_to_html_toc is not None and \
           self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(SpineItem(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()
        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        # Page numbers start at 1 and run on across the spine
        start = 1
        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc
        self.read_bookmarks()
        return self
    def parse_bookmarks(self, raw):
        for line in raw.splitlines():
            bm = None
            if line.count('^') > 0:
                tokens = line.rpartition('^')
                title, ref = tokens[0], tokens[2]
                try:
                    spine, _, pos = ref.partition('#')
                    spine = int(spine.strip())
                except:
                    continue
                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
            elif BM_FIELD_SEP in line:
                try:
                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
                    spine = int(spine)
                except:
                    continue
                # Unescape from serialization
                pos = pos.replace(BM_LEGACY_ESC, u'^')
                # Check for pos being a scroll fraction
                try:
                    pos = float(pos)
                except:
                    pass
                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}
            if bm:
                self.bookmarks.append(bm)
    def serialize_bookmarks(self, bookmarks):
        dat = []
        for bm in bookmarks:
            if bm['type'] == 'legacy':
                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
            else:
                pos = bm['pos']
                if isinstance(pos, (int, float)):
                    pos = unicode(pos)
                else:
                    pos = pos.replace(u'^', BM_LEGACY_ESC)
                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
            dat.append(rec)
        return (u'\n'.join(dat) +u'\n')
    def read_bookmarks(self):
        '''
        Reset self.bookmarks and fill it from META-INF/calibre_bookmarks.txt
        under self.base if that file exists, otherwise from the value stored
        in self.config for this book.
        '''
        self.bookmarks = []
        stored_in_book = os.path.join(self.base, 'META-INF',
                'calibre_bookmarks.txt')
        if os.path.exists(stored_in_book):
            with open(stored_in_book, 'rb') as stream:
                data = stream.read()
        else:
            data = self.config['bookmarks_'+self.pathtoebook] or ''
        if not isinstance(data, unicode):
            data = data.decode('utf-8')
        self.parse_bookmarks(data)
    def save_bookmarks(self, bookmarks=None):
        if bookmarks is None:
            bookmarks = self.bookmarks
        dat = self.serialize_bookmarks(bookmarks)
        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
            os.access(self.pathtoebook, os.R_OK):
            try:
                zf = open(self.pathtoebook, 'r+b')
            except IOError:
                return
            safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
                    StringIO(dat.encode('utf-8')),
                    add_missing=True)
        else:
            self.config['bookmarks_'+self.pathtoebook] = dat
    def add_bookmark(self, bm):
        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
                bm['title']]
        self.bookmarks.append(bm)
        self.save_bookmarks()
    def set_bookmarks(self, bookmarks):
        self.bookmarks = bookmarks
    def __exit__(self, *args):
        self._tdir.__exit__(*args)
        for x in self.delete_on_exit:
            if os.path.exists(x):
                os.remove(x)
 def get_preprocess_html(path_to_ebook, output):
    '''
    Run only the input plugin on the book at path_to_ebook, pass every spine
    file through HTMLPreProcessor and write the results to the file output,
    separated by a line of 80 dashes.
    '''
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    iterator = EbookIterator(path_to_ebook)
    iterator.__enter__(only_input_plugin=True)
    try:
        preprocessor = HTMLPreProcessor(None, False)
        with open(output, 'wb') as out:
            for path in iterator.spine:
                with open(path, 'rb') as f:
                    html = f.read().decode('utf-8', 'replace')
                html = preprocessor(html, get_preprocess_html=True)
                out.write(html.encode('utf-8'))
                out.write(b'\n\n' + b'-'*80 + b'\n\n')
    finally:
        # Without this the temporary directory created by __enter__ is
        # never cleaned up
        iterator.__exit__(None, None, None)
--- a/src/calibre/ebooks/oeb/iterator/__init__.py
+++ b/src/calibre/ebooks/oeb/iterator/__init__.py
@ -0,0 +1,42 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import os, re
 from calibre.customize.ui import available_input_formats
 def is_supported(path):
    ''' Return True if the extension of path, lower-cased and with any
    htm, html, xhtm or xhtml in it replaced by html, is one of
    available_input_formats(). '''
    suffix = os.path.splitext(path)[1]
    fmt = re.sub(r'(x{0,1})htm(l{0,1})', 'html',
            suffix.replace('.', '').lower())
    return fmt in available_input_formats()
 class UnsupportedFormatError(Exception):
    ''' Error whose message says that books of the given format are not
    supported. '''
    def __init__(self, fmt):
        # _ is not defined in this file; presumably the translation function
        # installed as a builtin -- TODO confirm
        template = _('%s format books are not supported')
        Exception.__init__(self, template % fmt.upper())
 def EbookIterator(*args, **kwargs):
    ''' For backwards compatibility: create and return a
    calibre.ebooks.oeb.iterator.book.EbookIterator from the given
    arguments. The import is done at call time. '''
    from calibre.ebooks.oeb.iterator import book
    return book.EbookIterator(*args, **kwargs)
 def get_preprocess_html(path_to_ebook, output):
    '''
    Run only the input plugin on the book at path_to_ebook, pass every spine
    file through HTMLPreProcessor and write the results to the file output,
    separated by a line of 80 dashes. Character counting and anchor map
    reading are turned off for the spine items.
    '''
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    iterator = EbookIterator(path_to_ebook)
    iterator.__enter__(only_input_plugin=True, run_char_count=False,
            read_anchor_map=False)
    try:
        preprocessor = HTMLPreProcessor(None, False)
        with open(output, 'wb') as out:
            for path in iterator.spine:
                with open(path, 'rb') as f:
                    html = f.read().decode('utf-8', 'replace')
                html = preprocessor(html, get_preprocess_html=True)
                out.write(html.encode('utf-8'))
                out.write(b'\n\n' + b'-'*80 + b'\n\n')
    finally:
        # Without this the temporary directory created by __enter__ is
        # never cleaned up
        iterator.__exit__(None, None, None)
--- a/src/calibre/ebooks/oeb/iterator/book.py
+++ b/src/calibre/ebooks/oeb/iterator/book.py
@ -0,0 +1,186 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 '''
 Iterate over the HTML files in an ebook. Useful for writing viewers.
 '''
 import re, os, math
 from functools import partial
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.config import DynamicConfig
 from calibre.utils.logging import default_log
 from calibre import (guess_type, prepare_string_for_xml,
        xml_replace_entities)
 from calibre.ebooks.oeb.transforms.cover import CoverManager
 from calibre.ebooks.oeb.iterator.spine import (SpineItem, create_indexing_data)
 from calibre.ebooks.oeb.iterator.bookmarks import BookmarksMixin
 TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\
        '__ar__', 'none').replace('__viewbox__', '0 0 600 800'
        ).replace('__width__', '600').replace('__height__', '800')
 class FakeOpts(object):
    ''' Plain holder of four option values, stored as class attributes. '''
    input_encoding = None
    max_levels = 5
    breadth_first = False
    verbose = 0
 def write_oebbook(oeb, path):
    ''' Write oeb to path using OEBWriter. Return the first file name ending
    in .opf that walk(path) yields, None if there is no such file. '''
    from calibre.ebooks.oeb.writer import OEBWriter
    from calibre import walk
    OEBWriter()(oeb, path)
    for candidate in walk(path):
        if not candidate.endswith('.opf'):
            continue
        return candidate
 class EbookIterator(BookmarksMixin):
    CHARACTERS_PER_PAGE = 1000
    def __init__(self, pathtoebook, log=None):
        '''
        :param pathtoebook: Path to the book; surrounding whitespace is
            stripped and the absolute path is stored as self.pathtoebook.
        :param log: Log object to use; default_log is used when it is falsy.
        '''
        self.log = log if log else default_log
        stripped = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(stripped)
        self.config = DynamicConfig(name='iterator')
        suffix = os.path.splitext(stripped)[1]
        # Normalise htm/xhtm/xhtml to html
        fmt = re.sub(r'(x{0,1})htm(l{0,1})', 'html',
                suffix.replace('.', '').lower())
        self.ebook_ext = fmt.replace('original_', '')
    def search(self, text, index, backwards=False):
        '''
        Case-insensitively look for text in the spine files after index (or
        before it, nearest first, when backwards is True).

        :return: The spine index of the first file containing text, or None.
        '''
        needle = prepare_string_for_xml(text.lower())
        candidates = list(enumerate(self.spine))
        if backwards:
            candidates.reverse()
        for pos, item in candidates:
            # Only files strictly beyond index in the search direction
            wanted = pos < index if backwards else pos > index
            if not wanted:
                continue
            with open(item, 'rb') as stream:
                haystack = stream.read().decode(item.encoding)
            try:
                haystack = xml_replace_entities(haystack)
            except:
                pass
            if needle in haystack.lower():
                return pos
    def __enter__(self, processed=False, only_input_plugin=False,
            run_char_count=True, read_anchor_map=True):
        ''' Convert an ebook file into an exploded OEB book suitable for
        display in viewers/preprocessing etc.

        :param processed: If True, also run create_oebbook() on the output of
            the input plugin. This is done regardless for pdb, pdf and rb
            input when the plugin output has no manifest attribute.
        :param only_input_plugin: If True, create_oebbook() is never run.
        :param run_char_count: Passed on to every SpineItem created.
        :param read_anchor_map: Passed on to every SpineItem created.
        :return: self
        '''
        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
        self.delete_on_exit = []
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base  = self._tdir.__enter__()
        plumber = Plumber(self.pathtoebook, self.base, self.log)
        plumber.setup_options()
        if self.pathtoebook.lower().endswith('.opf'):
            plumber.opts.dont_package = True
        if hasattr(plumber.opts, 'no_process'):
            plumber.opts.no_process = True
        plumber.input_plugin.for_viewer = True
        # The input stream is closed as soon as the plugin has run
        with plumber.input_plugin, open(plumber.input, 'rb') as inf:
            self.pathtoopf = plumber.input_plugin(inf,
                plumber.opts, plumber.input_fmt, self.log,
                {}, self.base)
        if not only_input_plugin:
            # Run the HTML preprocess/parsing from the conversion pipeline as
            # well
            if (processed or plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'}
                    and not hasattr(self.pathtoopf, 'manifest')):
                # create_oebbook() is given a path, so a book object returned
                # by the plugin is written to disk first
                if hasattr(self.pathtoopf, 'manifest'):
                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
                self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
                        plumber.opts)
        # From here on self.pathtoopf must be the path of an OPF file
        if hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
        if getattr(plumber.input_plugin, 'is_kf8', False):
            self.book_format = 'KF8'
        # Use the OPF object supplied by the plugin, if any, instead of
        # parsing the OPF file
        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
        if self.opf is None:
            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.language = self.opf.language
        if self.language:
            self.language = self.language.lower()
        # Linear spine items first, non-linear ones after them
        ordered = [i for i in self.opf.spine if i.is_linear] + \
                  [i for i in self.opf.spine if not i.is_linear]
        self.spine = []
        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
                run_char_count=run_char_count)
        for i in ordered:
            spath = i.path
            mt = None
            if i.idref is not None:
                mt = self.opf.manifest.type_for_id(i.idref)
            if mt is None:
                mt = guess_type(spath)[0]
            try:
                self.spine.append(Spiny(spath, mime_type=mt))
            except:
                # NOTE(review): any failure while creating the item is
                # reported as a missing file
                self.log.warn('Missing spine item:', repr(spath))
        cover = self.opf.cover
        if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2',
                'azw', 'azw3'}:
            # Generate a title page showing the cover and put it first in the
            # spine
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [Spiny(cfile,
                mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)
        # An HTML ToC that is not part of the spine goes at the end
        if self.opf.path_to_html_toc is not None and \
           self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(Spiny(self.opf.path_to_html_toc))
            except:
                import traceback
                traceback.print_exc()
        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        # Page numbers start at 1 and run on across the spine
        start = 1
        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc
        create_indexing_data(self.spine, self.toc)
        self.read_bookmarks()
        return self
    def __exit__(self, *args):
        ''' Exit the temporary directory context, then try to remove every
        file listed in self.delete_on_exit. '''
        self._tdir.__exit__(*args)
        for x in self.delete_on_exit:
            try:
                os.remove(x)
            except Exception:
                # Best effort only (e.g. the file may no longer exist), but
                # do not swallow KeyboardInterrupt/SystemExit
                pass
--- a/src/calibre/ebooks/oeb/iterator/bookmarks.py
+++ b/src/calibre/ebooks/oeb/iterator/bookmarks.py
@ -0,0 +1,105 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import os
 from io import BytesIO
 from calibre.utils.zipfile import safe_replace
 BM_FIELD_SEP = u'*|!|?|*'
 BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'
 class BookmarksMixin(object):
    '''
    Bookmark parsing, serialization and storage. The methods use the
    attributes self.base, self.config and self.pathtoebook, none of which is
    set by this class, and keep the bookmarks in self.bookmarks.
    '''
    def parse_bookmarks(self, raw):
        '''
        Parse serialized bookmarks, one per line, and append each bookmark
        that parses successfully to self.bookmarks. Lines that cannot be
        parsed are skipped.

        Two formats are recognised: the legacy ``title^spine#pos`` and
        ``title``, ``spine``, ``pos`` joined by BM_FIELD_SEP.
        '''
        for line in raw.splitlines():
            bm = None
            if '^' in line:
                # Split on the last ^ as the title may itself contain ^
                title, ref = line.rpartition('^')[::2]
                spine, sep, pos = ref.partition('#')
                try:
                    spine = int(spine.strip())
                except ValueError:
                    continue
                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
            elif BM_FIELD_SEP in line:
                try:
                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
                    spine = int(spine)
                except ValueError:
                    # Wrong number of fields or a non-integer spine index
                    continue
                # Unescape from serialization
                pos = pos.replace(BM_LEGACY_ESC, u'^')
                # Check for pos being a scroll fraction
                try:
                    pos = float(pos)
                except ValueError:
                    pass
                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}
            if bm:
                self.bookmarks.append(bm)
    def serialize_bookmarks(self, bookmarks):
        '''
        Return bookmarks as text, one bookmark per line, each line terminated
        by a newline. Legacy bookmarks are written as ``title^spine#pos``,
        all others as title, spine and pos joined by BM_FIELD_SEP.
        '''
        dat = []
        for bm in bookmarks:
            if bm['type'] == 'legacy':
                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
            else:
                pos = bm['pos']
                if isinstance(pos, (int, float)):
                    pos = unicode(pos)
                else:
                    # ^ marks legacy lines, so it is escaped here
                    pos = pos.replace(u'^', BM_LEGACY_ESC)
                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
            dat.append(rec)
        return (u'\n'.join(dat) +u'\n')
    def read_bookmarks(self):
        '''
        Reset self.bookmarks and fill it from META-INF/calibre_bookmarks.txt
        under self.base if that file exists, otherwise from the value stored
        in self.config for this book.
        '''
        self.bookmarks = []
        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
        raw = ''
        if os.path.exists(bmfile):
            with open(bmfile, 'rb') as f:
                raw = f.read()
        else:
            saved = self.config['bookmarks_'+self.pathtoebook]
            if saved:
                raw = saved
        if not isinstance(raw, unicode):
            raw = raw.decode('utf-8')
        self.parse_bookmarks(raw)
    def save_bookmarks(self, bookmarks=None):
        '''
        Serialize bookmarks (self.bookmarks when None) and store them: inside
        the book as META-INF/calibre_bookmarks.txt for a readable .epub file,
        in self.config otherwise. Nothing is stored if the epub cannot be
        opened for updating.
        '''
        if bookmarks is None:
            bookmarks = self.bookmarks
        dat = self.serialize_bookmarks(bookmarks)
        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
            os.access(self.pathtoebook, os.R_OK):
            try:
                zf = open(self.pathtoebook, 'r+b')
            except IOError:
                return
            # Make sure the handle on the book is released when done
            with zf:
                safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
                        BytesIO(dat.encode('utf-8')),
                        add_missing=True)
        else:
            self.config['bookmarks_'+self.pathtoebook] = dat
    def add_bookmark(self, bm):
        ''' Add bm, replacing any existing bookmarks with the same title, and
        save the bookmarks. '''
        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
                bm['title']]
        self.bookmarks.append(bm)
        self.save_bookmarks()
    def set_bookmarks(self, bookmarks):
        ''' Replace the current bookmarks with bookmarks. Nothing is saved. '''
        self.bookmarks = bookmarks
--- a/src/calibre/ebooks/oeb/iterator/spine.py
+++ b/src/calibre/ebooks/oeb/iterator/spine.py
@ -0,0 +1,117 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 from future_builtins import map
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import re, os
 from functools import partial
 from operator import attrgetter
 from collections import namedtuple
 from calibre import guess_type
 from calibre.ebooks.chardet import xml_to_unicode
 def character_count(html):
    ''' Return the number of "significant" text characters in a HTML string:
    the characters between a > and the next <, with each run of whitespace
    counted once. '''
    total = 0
    for chunk in re.findall(r'>[^<]+<', html):
        collapsed = re.sub(r'\s+', ' ', chunk)
        # Do not count the enclosing > and <
        total += len(collapsed) - 2
    return total
 def anchor_map(html):
    ''' Return map of all anchor names to their offsets in the html. An
    anchor name is the value of an id or name attribute; the offset is that
    of the start of the attribute. For a name that occurs more than once,
    the first offset is kept. '''
    ans = {}
    for match in re.finditer(
        r'''(?:id|name)\s*=\s*['"]([^'"]+)['"]''', html):
        # group(1) is the bare anchor name. group(0) is the whole attribute
        # text (id="x"), which a lookup by anchor name can never find.
        ans.setdefault(match.group(1), match.start())
    return ans
 class SpineItem(unicode):
    '''
    The path of a spine file as a unicode string, annotated with data read
    from the file: ``encoding`` (as reported by xml_to_unicode),
    ``character_count``, ``anchor_map`` and ``mime_type``. ``start_page``,
    ``pages`` and ``max_page`` start out as -1, ``index_entries`` as an empty
    list.
    '''
    def __new__(cls, path, mime_type=None, read_anchor_map=True,
            run_char_count=True):
        '''
        :param path: Path to the file. If it does not exist but the part
            before the first ``#`` does, that part is used instead.
        :param mime_type: MIME type of the file; when None it is taken from
            guess_type().
        :param read_anchor_map: When False, anchor_map is left empty.
        :param run_char_count: When False, character_count is set to 10000
            instead of being computed.
        '''
        without_fragment = path.partition('#')[0]
        if not os.path.exists(path) and os.path.exists(without_fragment):
            path = without_fragment
        item = super(SpineItem, cls).__new__(cls, path)
        with open(path, 'rb') as stream:
            data = stream.read()
        data, item.encoding = xml_to_unicode(data)
        if run_char_count:
            item.character_count = character_count(data)
        else:
            item.character_count = 10000
        if read_anchor_map:
            item.anchor_map = anchor_map(data)
        else:
            item.anchor_map = {}
        item.start_page = item.pages = item.max_page = -1
        item.index_entries = []
        item.mime_type = guess_type(item)[0] if mime_type is None else mime_type
        return item
 class IndexEntry(object):
    '''
    The position of one Table of Contents entry in the spine.

    ``spine_pos`` is the index in the spine of the file the entry points to,
    or -1 if that file is not in the spine. ``anchor_pos`` is the offset of
    the entry's anchor as recorded in the anchor_map of that file, 0 if
    unknown. ``depth`` is the number of ancestors of the ToC entry.
    '''
    def __init__(self, spine, toc_entry, num):
        '''
        :param spine: The list of spine items.
        :param toc_entry: The ToC entry; its text, abspath, fragment and
            parent attributes are used.
        :param num: Stored as self.num.
        '''
        self.num = num
        self.text = toc_entry.text or _('Unknown')
        self.key = toc_entry.abspath
        self.anchor = self.start_anchor = toc_entry.fragment or None
        try:
            self.spine_pos = spine.index(self.key)
        except ValueError:
            # list.index() raises rather than returning -1 when the file is
            # not in the spine
            self.spine_pos = -1
        self.anchor_pos = 0
        if self.spine_pos > -1:
            self.anchor_pos = spine[self.spine_pos].anchor_map.get(self.anchor,
                    0)
        self.depth = 0
        p = toc_entry.parent
        while p is not None:
            self.depth += 1
            p = p.parent
        self.sort_key = (self.spine_pos, self.anchor_pos)
        self.spine_count = len(spine)
    def find_end(self, all_entries):
        '''
        Set self.end_spine_pos and self.end_anchor from the first entry in
        all_entries that is at the same or a shallower depth and is located
        after this one. If there is no such entry, this entry ends at the
        last spine item, with no end anchor.

        :param all_entries: All entries, sorted by sort_key.
        '''
        potential_enders = [i for i in all_entries if
                i.depth <= self.depth and
                (
                    (i.spine_pos == self.spine_pos and i.anchor_pos >
                                                            self.anchor_pos)
                    or
                    i.spine_pos > self.spine_pos
                )]
        if potential_enders:
            # potential_enders is sorted by (spine_pos, anchor_pos)
            end = potential_enders[0]
            self.end_spine_pos = end.spine_pos
            self.end_anchor = end.anchor
        else:
            self.end_spine_pos = self.spine_count - 1
            self.end_anchor = None
 def create_indexing_data(spine, toc):
    '''
    Fill the index_entries list of every spine item with one
    (entry, start_anchor, end_anchor) tuple for each ToC entry that touches
    that file. start_anchor is set only in the file where the entry starts
    and end_anchor only in the file where it ends; both are None elsewhere.

    Does nothing if toc is empty or None.
    '''
    if not toc:
        return
    f = partial(IndexEntry, spine)
    # toc itself is left out; the remaining nodes are numbered by their
    # position in toc.flat(), minus one
    index_entries = list(map(f,
        (t for t in toc.flat() if t is not toc),
        (i-1 for i, t in enumerate(toc.flat()) if t is not toc)
        ))
    index_entries.sort(key=attrgetter('sort_key'))
    for entry in index_entries:
        entry.find_end(index_entries)
    ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')
    for spine_pos, spine_item in enumerate(spine):
        for i in index_entries:
            if i.end_spine_pos < spine_pos or i.spine_pos > spine_pos:
                continue # Does not touch this file
            start = i.anchor if i.spine_pos == spine_pos else None
            # The end anchor is in the file where the entry ends, which is
            # not necessarily the file where it starts
            end = i.end_anchor if i.end_spine_pos == spine_pos else None
            spine_item.index_entries.append(ie(i, start, end))
--- a/src/calibre/gui2/viewer/main.py
+++ b/src/calibre/gui2/viewer/main.py
@ -18,7 +18,7 @@ from calibre.gui2.widgets import ProgressIndicator
 from calibre.gui2.main_window import MainWindow
 from calibre.gui2 import (Application, ORG_NAME, APP_UID, choose_files,
    info_dialog, error_dialog, open_url, available_height)
-from calibre.ebooks.oeb.iterator import EbookIterator
+from calibre.ebooks.oeb.iterator.book import EbookIterator
 from calibre.ebooks import DRMError
 from calibre.constants import islinux, isbsd, isosx, filesystem_encoding
 from calibre.utils.config import Config, StringConfig, JSONConfig
@ -802,11 +802,13 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
            if not title:
                title = os.path.splitext(os.path.basename(pathtoebook))[0]
            if self.iterator.toc:
-                self.toc_model = TOC(self.iterator.toc)
+                self.toc_model = TOC(self.iterator.spine, self.iterator.toc)
                self.toc.setModel(self.toc_model)
                if self.show_toc_on_open:
                    self.action_table_of_contents.setChecked(True)
            else:
                self.toc_model = TOC(self.iterator.spine)
                self.toc.setModel(self.toc_model)
                self.action_table_of_contents.setChecked(False)
            if isbytestring(pathtoebook):
                pathtoebook = force_unicode(pathtoebook, filesystem_encoding)
--- a/src/calibre/gui2/viewer/toc.py
+++ b/src/calibre/gui2/viewer/toc.py
@ -8,9 +8,10 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import re
 from PyQt4.Qt import QStandardItem, QStandardItemModel, Qt
 from calibre.ebooks.metadata.toc import TOC as MTOC
 class TOCItem(QStandardItem):
    def __init__(self, toc):
@ -30,8 +31,10 @@ class TOCItem(QStandardItem):
 class TOC(QStandardItemModel):
-    def __init__(self, toc):
+    def __init__(self, spine, toc=None):
        QStandardItemModel.__init__(self)
        if toc is None:
            toc = MTOC()
        for t in toc:
            self.appendRow(TOCItem(t))
        self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents')))