Refactor oeb iterator and implement logic to collect indexing data

2025-07-09 03:04:10 -04:00 · 2012-05-07 22:39:59 +05:30 · 2012-05-07 22:39:59 +05:30 · d993fbc91f
commit d993fbc91f
parent 820ba7ec7d
7 changed files with 459 additions and 387 deletions
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@ -1,383 +0,0 @@
-from __future__ import with_statement
-__license__   = 'GPL v3'
-__copyright__ = '2008 Kovid Goyal <kovid at kovidgoyal.net>'
-
-'''
-Iterate over the HTML files in an ebook. Useful for writing viewers.
-'''
-
-import re, os, math
-from cStringIO import StringIO
-
-from PyQt4.Qt import QFontDatabase
-
-from calibre.customize.ui import available_input_formats
-from calibre.ebooks.metadata.opf2 import OPF
-from calibre.ptempfile import TemporaryDirectory
-from calibre.ebooks.chardet import xml_to_unicode
-from calibre.utils.zipfile import safe_replace
-from calibre.utils.config import DynamicConfig
-from calibre.utils.logging import Log
-from calibre import (guess_type, prints, prepare_string_for_xml,
-        xml_replace_entities)
-from calibre.ebooks.oeb.transforms.cover import CoverManager
-from calibre.constants import filesystem_encoding
-
-TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\
-        '__ar__', 'none').replace('__viewbox__', '0 0 600 800'
-        ).replace('__width__', '600').replace('__height__', '800')
-BM_FIELD_SEP = u'*|!|?|*'
-BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'
-
-def character_count(html):
-    '''
-    Return the number of "significant" text characters in a HTML string.
-    '''
-    count = 0
-    strip_space = re.compile(r'\s+')
-    for match in re.finditer(r'>[^<]+<', html):
-        count += len(strip_space.sub(' ', match.group()))-2
-    return count
-
-class UnsupportedFormatError(Exception):
-
-    def __init__(self, fmt):
-        Exception.__init__(self, _('%s format books are not supported')%fmt.upper())
-
-class SpineItem(unicode):
-
-    def __new__(cls, path, mime_type=None):
-        ppath = path.partition('#')[0]
-        if not os.path.exists(path) and os.path.exists(ppath):
-            path = ppath
-        obj = super(SpineItem, cls).__new__(cls, path)
-        raw = open(path, 'rb').read()
-        raw, obj.encoding = xml_to_unicode(raw)
-        obj.character_count = character_count(raw)
-        obj.start_page = -1
-        obj.pages      = -1
-        obj.max_page   = -1
-        if mime_type is None:
-            mime_type = guess_type(obj)[0]
-        obj.mime_type = mime_type
-        return obj
-
-class FakeOpts(object):
-    verbose = 0
-    breadth_first = False
-    max_levels = 5
-    input_encoding = None
-
-def is_supported(path):
-    ext = os.path.splitext(path)[1].replace('.', '').lower()
-    ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
-    return ext in available_input_formats()
-
-
-def write_oebbook(oeb, path):
-    from calibre.ebooks.oeb.writer import OEBWriter
-    from calibre import walk
-    w = OEBWriter()
-    w(oeb, path)
-    for f in walk(path):
-        if f.endswith('.opf'):
-            return f
-
-class EbookIterator(object):
-
-    CHARACTERS_PER_PAGE = 1000
-
-    def __init__(self, pathtoebook, log=None):
-        self.log = log
-        if log is None:
-            self.log = Log()
-        pathtoebook = pathtoebook.strip()
-        self.pathtoebook = os.path.abspath(pathtoebook)
-        self.config = DynamicConfig(name='iterator')
-        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
-        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
-        self.ebook_ext = ext.replace('original_', '')
-
-    def search(self, text, index, backwards=False):
-        text = prepare_string_for_xml(text.lower())
-        pmap = [(i, path) for i, path in enumerate(self.spine)]
-        if backwards:
-            pmap.reverse()
-        for i, path in pmap:
-            if (backwards and i < index) or (not backwards and i > index):
-                with open(path, 'rb') as f:
-                    raw = f.read().decode(path.encoding)
-                try:
-                    raw = xml_replace_entities(raw)
-                except:
-                    pass
-                if text in raw.lower():
-                    return i
-
-    def find_missing_css_files(self):
-        for x in os.walk(os.path.dirname(self.pathtoopf)):
-            for f in x[-1]:
-                if f.endswith('.css'):
-                    yield os.path.join(x[0], f)
-
-    def find_declared_css_files(self):
-        for item in self.opf.manifest:
-            if item.mime_type and 'css' in item.mime_type.lower():
-                yield item.path
-
-    def find_embedded_fonts(self):
-        '''
-        This will become unnecessary once Qt WebKit supports the @font-face rule.
-        '''
-        css_files = set(self.find_declared_css_files())
-        if not css_files:
-            css_files = set(self.find_missing_css_files())
-        bad_map = {}
-        font_family_pat = re.compile(r'font-family\s*:\s*([^;]+)')
-        for csspath in css_files:
-            try:
-                css = open(csspath, 'rb').read().decode('utf-8', 'replace')
-            except:
-                continue
-            for match in re.compile(r'@font-face\s*{([^}]+)}').finditer(css):
-                block  = match.group(1)
-                family = font_family_pat.search(block)
-                url    = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL).search(block)
-                if url:
-                    path = url.group(1).split('/')
-                    path = os.path.join(os.path.dirname(csspath), *path)
-                    if not os.access(path, os.R_OK):
-                        continue
-                    id = QFontDatabase.addApplicationFont(path)
-                    if id != -1:
-                        families = [unicode(f) for f in QFontDatabase.applicationFontFamilies(id)]
-                        if family:
-                            family = family.group(1)
-                            specified_families = [x.strip().replace('"',
-                                '').replace("'", '') for x in family.split(',')]
-                            aliasing_ok = False
-                            for f in specified_families:
-                                bad_map[f] = families[0]
-                                if not aliasing_ok and f in families:
-                                    aliasing_ok = True
-
-                            if not aliasing_ok:
-                                prints('WARNING: Family aliasing not fully supported.')
-                                prints('\tDeclared family: %r not in actual families: %r'
-                                        % (family, families))
-                            else:
-                                prints('Loaded embedded font:', repr(family))
-        if bad_map:
-            def prepend_embedded_font(match):
-                for bad, good in bad_map.items():
-                    if bad in match.group(1):
-                        prints('Substituting font family: %s -> %s'%(bad, good))
-                        return match.group().replace(bad, '"%s"'%good)
-
-            from calibre.ebooks.chardet import force_encoding
-            for csspath in css_files:
-                with open(csspath, 'r+b') as f:
-                    css = f.read()
-                    enc = force_encoding(css, False)
-                    css = css.decode(enc, 'replace')
-                    ncss = font_family_pat.sub(prepend_embedded_font, css)
-                    if ncss != css:
-                        f.seek(0)
-                        f.truncate()
-                        f.write(ncss.encode(enc))
-
-    def __enter__(self, processed=False, only_input_plugin=False):
-        self.delete_on_exit = []
-        self._tdir = TemporaryDirectory('_ebook_iter')
-        self.base  = self._tdir.__enter__()
-        if not isinstance(self.base, unicode):
-            self.base = self.base.decode(filesystem_encoding)
-        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
-        plumber = Plumber(self.pathtoebook, self.base, self.log)
-        plumber.setup_options()
-        if self.pathtoebook.lower().endswith('.opf'):
-            plumber.opts.dont_package = True
-        if hasattr(plumber.opts, 'no_process'):
-            plumber.opts.no_process = True
-
-        plumber.input_plugin.for_viewer = True
-        with plumber.input_plugin:
-            self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
-                plumber.opts, plumber.input_fmt, self.log,
-                {}, self.base)
-
-        if not only_input_plugin:
-            if processed or plumber.input_fmt.lower() in ('pdb', 'pdf', 'rb') and \
-                    not hasattr(self.pathtoopf, 'manifest'):
-                if hasattr(self.pathtoopf, 'manifest'):
-                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
-                self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
-                        plumber.opts)
-
-        if hasattr(self.pathtoopf, 'manifest'):
-            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
-
-        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
-        if getattr(plumber.input_plugin, 'is_kf8', False):
-            self.book_format = 'KF8'
-
-        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
-        if self.opf is None:
-            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
-        self.language = self.opf.language
-        if self.language:
-            self.language = self.language.lower()
-        ordered = [i for i in self.opf.spine if i.is_linear] + \
-                  [i for i in self.opf.spine if not i.is_linear]
-        self.spine = []
-        for i in ordered:
-            spath = i.path
-            mt = None
-            if i.idref is not None:
-                mt = self.opf.manifest.type_for_id(i.idref)
-            if mt is None:
-                mt = guess_type(spath)[0]
-            try:
-                self.spine.append(SpineItem(spath, mime_type=mt))
-            except:
-                self.log.warn('Missing spine item:', repr(spath))
-
-        cover = self.opf.cover
-        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover:
-            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
-            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
-            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
-            open(cfile, 'wb').write(chtml)
-            self.spine[0:0] = [SpineItem(cfile,
-                mime_type='application/xhtml+xml')]
-            self.delete_on_exit.append(cfile)
-
-        if self.opf.path_to_html_toc is not None and \
-           self.opf.path_to_html_toc not in self.spine:
-            try:
-                self.spine.append(SpineItem(self.opf.path_to_html_toc))
-            except:
-                import traceback
-                traceback.print_exc()
-
-
-        sizes = [i.character_count for i in self.spine]
-        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
-        for p, s in zip(self.pages, self.spine):
-            s.pages = p
-        start = 1
-
-        for s in self.spine:
-            s.start_page = start
-            start += s.pages
-            s.max_page = s.start_page + s.pages - 1
-        self.toc = self.opf.toc
-
-        self.read_bookmarks()
-
-        return self
-
-    def parse_bookmarks(self, raw):
-        for line in raw.splitlines():
-            bm = None
-            if line.count('^') > 0:
-                tokens = line.rpartition('^')
-                title, ref = tokens[0], tokens[2]
-                try:
-                    spine, _, pos = ref.partition('#')
-                    spine = int(spine.strip())
-                except:
-                    continue
-                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
-            elif BM_FIELD_SEP in line:
-                try:
-                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
-                    spine = int(spine)
-                except:
-                    continue
-                # Unescape from serialization
-                pos = pos.replace(BM_LEGACY_ESC, u'^')
-                # Check for pos being a scroll fraction
-                try:
-                    pos = float(pos)
-                except:
-                    pass
-                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}
-
-            if bm:
-                self.bookmarks.append(bm)
-
-    def serialize_bookmarks(self, bookmarks):
-        dat = []
-        for bm in bookmarks:
-            if bm['type'] == 'legacy':
-                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
-            else:
-                pos = bm['pos']
-                if isinstance(pos, (int, float)):
-                    pos = unicode(pos)
-                else:
-                    pos = pos.replace(u'^', BM_LEGACY_ESC)
-                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
-            dat.append(rec)
-        return (u'\n'.join(dat) +u'\n')
-
-    def read_bookmarks(self):
-        self.bookmarks = []
-        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
-        raw = ''
-        if os.path.exists(bmfile):
-            with open(bmfile, 'rb') as f:
-                raw = f.read()
-        else:
-            saved = self.config['bookmarks_'+self.pathtoebook]
-            if saved:
-                raw = saved
-        if not isinstance(raw, unicode):
-            raw = raw.decode('utf-8')
-        self.parse_bookmarks(raw)
-
-    def save_bookmarks(self, bookmarks=None):
-        if bookmarks is None:
-            bookmarks = self.bookmarks
-        dat = self.serialize_bookmarks(bookmarks)
-        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
-            os.access(self.pathtoebook, os.R_OK):
-            try:
-                zf = open(self.pathtoebook, 'r+b')
-            except IOError:
-                return
-            safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
-                    StringIO(dat.encode('utf-8')),
-                    add_missing=True)
-        else:
-            self.config['bookmarks_'+self.pathtoebook] = dat
-
-    def add_bookmark(self, bm):
-        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
-                bm['title']]
-        self.bookmarks.append(bm)
-        self.save_bookmarks()
-
-    def set_bookmarks(self, bookmarks):
-        self.bookmarks = bookmarks
-
-    def __exit__(self, *args):
-        self._tdir.__exit__(*args)
-        for x in self.delete_on_exit:
-            if os.path.exists(x):
-                os.remove(x)
-
-def get_preprocess_html(path_to_ebook, output):
-    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
-    iterator = EbookIterator(path_to_ebook)
-    iterator.__enter__(only_input_plugin=True)
-    preprocessor = HTMLPreProcessor(None, False)
-    with open(output, 'wb') as out:
-        for path in iterator.spine:
-            with open(path, 'rb') as f:
-                html = f.read().decode('utf-8', 'replace')
-            html = preprocessor(html, get_preprocess_html=True)
-            out.write(html.encode('utf-8'))
-            out.write(b'\n\n' + b'-'*80 + b'\n\n')
-
--- a/src/calibre/ebooks/oeb/iterator/init.py
+++ b/src/calibre/ebooks/oeb/iterator/init.py
@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, re
+
+from calibre.customize.ui import available_input_formats
+
+def is_supported(path):
+    ext = os.path.splitext(path)[1].replace('.', '').lower()
+    ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
+    return ext in available_input_formats()
+
+class UnsupportedFormatError(Exception):
+
+    def __init__(self, fmt):
+        Exception.__init__(self, _('%s format books are not supported')%fmt.upper())
+
+def EbookIterator(*args, **kwargs):
+    'For backwards compatibility'
+    from calibre.ebooks.oeb.iterator.book import EbookIterator
+    return EbookIterator(*args, **kwargs)
+
+def get_preprocess_html(path_to_ebook, output):
+    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
+    iterator = EbookIterator(path_to_ebook)
+    iterator.__enter__(only_input_plugin=True, run_char_count=False,
+            read_anchor_map=False)
+    preprocessor = HTMLPreProcessor(None, False)
+    with open(output, 'wb') as out:
+        for path in iterator.spine:
+            with open(path, 'rb') as f:
+                html = f.read().decode('utf-8', 'replace')
+            html = preprocessor(html, get_preprocess_html=True)
+            out.write(html.encode('utf-8'))
+            out.write(b'\n\n' + b'-'*80 + b'\n\n')
+
--- a/src/calibre/ebooks/oeb/iterator/book.py
+++ b/src/calibre/ebooks/oeb/iterator/book.py
@ -0,0 +1,186 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+'''
+Iterate over the HTML files in an ebook. Useful for writing viewers.
+'''
+
+import re, os, math
+from functools import partial
+
+from calibre.ebooks.metadata.opf2 import OPF
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.config import DynamicConfig
+from calibre.utils.logging import default_log
+from calibre import (guess_type, prepare_string_for_xml,
+        xml_replace_entities)
+from calibre.ebooks.oeb.transforms.cover import CoverManager
+
+from calibre.ebooks.oeb.iterator.spine import (SpineItem, create_indexing_data)
+from calibre.ebooks.oeb.iterator.bookmarks import BookmarksMixin
+
+TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\
+        '__ar__', 'none').replace('__viewbox__', '0 0 600 800'
+        ).replace('__width__', '600').replace('__height__', '800')
+
+class FakeOpts(object):
+    verbose = 0
+    breadth_first = False
+    max_levels = 5
+    input_encoding = None
+
+
+def write_oebbook(oeb, path):
+    from calibre.ebooks.oeb.writer import OEBWriter
+    from calibre import walk
+    w = OEBWriter()
+    w(oeb, path)
+    for f in walk(path):
+        if f.endswith('.opf'):
+            return f
+
+class EbookIterator(BookmarksMixin):
+
+    CHARACTERS_PER_PAGE = 1000
+
+    def __init__(self, pathtoebook, log=None):
+        self.log = log or default_log
+        pathtoebook = pathtoebook.strip()
+        self.pathtoebook = os.path.abspath(pathtoebook)
+        self.config = DynamicConfig(name='iterator')
+        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
+        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
+        self.ebook_ext = ext.replace('original_', '')
+
+    def search(self, text, index, backwards=False):
+        text = prepare_string_for_xml(text.lower())
+        pmap = [(i, path) for i, path in enumerate(self.spine)]
+        if backwards:
+            pmap.reverse()
+        for i, path in pmap:
+            if (backwards and i < index) or (not backwards and i > index):
+                with open(path, 'rb') as f:
+                    raw = f.read().decode(path.encoding)
+                try:
+                    raw = xml_replace_entities(raw)
+                except:
+                    pass
+                if text in raw.lower():
+                    return i
+
+    def __enter__(self, processed=False, only_input_plugin=False,
+            run_char_count=True, read_anchor_map=True):
+        ''' Convert an ebook file into an exploded OEB book suitable for
+        display in viewers/preprocessing etc. '''
+
+        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
+
+        self.delete_on_exit = []
+        self._tdir = TemporaryDirectory('_ebook_iter')
+        self.base  = self._tdir.__enter__()
+        plumber = Plumber(self.pathtoebook, self.base, self.log)
+        plumber.setup_options()
+        if self.pathtoebook.lower().endswith('.opf'):
+            plumber.opts.dont_package = True
+        if hasattr(plumber.opts, 'no_process'):
+            plumber.opts.no_process = True
+
+        plumber.input_plugin.for_viewer = True
+        with plumber.input_plugin, open(plumber.input, 'rb') as inf:
+            self.pathtoopf = plumber.input_plugin(inf,
+                plumber.opts, plumber.input_fmt, self.log,
+                {}, self.base)
+
+        if not only_input_plugin:
+            # Run the HTML preprocess/parsing from the conversion pipeline as
+            # well
+            if (processed or plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'}
+                    and not hasattr(self.pathtoopf, 'manifest')):
+                if hasattr(self.pathtoopf, 'manifest'):
+                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
+                self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
+                        plumber.opts)
+
+        if hasattr(self.pathtoopf, 'manifest'):
+            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
+
+        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
+        if getattr(plumber.input_plugin, 'is_kf8', False):
+            self.book_format = 'KF8'
+
+        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
+        if self.opf is None:
+            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
+        self.language = self.opf.language
+        if self.language:
+            self.language = self.language.lower()
+        ordered = [i for i in self.opf.spine if i.is_linear] + \
+                  [i for i in self.opf.spine if not i.is_linear]
+        self.spine = []
+        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
+                run_char_count=run_char_count)
+        for i in ordered:
+            spath = i.path
+            mt = None
+            if i.idref is not None:
+                mt = self.opf.manifest.type_for_id(i.idref)
+            if mt is None:
+                mt = guess_type(spath)[0]
+            try:
+                self.spine.append(Spiny(spath, mime_type=mt))
+            except:
+                self.log.warn('Missing spine item:', repr(spath))
+
+        cover = self.opf.cover
+        if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2',
+                'azw', 'azw3'}:
+            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
+            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
+            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
+            with open(cfile, 'wb') as f:
+                f.write(chtml)
+            self.spine[0:0] = [Spiny(cfile,
+                mime_type='application/xhtml+xml')]
+            self.delete_on_exit.append(cfile)
+
+        if self.opf.path_to_html_toc is not None and \
+           self.opf.path_to_html_toc not in self.spine:
+            try:
+                self.spine.append(Spiny(self.opf.path_to_html_toc))
+            except:
+                import traceback
+                traceback.print_exc()
+
+        sizes = [i.character_count for i in self.spine]
+        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
+        for p, s in zip(self.pages, self.spine):
+            s.pages = p
+        start = 1
+
+        for s in self.spine:
+            s.start_page = start
+            start += s.pages
+            s.max_page = s.start_page + s.pages - 1
+        self.toc = self.opf.toc
+        create_indexing_data(self.spine, self.toc)
+
+        self.read_bookmarks()
+
+        return self
+
+    def __exit__(self, *args):
+        self._tdir.__exit__(*args)
+        for x in self.delete_on_exit:
+            try:
+                os.remove(x)
+            except:
+                pass
+
+
--- a/src/calibre/ebooks/oeb/iterator/bookmarks.py
+++ b/src/calibre/ebooks/oeb/iterator/bookmarks.py
@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+from io import BytesIO
+
+from calibre.utils.zipfile import safe_replace
+
+BM_FIELD_SEP = u'*|!|?|*'
+BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'
+
+class BookmarksMixin(object):
+
+    def parse_bookmarks(self, raw):
+        for line in raw.splitlines():
+            bm = None
+            if line.count('^') > 0:
+                tokens = line.rpartition('^')
+                title, ref = tokens[0], tokens[2]
+                try:
+                    spine, _, pos = ref.partition('#')
+                    spine = int(spine.strip())
+                except:
+                    continue
+                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
+            elif BM_FIELD_SEP in line:
+                try:
+                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
+                    spine = int(spine)
+                except:
+                    continue
+                # Unescape from serialization
+                pos = pos.replace(BM_LEGACY_ESC, u'^')
+                # Check for pos being a scroll fraction
+                try:
+                    pos = float(pos)
+                except:
+                    pass
+                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}
+
+            if bm:
+                self.bookmarks.append(bm)
+
+    def serialize_bookmarks(self, bookmarks):
+        dat = []
+        for bm in bookmarks:
+            if bm['type'] == 'legacy':
+                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
+            else:
+                pos = bm['pos']
+                if isinstance(pos, (int, float)):
+                    pos = unicode(pos)
+                else:
+                    pos = pos.replace(u'^', BM_LEGACY_ESC)
+                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
+            dat.append(rec)
+        return (u'\n'.join(dat) +u'\n')
+
+    def read_bookmarks(self):
+        self.bookmarks = []
+        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
+        raw = ''
+        if os.path.exists(bmfile):
+            with open(bmfile, 'rb') as f:
+                raw = f.read()
+        else:
+            saved = self.config['bookmarks_'+self.pathtoebook]
+            if saved:
+                raw = saved
+        if not isinstance(raw, unicode):
+            raw = raw.decode('utf-8')
+        self.parse_bookmarks(raw)
+
+    def save_bookmarks(self, bookmarks=None):
+        if bookmarks is None:
+            bookmarks = self.bookmarks
+        dat = self.serialize_bookmarks(bookmarks)
+        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
+            os.access(self.pathtoebook, os.R_OK):
+            try:
+                zf = open(self.pathtoebook, 'r+b')
+            except IOError:
+                return
+            safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
+                    BytesIO(dat.encode('utf-8')),
+                    add_missing=True)
+        else:
+            self.config['bookmarks_'+self.pathtoebook] = dat
+
+    def add_bookmark(self, bm):
+        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
+                bm['title']]
+        self.bookmarks.append(bm)
+        self.save_bookmarks()
+
+    def set_bookmarks(self, bookmarks):
+        self.bookmarks = bookmarks
+
+
--- a/src/calibre/ebooks/oeb/iterator/spine.py
+++ b/src/calibre/ebooks/oeb/iterator/spine.py
@ -0,0 +1,117 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from future_builtins import map
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re, os
+from functools import partial
+from operator import attrgetter
+from collections import namedtuple
+
+from calibre import guess_type
+from calibre.ebooks.chardet import xml_to_unicode
+
+def character_count(html):
+    ''' Return the number of "significant" text characters in a HTML string. '''
+    count = 0
+    strip_space = re.compile(r'\s+')
+    for match in re.finditer(r'>[^<]+<', html):
+        count += len(strip_space.sub(' ', match.group()))-2
+    return count
+
+def anchor_map(html):
+    ''' Return map of all anchor names to their offsets in the html '''
+    ans = {}
+    for match in re.finditer(
+        r'''(?:id|name)\s*=\s*['"]([^'"]+)['"]''', html):
+        anchor = match.group(0)
+        ans[anchor] = ans.get(anchor, match.start())
+    return ans
+
+class SpineItem(unicode):
+
+    def __new__(cls, path, mime_type=None, read_anchor_map=True,
+            run_char_count=True):
+        ppath = path.partition('#')[0]
+        if not os.path.exists(path) and os.path.exists(ppath):
+            path = ppath
+        obj = super(SpineItem, cls).__new__(cls, path)
+        with open(path, 'rb') as f:
+            raw = f.read()
+        raw, obj.encoding = xml_to_unicode(raw)
+        obj.character_count = character_count(raw) if run_char_count else 10000
+        obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
+        obj.start_page = -1
+        obj.pages      = -1
+        obj.max_page   = -1
+        obj.index_entries = []
+        if mime_type is None:
+            mime_type = guess_type(obj)[0]
+        obj.mime_type = mime_type
+        return obj
+
+class IndexEntry(object):
+
+    def __init__(self, spine, toc_entry, num):
+        self.num = num
+        self.text = toc_entry.text or _('Unknown')
+        self.key = toc_entry.abspath
+        self.anchor = self.start_anchor = toc_entry.fragment or None
+        self.spine_pos = spine.index(self.key)
+        self.anchor_pos = 0
+        if self.spine_pos > -1:
+            self.anchor_pos = spine[self.spine_pos].anchor_map.get(self.anchor,
+                    0)
+
+        self.depth = 0
+        p = toc_entry.parent
+        while p is not None:
+            self.depth += 1
+            p = p.parent
+
+        self.sort_key = (self.spine_pos, self.anchor_pos)
+        self.spine_count = len(spine)
+
+    def find_end(self, all_entries):
+        potential_enders = [i for i in all_entries if
+                i.depth <= self.depth and
+                (
+                    (i.spine_pos == self.spine_pos and i.anchor_pos >
+                                                            self.anchor_pos)
+                    or
+                    i.spine_pos > self.spine_pos
+                )]
+        if potential_enders:
+            # potential_enders is sorted by (spine_pos, anchor_pos)
+            end = potential_enders[0]
+            self.end_spine_pos = end.spine_pos
+            self.end_anchor = end.anchor
+        else:
+            self.end_spine_pos = self.spine_count - 1
+            self.end_anchor = None
+
+def create_indexing_data(spine, toc):
+    if not toc: return
+    f = partial(IndexEntry, spine)
+    index_entries = list(map(f,
+        (t for t in toc.flat() if t is not toc),
+        (i-1 for i, t in enumerate(toc.flat()) if t is not toc)
+        ))
+    index_entries.sort(key=attrgetter('sort_key'))
+    [ i.find_end(index_entries) for i in index_entries ]
+
+    ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')
+
+    for spine_pos, spine_item in enumerate(spine):
+        for i in index_entries:
+            if i.end_spine_pos < spine_pos or i.spine_pos > spine_pos:
+                continue # Does not touch this file
+            start = i.anchor if i.spine_pos == spine_pos else None
+            end = i.end_anchor if i.spine_pos == spine_pos else None
+            spine_item.index_entries.append(ie(i, start, end))
+
--- a/src/calibre/gui2/viewer/main.py
+++ b/src/calibre/gui2/viewer/main.py
@ -18,7 +18,7 @@ from calibre.gui2.widgets import ProgressIndicator
 from calibre.gui2.main_window import MainWindow
 from calibre.gui2 import (Application, ORG_NAME, APP_UID, choose_files,
    info_dialog, error_dialog, open_url, available_height)
-from calibre.ebooks.oeb.iterator import EbookIterator
+from calibre.ebooks.oeb.iterator.book import EbookIterator
 from calibre.ebooks import DRMError
 from calibre.constants import islinux, isbsd, isosx, filesystem_encoding
 from calibre.utils.config import Config, StringConfig, JSONConfig
@ -802,11 +802,13 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
            if not title:
                title = os.path.splitext(os.path.basename(pathtoebook))[0]
            if self.iterator.toc:
-                self.toc_model = TOC(self.iterator.toc)
+                self.toc_model = TOC(self.iterator.spine, self.iterator.toc)
                self.toc.setModel(self.toc_model)
                if self.show_toc_on_open:
                    self.action_table_of_contents.setChecked(True)
            else:
+                self.toc_model = TOC(self.iterator.spine)
+                self.toc.setModel(self.toc_model)
                self.action_table_of_contents.setChecked(False)
            if isbytestring(pathtoebook):
                pathtoebook = force_unicode(pathtoebook, filesystem_encoding)
--- a/src/calibre/gui2/viewer/toc.py
+++ b/src/calibre/gui2/viewer/toc.py
@ -8,9 +8,10 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import re
-
 from PyQt4.Qt import QStandardItem, QStandardItemModel, Qt

+from calibre.ebooks.metadata.toc import TOC as MTOC
+
 class TOCItem(QStandardItem):

    def __init__(self, toc):
@ -30,8 +31,10 @@ class TOCItem(QStandardItem):

 class TOC(QStandardItemModel):

-    def __init__(self, toc):
+    def __init__(self, spine, toc=None):
        QStandardItemModel.__init__(self)
+        if toc is None:
+            toc = MTOC()
        for t in toc:
            self.appendRow(TOCItem(t))
        self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents')))