From d993fbc91f3f112f6d0c5fb1a509c0fc65070d9d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 7 May 2012 22:39:59 +0530 Subject: [PATCH] Refactor oeb iterator and implement logic to collect indexing data --- src/calibre/ebooks/oeb/iterator.py | 383 ------------------- src/calibre/ebooks/oeb/iterator/__init__.py | 42 ++ src/calibre/ebooks/oeb/iterator/book.py | 186 +++++++++ src/calibre/ebooks/oeb/iterator/bookmarks.py | 105 +++++ src/calibre/ebooks/oeb/iterator/spine.py | 117 ++++++ src/calibre/gui2/viewer/main.py | 6 +- src/calibre/gui2/viewer/toc.py | 7 +- 7 files changed, 459 insertions(+), 387 deletions(-) delete mode 100644 src/calibre/ebooks/oeb/iterator.py create mode 100644 src/calibre/ebooks/oeb/iterator/__init__.py create mode 100644 src/calibre/ebooks/oeb/iterator/book.py create mode 100644 src/calibre/ebooks/oeb/iterator/bookmarks.py create mode 100644 src/calibre/ebooks/oeb/iterator/spine.py diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py deleted file mode 100644 index 63965e59ef..0000000000 --- a/src/calibre/ebooks/oeb/iterator.py +++ /dev/null @@ -1,383 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008 Kovid Goyal ' - -''' -Iterate over the HTML files in an ebook. Useful for writing viewers. -''' - -import re, os, math -from cStringIO import StringIO - -from PyQt4.Qt import QFontDatabase - -from calibre.customize.ui import available_input_formats -from calibre.ebooks.metadata.opf2 import OPF -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.chardet import xml_to_unicode -from calibre.utils.zipfile import safe_replace -from calibre.utils.config import DynamicConfig -from calibre.utils.logging import Log -from calibre import (guess_type, prints, prepare_string_for_xml, - xml_replace_entities) -from calibre.ebooks.oeb.transforms.cover import CoverManager -from calibre.constants import filesystem_encoding - -TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\ - '__ar__', 'none').replace('__viewbox__', '0 0 600 800' - ).replace('__width__', '600').replace('__height__', '800') -BM_FIELD_SEP = u'*|!|?|*' -BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc' - -def character_count(html): - ''' - Return the number of "significant" text characters in a HTML string. - ''' - count = 0 - strip_space = re.compile(r'\s+') - for match in re.finditer(r'>[^<]+<', html): - count += len(strip_space.sub(' ', match.group()))-2 - return count - -class UnsupportedFormatError(Exception): - - def __init__(self, fmt): - Exception.__init__(self, _('%s format books are not supported')%fmt.upper()) - -class SpineItem(unicode): - - def __new__(cls, path, mime_type=None): - ppath = path.partition('#')[0] - if not os.path.exists(path) and os.path.exists(ppath): - path = ppath - obj = super(SpineItem, cls).__new__(cls, path) - raw = open(path, 'rb').read() - raw, obj.encoding = xml_to_unicode(raw) - obj.character_count = character_count(raw) - obj.start_page = -1 - obj.pages = -1 - obj.max_page = -1 - if mime_type is None: - mime_type = guess_type(obj)[0] - obj.mime_type = mime_type - return obj - -class FakeOpts(object): - verbose = 0 - breadth_first = False - max_levels = 5 - input_encoding = None - -def is_supported(path): - ext = os.path.splitext(path)[1].replace('.', '').lower() - ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) - return ext in available_input_formats() - - -def write_oebbook(oeb, path): - from calibre.ebooks.oeb.writer import OEBWriter - from calibre import walk - w = OEBWriter() - w(oeb, path) - for f in walk(path): - if f.endswith('.opf'): - return f - -class EbookIterator(object): - - CHARACTERS_PER_PAGE = 1000 - - def __init__(self, pathtoebook, log=None): - self.log = log - if log is None: - self.log = Log() - pathtoebook = pathtoebook.strip() - self.pathtoebook = os.path.abspath(pathtoebook) - self.config = DynamicConfig(name='iterator') - ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower() - ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) - self.ebook_ext = ext.replace('original_', '') - - def search(self, text, index, backwards=False): - text = prepare_string_for_xml(text.lower()) - pmap = [(i, path) for i, path in enumerate(self.spine)] - if backwards: - pmap.reverse() - for i, path in pmap: - if (backwards and i < index) or (not backwards and i > index): - with open(path, 'rb') as f: - raw = f.read().decode(path.encoding) - try: - raw = xml_replace_entities(raw) - except: - pass - if text in raw.lower(): - return i - - def find_missing_css_files(self): - for x in os.walk(os.path.dirname(self.pathtoopf)): - for f in x[-1]: - if f.endswith('.css'): - yield os.path.join(x[0], f) - - def find_declared_css_files(self): - for item in self.opf.manifest: - if item.mime_type and 'css' in item.mime_type.lower(): - yield item.path - - def find_embedded_fonts(self): - ''' - This will become unnecessary once Qt WebKit supports the @font-face rule. - ''' - css_files = set(self.find_declared_css_files()) - if not css_files: - css_files = set(self.find_missing_css_files()) - bad_map = {} - font_family_pat = re.compile(r'font-family\s*:\s*([^;]+)') - for csspath in css_files: - try: - css = open(csspath, 'rb').read().decode('utf-8', 'replace') - except: - continue - for match in re.compile(r'@font-face\s*{([^}]+)}').finditer(css): - block = match.group(1) - family = font_family_pat.search(block) - url = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL).search(block) - if url: - path = url.group(1).split('/') - path = os.path.join(os.path.dirname(csspath), *path) - if not os.access(path, os.R_OK): - continue - id = QFontDatabase.addApplicationFont(path) - if id != -1: - families = [unicode(f) for f in QFontDatabase.applicationFontFamilies(id)] - if family: - family = family.group(1) - specified_families = [x.strip().replace('"', - '').replace("'", '') for x in family.split(',')] - aliasing_ok = False - for f in specified_families: - bad_map[f] = families[0] - if not aliasing_ok and f in families: - aliasing_ok = True - - if not aliasing_ok: - prints('WARNING: Family aliasing not fully supported.') - prints('\tDeclared family: %r not in actual families: %r' - % (family, families)) - else: - prints('Loaded embedded font:', repr(family)) - if bad_map: - def prepend_embedded_font(match): - for bad, good in bad_map.items(): - if bad in match.group(1): - prints('Substituting font family: %s -> %s'%(bad, good)) - return match.group().replace(bad, '"%s"'%good) - - from calibre.ebooks.chardet import force_encoding - for csspath in css_files: - with open(csspath, 'r+b') as f: - css = f.read() - enc = force_encoding(css, False) - css = css.decode(enc, 'replace') - ncss = font_family_pat.sub(prepend_embedded_font, css) - if ncss != css: - f.seek(0) - f.truncate() - f.write(ncss.encode(enc)) - - def __enter__(self, processed=False, only_input_plugin=False): - self.delete_on_exit = [] - self._tdir = TemporaryDirectory('_ebook_iter') - self.base = self._tdir.__enter__() - if not isinstance(self.base, unicode): - self.base = self.base.decode(filesystem_encoding) - from calibre.ebooks.conversion.plumber import Plumber, create_oebbook - plumber = Plumber(self.pathtoebook, self.base, self.log) - plumber.setup_options() - if self.pathtoebook.lower().endswith('.opf'): - plumber.opts.dont_package = True - if hasattr(plumber.opts, 'no_process'): - plumber.opts.no_process = True - - plumber.input_plugin.for_viewer = True - with plumber.input_plugin: - self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), - plumber.opts, plumber.input_fmt, self.log, - {}, self.base) - - if not only_input_plugin: - if processed or plumber.input_fmt.lower() in ('pdb', 'pdf', 'rb') and \ - not hasattr(self.pathtoopf, 'manifest'): - if hasattr(self.pathtoopf, 'manifest'): - self.pathtoopf = write_oebbook(self.pathtoopf, self.base) - self.pathtoopf = create_oebbook(self.log, self.pathtoopf, - plumber.opts) - - if hasattr(self.pathtoopf, 'manifest'): - self.pathtoopf = write_oebbook(self.pathtoopf, self.base) - - self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper() - if getattr(plumber.input_plugin, 'is_kf8', False): - self.book_format = 'KF8' - - self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None) - if self.opf is None: - self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf)) - self.language = self.opf.language - if self.language: - self.language = self.language.lower() - ordered = [i for i in self.opf.spine if i.is_linear] + \ - [i for i in self.opf.spine if not i.is_linear] - self.spine = [] - for i in ordered: - spath = i.path - mt = None - if i.idref is not None: - mt = self.opf.manifest.type_for_id(i.idref) - if mt is None: - mt = guess_type(spath)[0] - try: - self.spine.append(SpineItem(spath, mime_type=mt)) - except: - self.log.warn('Missing spine item:', repr(spath)) - - cover = self.opf.cover - if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover: - cfile = os.path.join(self.base, 'calibre_iterator_cover.html') - rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/') - chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8') - open(cfile, 'wb').write(chtml) - self.spine[0:0] = [SpineItem(cfile, - mime_type='application/xhtml+xml')] - self.delete_on_exit.append(cfile) - - if self.opf.path_to_html_toc is not None and \ - self.opf.path_to_html_toc not in self.spine: - try: - self.spine.append(SpineItem(self.opf.path_to_html_toc)) - except: - import traceback - traceback.print_exc() - - - sizes = [i.character_count for i in self.spine] - self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes] - for p, s in zip(self.pages, self.spine): - s.pages = p - start = 1 - - for s in self.spine: - s.start_page = start - start += s.pages - s.max_page = s.start_page + s.pages - 1 - self.toc = self.opf.toc - - self.read_bookmarks() - - return self - - def parse_bookmarks(self, raw): - for line in raw.splitlines(): - bm = None - if line.count('^') > 0: - tokens = line.rpartition('^') - title, ref = tokens[0], tokens[2] - try: - spine, _, pos = ref.partition('#') - spine = int(spine.strip()) - except: - continue - bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos} - elif BM_FIELD_SEP in line: - try: - title, spine, pos = line.strip().split(BM_FIELD_SEP) - spine = int(spine) - except: - continue - # Unescape from serialization - pos = pos.replace(BM_LEGACY_ESC, u'^') - # Check for pos being a scroll fraction - try: - pos = float(pos) - except: - pass - bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine} - - if bm: - self.bookmarks.append(bm) - - def serialize_bookmarks(self, bookmarks): - dat = [] - for bm in bookmarks: - if bm['type'] == 'legacy': - rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos']) - else: - pos = bm['pos'] - if isinstance(pos, (int, float)): - pos = unicode(pos) - else: - pos = pos.replace(u'^', BM_LEGACY_ESC) - rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos]) - dat.append(rec) - return (u'\n'.join(dat) +u'\n') - - def read_bookmarks(self): - self.bookmarks = [] - bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt') - raw = '' - if os.path.exists(bmfile): - with open(bmfile, 'rb') as f: - raw = f.read() - else: - saved = self.config['bookmarks_'+self.pathtoebook] - if saved: - raw = saved - if not isinstance(raw, unicode): - raw = raw.decode('utf-8') - self.parse_bookmarks(raw) - - def save_bookmarks(self, bookmarks=None): - if bookmarks is None: - bookmarks = self.bookmarks - dat = self.serialize_bookmarks(bookmarks) - if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \ - os.access(self.pathtoebook, os.R_OK): - try: - zf = open(self.pathtoebook, 'r+b') - except IOError: - return - safe_replace(zf, 'META-INF/calibre_bookmarks.txt', - StringIO(dat.encode('utf-8')), - add_missing=True) - else: - self.config['bookmarks_'+self.pathtoebook] = dat - - def add_bookmark(self, bm): - self.bookmarks = [x for x in self.bookmarks if x['title'] != - bm['title']] - self.bookmarks.append(bm) - self.save_bookmarks() - - def set_bookmarks(self, bookmarks): - self.bookmarks = bookmarks - - def __exit__(self, *args): - self._tdir.__exit__(*args) - for x in self.delete_on_exit: - if os.path.exists(x): - os.remove(x) - -def get_preprocess_html(path_to_ebook, output): - from calibre.ebooks.conversion.preprocess import HTMLPreProcessor - iterator = EbookIterator(path_to_ebook) - iterator.__enter__(only_input_plugin=True) - preprocessor = HTMLPreProcessor(None, False) - with open(output, 'wb') as out: - for path in iterator.spine: - with open(path, 'rb') as f: - html = f.read().decode('utf-8', 'replace') - html = preprocessor(html, get_preprocess_html=True) - out.write(html.encode('utf-8')) - out.write(b'\n\n' + b'-'*80 + b'\n\n') - diff --git a/src/calibre/ebooks/oeb/iterator/__init__.py b/src/calibre/ebooks/oeb/iterator/__init__.py new file mode 100644 index 0000000000..29487cbb84 --- /dev/null +++ b/src/calibre/ebooks/oeb/iterator/__init__.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, re + +from calibre.customize.ui import available_input_formats + +def is_supported(path): + ext = os.path.splitext(path)[1].replace('.', '').lower() + ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) + return ext in available_input_formats() + +class UnsupportedFormatError(Exception): + + def __init__(self, fmt): + Exception.__init__(self, _('%s format books are not supported')%fmt.upper()) + +def EbookIterator(*args, **kwargs): + 'For backwards compatibility' + from calibre.ebooks.oeb.iterator.book import EbookIterator + return EbookIterator(*args, **kwargs) + +def get_preprocess_html(path_to_ebook, output): + from calibre.ebooks.conversion.preprocess import HTMLPreProcessor + iterator = EbookIterator(path_to_ebook) + iterator.__enter__(only_input_plugin=True, run_char_count=False, + read_anchor_map=False) + preprocessor = HTMLPreProcessor(None, False) + with open(output, 'wb') as out: + for path in iterator.spine: + with open(path, 'rb') as f: + html = f.read().decode('utf-8', 'replace') + html = preprocessor(html, get_preprocess_html=True) + out.write(html.encode('utf-8')) + out.write(b'\n\n' + b'-'*80 + b'\n\n') + diff --git a/src/calibre/ebooks/oeb/iterator/book.py b/src/calibre/ebooks/oeb/iterator/book.py new file mode 100644 index 0000000000..d958ffd959 --- /dev/null +++ b/src/calibre/ebooks/oeb/iterator/book.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +''' +Iterate over the HTML files in an ebook. Useful for writing viewers. +''' + +import re, os, math +from functools import partial + +from calibre.ebooks.metadata.opf2 import OPF +from calibre.ptempfile import TemporaryDirectory +from calibre.utils.config import DynamicConfig +from calibre.utils.logging import default_log +from calibre import (guess_type, prepare_string_for_xml, + xml_replace_entities) +from calibre.ebooks.oeb.transforms.cover import CoverManager + +from calibre.ebooks.oeb.iterator.spine import (SpineItem, create_indexing_data) +from calibre.ebooks.oeb.iterator.bookmarks import BookmarksMixin + +TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\ + '__ar__', 'none').replace('__viewbox__', '0 0 600 800' + ).replace('__width__', '600').replace('__height__', '800') + +class FakeOpts(object): + verbose = 0 + breadth_first = False + max_levels = 5 + input_encoding = None + + +def write_oebbook(oeb, path): + from calibre.ebooks.oeb.writer import OEBWriter + from calibre import walk + w = OEBWriter() + w(oeb, path) + for f in walk(path): + if f.endswith('.opf'): + return f + +class EbookIterator(BookmarksMixin): + + CHARACTERS_PER_PAGE = 1000 + + def __init__(self, pathtoebook, log=None): + self.log = log or default_log + pathtoebook = pathtoebook.strip() + self.pathtoebook = os.path.abspath(pathtoebook) + self.config = DynamicConfig(name='iterator') + ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower() + ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) + self.ebook_ext = ext.replace('original_', '') + + def search(self, text, index, backwards=False): + text = prepare_string_for_xml(text.lower()) + pmap = [(i, path) for i, path in enumerate(self.spine)] + if backwards: + pmap.reverse() + for i, path in pmap: + if (backwards and i < index) or (not backwards and i > index): + with open(path, 'rb') as f: + raw = f.read().decode(path.encoding) + try: + raw = xml_replace_entities(raw) + except: + pass + if text in raw.lower(): + return i + + def __enter__(self, processed=False, only_input_plugin=False, + run_char_count=True, read_anchor_map=True): + ''' Convert an ebook file into an exploded OEB book suitable for + display in viewers/preprocessing etc. ''' + + from calibre.ebooks.conversion.plumber import Plumber, create_oebbook + + self.delete_on_exit = [] + self._tdir = TemporaryDirectory('_ebook_iter') + self.base = self._tdir.__enter__() + plumber = Plumber(self.pathtoebook, self.base, self.log) + plumber.setup_options() + if self.pathtoebook.lower().endswith('.opf'): + plumber.opts.dont_package = True + if hasattr(plumber.opts, 'no_process'): + plumber.opts.no_process = True + + plumber.input_plugin.for_viewer = True + with plumber.input_plugin, open(plumber.input, 'rb') as inf: + self.pathtoopf = plumber.input_plugin(inf, + plumber.opts, plumber.input_fmt, self.log, + {}, self.base) + + if not only_input_plugin: + # Run the HTML preprocess/parsing from the conversion pipeline as + # well + if (processed or plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'} + and not hasattr(self.pathtoopf, 'manifest')): + if hasattr(self.pathtoopf, 'manifest'): + self.pathtoopf = write_oebbook(self.pathtoopf, self.base) + self.pathtoopf = create_oebbook(self.log, self.pathtoopf, + plumber.opts) + + if hasattr(self.pathtoopf, 'manifest'): + self.pathtoopf = write_oebbook(self.pathtoopf, self.base) + + self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper() + if getattr(plumber.input_plugin, 'is_kf8', False): + self.book_format = 'KF8' + + self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None) + if self.opf is None: + self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf)) + self.language = self.opf.language + if self.language: + self.language = self.language.lower() + ordered = [i for i in self.opf.spine if i.is_linear] + \ + [i for i in self.opf.spine if not i.is_linear] + self.spine = [] + Spiny = partial(SpineItem, read_anchor_map=read_anchor_map, + run_char_count=run_char_count) + for i in ordered: + spath = i.path + mt = None + if i.idref is not None: + mt = self.opf.manifest.type_for_id(i.idref) + if mt is None: + mt = guess_type(spath)[0] + try: + self.spine.append(Spiny(spath, mime_type=mt)) + except: + self.log.warn('Missing spine item:', repr(spath)) + + cover = self.opf.cover + if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2', + 'azw', 'azw3'}: + cfile = os.path.join(self.base, 'calibre_iterator_cover.html') + rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/') + chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8') + with open(cfile, 'wb') as f: + f.write(chtml) + self.spine[0:0] = [Spiny(cfile, + mime_type='application/xhtml+xml')] + self.delete_on_exit.append(cfile) + + if self.opf.path_to_html_toc is not None and \ + self.opf.path_to_html_toc not in self.spine: + try: + self.spine.append(Spiny(self.opf.path_to_html_toc)) + except: + import traceback + traceback.print_exc() + + sizes = [i.character_count for i in self.spine] + self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes] + for p, s in zip(self.pages, self.spine): + s.pages = p + start = 1 + + for s in self.spine: + s.start_page = start + start += s.pages + s.max_page = s.start_page + s.pages - 1 + self.toc = self.opf.toc + create_indexing_data(self.spine, self.toc) + + self.read_bookmarks() + + return self + + def __exit__(self, *args): + self._tdir.__exit__(*args) + for x in self.delete_on_exit: + try: + os.remove(x) + except: + pass + + diff --git a/src/calibre/ebooks/oeb/iterator/bookmarks.py b/src/calibre/ebooks/oeb/iterator/bookmarks.py new file mode 100644 index 0000000000..3ef110caaa --- /dev/null +++ b/src/calibre/ebooks/oeb/iterator/bookmarks.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os +from io import BytesIO + +from calibre.utils.zipfile import safe_replace + +BM_FIELD_SEP = u'*|!|?|*' +BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc' + +class BookmarksMixin(object): + + def parse_bookmarks(self, raw): + for line in raw.splitlines(): + bm = None + if line.count('^') > 0: + tokens = line.rpartition('^') + title, ref = tokens[0], tokens[2] + try: + spine, _, pos = ref.partition('#') + spine = int(spine.strip()) + except: + continue + bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos} + elif BM_FIELD_SEP in line: + try: + title, spine, pos = line.strip().split(BM_FIELD_SEP) + spine = int(spine) + except: + continue + # Unescape from serialization + pos = pos.replace(BM_LEGACY_ESC, u'^') + # Check for pos being a scroll fraction + try: + pos = float(pos) + except: + pass + bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine} + + if bm: + self.bookmarks.append(bm) + + def serialize_bookmarks(self, bookmarks): + dat = [] + for bm in bookmarks: + if bm['type'] == 'legacy': + rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos']) + else: + pos = bm['pos'] + if isinstance(pos, (int, float)): + pos = unicode(pos) + else: + pos = pos.replace(u'^', BM_LEGACY_ESC) + rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos]) + dat.append(rec) + return (u'\n'.join(dat) +u'\n') + + def read_bookmarks(self): + self.bookmarks = [] + bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt') + raw = '' + if os.path.exists(bmfile): + with open(bmfile, 'rb') as f: + raw = f.read() + else: + saved = self.config['bookmarks_'+self.pathtoebook] + if saved: + raw = saved + if not isinstance(raw, unicode): + raw = raw.decode('utf-8') + self.parse_bookmarks(raw) + + def save_bookmarks(self, bookmarks=None): + if bookmarks is None: + bookmarks = self.bookmarks + dat = self.serialize_bookmarks(bookmarks) + if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \ + os.access(self.pathtoebook, os.R_OK): + try: + zf = open(self.pathtoebook, 'r+b') + except IOError: + return + safe_replace(zf, 'META-INF/calibre_bookmarks.txt', + BytesIO(dat.encode('utf-8')), + add_missing=True) + else: + self.config['bookmarks_'+self.pathtoebook] = dat + + def add_bookmark(self, bm): + self.bookmarks = [x for x in self.bookmarks if x['title'] != + bm['title']] + self.bookmarks.append(bm) + self.save_bookmarks() + + def set_bookmarks(self, bookmarks): + self.bookmarks = bookmarks + + diff --git a/src/calibre/ebooks/oeb/iterator/spine.py b/src/calibre/ebooks/oeb/iterator/spine.py new file mode 100644 index 0000000000..daddec6ec1 --- /dev/null +++ b/src/calibre/ebooks/oeb/iterator/spine.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) +from future_builtins import map + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re, os +from functools import partial +from operator import attrgetter +from collections import namedtuple + +from calibre import guess_type +from calibre.ebooks.chardet import xml_to_unicode + +def character_count(html): + ''' Return the number of "significant" text characters in a HTML string. ''' + count = 0 + strip_space = re.compile(r'\s+') + for match in re.finditer(r'>[^<]+<', html): + count += len(strip_space.sub(' ', match.group()))-2 + return count + +def anchor_map(html): + ''' Return map of all anchor names to their offsets in the html ''' + ans = {} + for match in re.finditer( + r'''(?:id|name)\s*=\s*['"]([^'"]+)['"]''', html): + anchor = match.group(0) + ans[anchor] = ans.get(anchor, match.start()) + return ans + +class SpineItem(unicode): + + def __new__(cls, path, mime_type=None, read_anchor_map=True, + run_char_count=True): + ppath = path.partition('#')[0] + if not os.path.exists(path) and os.path.exists(ppath): + path = ppath + obj = super(SpineItem, cls).__new__(cls, path) + with open(path, 'rb') as f: + raw = f.read() + raw, obj.encoding = xml_to_unicode(raw) + obj.character_count = character_count(raw) if run_char_count else 10000 + obj.anchor_map = anchor_map(raw) if read_anchor_map else {} + obj.start_page = -1 + obj.pages = -1 + obj.max_page = -1 + obj.index_entries = [] + if mime_type is None: + mime_type = guess_type(obj)[0] + obj.mime_type = mime_type + return obj + +class IndexEntry(object): + + def __init__(self, spine, toc_entry, num): + self.num = num + self.text = toc_entry.text or _('Unknown') + self.key = toc_entry.abspath + self.anchor = self.start_anchor = toc_entry.fragment or None + self.spine_pos = spine.index(self.key) + self.anchor_pos = 0 + if self.spine_pos > -1: + self.anchor_pos = spine[self.spine_pos].anchor_map.get(self.anchor, + 0) + + self.depth = 0 + p = toc_entry.parent + while p is not None: + self.depth += 1 + p = p.parent + + self.sort_key = (self.spine_pos, self.anchor_pos) + self.spine_count = len(spine) + + def find_end(self, all_entries): + potential_enders = [i for i in all_entries if + i.depth <= self.depth and + ( + (i.spine_pos == self.spine_pos and i.anchor_pos > + self.anchor_pos) + or + i.spine_pos > self.spine_pos + )] + if potential_enders: + # potential_enders is sorted by (spine_pos, anchor_pos) + end = potential_enders[0] + self.end_spine_pos = end.spine_pos + self.end_anchor = end.anchor + else: + self.end_spine_pos = self.spine_count - 1 + self.end_anchor = None + +def create_indexing_data(spine, toc): + if not toc: return + f = partial(IndexEntry, spine) + index_entries = list(map(f, + (t for t in toc.flat() if t is not toc), + (i-1 for i, t in enumerate(toc.flat()) if t is not toc) + )) + index_entries.sort(key=attrgetter('sort_key')) + [ i.find_end(index_entries) for i in index_entries ] + + ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor') + + for spine_pos, spine_item in enumerate(spine): + for i in index_entries: + if i.end_spine_pos < spine_pos or i.spine_pos > spine_pos: + continue # Does not touch this file + start = i.anchor if i.spine_pos == spine_pos else None + end = i.end_anchor if i.spine_pos == spine_pos else None + spine_item.index_entries.append(ie(i, start, end)) + diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index 2e4a2f8924..2b7dc8b41d 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -18,7 +18,7 @@ from calibre.gui2.widgets import ProgressIndicator from calibre.gui2.main_window import MainWindow from calibre.gui2 import (Application, ORG_NAME, APP_UID, choose_files, info_dialog, error_dialog, open_url, available_height) -from calibre.ebooks.oeb.iterator import EbookIterator +from calibre.ebooks.oeb.iterator.book import EbookIterator from calibre.ebooks import DRMError from calibre.constants import islinux, isbsd, isosx, filesystem_encoding from calibre.utils.config import Config, StringConfig, JSONConfig @@ -802,11 +802,13 @@ class EbookViewer(MainWindow, Ui_EbookViewer): if not title: title = os.path.splitext(os.path.basename(pathtoebook))[0] if self.iterator.toc: - self.toc_model = TOC(self.iterator.toc) + self.toc_model = TOC(self.iterator.spine, self.iterator.toc) self.toc.setModel(self.toc_model) if self.show_toc_on_open: self.action_table_of_contents.setChecked(True) else: + self.toc_model = TOC(self.iterator.spine) + self.toc.setModel(self.toc_model) self.action_table_of_contents.setChecked(False) if isbytestring(pathtoebook): pathtoebook = force_unicode(pathtoebook, filesystem_encoding) diff --git a/src/calibre/gui2/viewer/toc.py b/src/calibre/gui2/viewer/toc.py index dd14eb604a..b702f46577 100644 --- a/src/calibre/gui2/viewer/toc.py +++ b/src/calibre/gui2/viewer/toc.py @@ -8,9 +8,10 @@ __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' import re - from PyQt4.Qt import QStandardItem, QStandardItemModel, Qt +from calibre.ebooks.metadata.toc import TOC as MTOC + class TOCItem(QStandardItem): def __init__(self, toc): @@ -30,8 +31,10 @@ class TOCItem(QStandardItem): class TOC(QStandardItemModel): - def __init__(self, toc): + def __init__(self, spine, toc=None): QStandardItemModel.__init__(self) + if toc is None: + toc = MTOC() for t in toc: self.appendRow(TOCItem(t)) self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents')))