mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactor oeb iterator and implement logic to collect indexing data
This commit is contained in:
parent
820ba7ec7d
commit
d993fbc91f
@ -1,383 +0,0 @@
|
|||||||
from __future__ import with_statement
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2008 Kovid Goyal <kovid at kovidgoyal.net>'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Iterate over the HTML files in an ebook. Useful for writing viewers.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import re, os, math
|
|
||||||
from cStringIO import StringIO
|
|
||||||
|
|
||||||
from PyQt4.Qt import QFontDatabase
|
|
||||||
|
|
||||||
from calibre.customize.ui import available_input_formats
|
|
||||||
from calibre.ebooks.metadata.opf2 import OPF
|
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
|
||||||
from calibre.utils.zipfile import safe_replace
|
|
||||||
from calibre.utils.config import DynamicConfig
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre import (guess_type, prints, prepare_string_for_xml,
|
|
||||||
xml_replace_entities)
|
|
||||||
from calibre.ebooks.oeb.transforms.cover import CoverManager
|
|
||||||
from calibre.constants import filesystem_encoding
|
|
||||||
|
|
||||||
TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\
|
|
||||||
'__ar__', 'none').replace('__viewbox__', '0 0 600 800'
|
|
||||||
).replace('__width__', '600').replace('__height__', '800')
|
|
||||||
BM_FIELD_SEP = u'*|!|?|*'
|
|
||||||
BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'
|
|
||||||
|
|
||||||
def character_count(html):
    '''
    Return the number of "significant" text characters in a HTML string.

    Only text runs found between a closing ``>`` and the next ``<`` are
    counted, with every run of whitespace counted as a single character.
    '''
    collapse_ws = re.compile(r'\s+')
    # Each match includes the delimiting '>' and '<', hence the -2
    return sum(
        len(collapse_ws.sub(' ', text_run.group())) - 2
        for text_run in re.finditer(r'>[^<]+<', html))
|
|
||||||
|
|
||||||
class UnsupportedFormatError(Exception):
    '''
    Error whose message names (in upper case) the book format that is not
    supported.
    '''

    def __init__(self, fmt):
        message = _('%s format books are not supported') % fmt.upper()
        Exception.__init__(self, message)
|
|
||||||
|
|
||||||
class SpineItem(unicode):
    '''
    A path to a spine file (a unicode string) annotated with information
    read from that file.

    Attributes set on every instance: ``encoding`` (as reported by
    xml_to_unicode), ``character_count``, ``mime_type`` and the paging
    fields ``start_page``, ``pages`` and ``max_page`` (all initialised
    to -1).
    '''

    def __new__(cls, path, mime_type=None):
        # If the path only exists once its '#fragment' is removed, use the
        # fragment-less path instead
        ppath = path.partition('#')[0]
        if not os.path.exists(path) and os.path.exists(ppath):
            path = ppath
        obj = super(SpineItem, cls).__new__(cls, path)
        # Close the file deterministically instead of relying on garbage
        # collection of the file object
        with open(path, 'rb') as f:
            raw = f.read()
        raw, obj.encoding = xml_to_unicode(raw)
        obj.character_count = character_count(raw)
        obj.start_page = -1
        obj.pages = -1
        obj.max_page = -1
        if mime_type is None:
            mime_type = guess_type(obj)[0]
        obj.mime_type = mime_type
        return obj
|
|
||||||
|
|
||||||
class FakeOpts(object):
    ''' Plain holder of option values, all defined as class attributes. '''

    input_encoding = None
    max_levels = 5
    breadth_first = False
    verbose = 0
|
|
||||||
|
|
||||||
def is_supported(path):
    '''
    Return True if the extension of `path` is one of the available input
    formats. The htm/html/xhtm/xhtml spellings are all treated as 'html'.
    '''
    suffix = os.path.splitext(path)[1]
    fmt = suffix.replace('.', '').lower()
    fmt = re.sub(r'(x{0,1})htm(l{0,1})', 'html', fmt)
    return fmt in available_input_formats()
|
|
||||||
|
|
||||||
|
|
||||||
def write_oebbook(oeb, path):
    '''
    Write `oeb` to the directory `path` using OEBWriter and return the path
    of the first .opf file found under `path` (None if there is none).
    '''
    from calibre import walk
    from calibre.ebooks.oeb.writer import OEBWriter
    writer = OEBWriter()
    writer(oeb, path)
    opf_files = (name for name in walk(path) if name.endswith('.opf'))
    return next(opf_files, None)
|
|
||||||
|
|
||||||
class EbookIterator(object):
    '''
    Context manager that runs an ebook through its input plugin into a
    temporary directory and exposes the resulting spine files, table of
    contents and bookmarks.
    '''

    CHARACTERS_PER_PAGE = 1000

    def __init__(self, pathtoebook, log=None):
        '''
        :param pathtoebook: Path to the ebook file
        :param log: Log object to use, a new Log() is created if None
        '''
        self.log = log
        if log is None:
            self.log = Log()
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext.replace('original_', '')

    def search(self, text, index, backwards=False):
        '''
        Return the index of the nearest spine item after `index` (before it
        if `backwards` is True) whose content contains `text`, compared case
        insensitively. Returns None if there is no such item.
        '''
        text = prepare_string_for_xml(text.lower())
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                try:
                    raw = xml_replace_entities(raw)
                except Exception:
                    # Best effort: search the text with entities unreplaced
                    pass
                if text in raw.lower():
                    return i

    def find_missing_css_files(self):
        ''' Yield the path of every .css file in the directory tree that
        contains the OPF file. '''
        for x in os.walk(os.path.dirname(self.pathtoopf)):
            for f in x[-1]:
                if f.endswith('.css'):
                    yield os.path.join(x[0], f)

    def find_declared_css_files(self):
        ''' Yield the path of every manifest item whose mime type mentions
        css. '''
        for item in self.opf.manifest:
            if item.mime_type and 'css' in item.mime_type.lower():
                yield item.path

    def find_embedded_fonts(self):
        '''
        This will become unnecessary once Qt WebKit supports the @font-face rule.

        Registers the fonts referenced by @font-face rules with
        QFontDatabase and rewrites font-family declarations in the CSS
        files to use the family name of the font that was actually loaded.
        '''
        css_files = set(self.find_declared_css_files())
        if not css_files:
            css_files = set(self.find_missing_css_files())
        bad_map = {}
        font_family_pat = re.compile(r'font-family\s*:\s*([^;]+)')
        font_face_pat = re.compile(r'@font-face\s*{([^}]+)}')
        url_pat = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL)
        for csspath in css_files:
            try:
                with open(csspath, 'rb') as f:
                    css = f.read().decode('utf-8', 'replace')
            except Exception:
                continue
            for match in font_face_pat.finditer(css):
                block = match.group(1)
                family = font_family_pat.search(block)
                url = url_pat.search(block)
                if url:
                    path = url.group(1).split('/')
                    path = os.path.join(os.path.dirname(csspath), *path)
                    if not os.access(path, os.R_OK):
                        continue
                    font_id = QFontDatabase.addApplicationFont(path)
                    if font_id != -1:
                        families = [unicode(f) for f in
                                QFontDatabase.applicationFontFamilies(font_id)]
                        if family:
                            family = family.group(1)
                            specified_families = [x.strip().replace('"',
                                '').replace("'", '') for x in family.split(',')]
                            aliasing_ok = False
                            for f in specified_families:
                                bad_map[f] = families[0]
                                if not aliasing_ok and f in families:
                                    aliasing_ok = True

                            if not aliasing_ok:
                                prints('WARNING: Family aliasing not fully supported.')
                                prints('\tDeclared family: %r not in actual families: %r'
                                        % (family, families))
                            else:
                                prints('Loaded embedded font:', repr(family))
        if bad_map:
            def prepend_embedded_font(match):
                for bad, good in bad_map.items():
                    if bad in match.group(1):
                        prints('Substituting font family: %s -> %s'%(bad, good))
                        return match.group().replace(bad, '"%s"'%good)
                # A None result from a re.sub callback removes the matched
                # text, which would delete every unrelated font-family
                # declaration. Return the declaration unchanged instead.
                return match.group()

            from calibre.ebooks.chardet import force_encoding
            for csspath in css_files:
                with open(csspath, 'r+b') as f:
                    css = f.read()
                    enc = force_encoding(css, False)
                    css = css.decode(enc, 'replace')
                    ncss = font_family_pat.sub(prepend_embedded_font, css)
                    if ncss != css:
                        f.seek(0)
                        f.truncate()
                        f.write(ncss.encode(enc))

    def __enter__(self, processed=False, only_input_plugin=False):
        '''
        Run the input plugin on the book, then build self.spine, self.toc
        and self.bookmarks from the result.

        :param processed: If True, also run create_oebbook() on the output
            of the input plugin
        :param only_input_plugin: If True, never run create_oebbook()
        '''
        self.delete_on_exit = []
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base = self._tdir.__enter__()
        if not isinstance(self.base, unicode):
            self.base = self.base.decode(filesystem_encoding)
        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
        plumber = Plumber(self.pathtoebook, self.base, self.log)
        plumber.setup_options()
        if self.pathtoebook.lower().endswith('.opf'):
            plumber.opts.dont_package = True
        if hasattr(plumber.opts, 'no_process'):
            plumber.opts.no_process = True

        plumber.input_plugin.for_viewer = True
        with plumber.input_plugin:
            # The input stream is now closed when this block ends instead
            # of being left to the garbage collector
            with open(plumber.input, 'rb') as inf:
                self.pathtoopf = plumber.input_plugin(inf,
                    plumber.opts, plumber.input_fmt, self.log,
                    {}, self.base)

                if not only_input_plugin:
                    # 'and' binds tighter than 'or': processed always
                    # triggers this branch
                    if processed or (
                            plumber.input_fmt.lower() in ('pdb', 'pdf', 'rb')
                            and not hasattr(self.pathtoopf, 'manifest')):
                        if hasattr(self.pathtoopf, 'manifest'):
                            self.pathtoopf = write_oebbook(self.pathtoopf,
                                    self.base)
                        self.pathtoopf = create_oebbook(self.log,
                                self.pathtoopf, plumber.opts)

                # The plugin result may be an object with a manifest rather
                # than a path, in which case it is written out to get a path
                if hasattr(self.pathtoopf, 'manifest'):
                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)

        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
        if getattr(plumber.input_plugin, 'is_kf8', False):
            self.book_format = 'KF8'

        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
        if self.opf is None:
            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.language = self.opf.language
        if self.language:
            self.language = self.language.lower()
        # Linear spine items first, non-linear ones after them
        ordered = [i for i in self.opf.spine if i.is_linear] + \
                  [i for i in self.opf.spine if not i.is_linear]
        self.spine = []
        for i in ordered:
            spath = i.path
            mt = None
            if i.idref is not None:
                mt = self.opf.manifest.type_for_id(i.idref)
            if mt is None:
                mt = guess_type(spath)[0]
            try:
                self.spine.append(SpineItem(spath, mime_type=mt))
            except Exception:
                self.log.warn('Missing spine item:', repr(spath))

        cover = self.opf.cover
        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover:
            # Generate a titlepage for the cover and make it the first
            # spine item
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
            # Must be flushed and closed before SpineItem reads it back
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [SpineItem(cfile,
                mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
           self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(SpineItem(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()

        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        start = 1

        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc

        self.read_bookmarks()

        return self

    def parse_bookmarks(self, raw):
        '''
        Parse serialized bookmarks, one per line, appending a dict for each
        valid one to self.bookmarks. Lines containing '^' are parsed as
        legacy bookmarks (title^spine#pos), lines containing BM_FIELD_SEP as
        cfi bookmarks. Lines that cannot be parsed are skipped.
        '''
        for line in raw.splitlines():
            bm = None
            if line.count('^') > 0:
                tokens = line.rpartition('^')
                title, ref = tokens[0], tokens[2]
                try:
                    spine, _, pos = ref.partition('#')
                    spine = int(spine.strip())
                except ValueError:
                    continue
                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
            elif BM_FIELD_SEP in line:
                try:
                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
                    spine = int(spine)
                except ValueError:
                    continue
                # Unescape from serialization
                pos = pos.replace(BM_LEGACY_ESC, u'^')
                # Check for pos being a scroll fraction
                try:
                    pos = float(pos)
                except ValueError:
                    pass
                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}

            if bm:
                self.bookmarks.append(bm)

    def serialize_bookmarks(self, bookmarks):
        '''
        Return `bookmarks` as a unicode string with one bookmark per line,
        in the format understood by parse_bookmarks().
        '''
        dat = []
        for bm in bookmarks:
            if bm['type'] == 'legacy':
                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
            else:
                pos = bm['pos']
                if isinstance(pos, (int, float)):
                    pos = unicode(pos)
                else:
                    # '^' would make the line parse as a legacy bookmark
                    pos = pos.replace(u'^', BM_LEGACY_ESC)
                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
            dat.append(rec)
        return (u'\n'.join(dat) +u'\n')

    def read_bookmarks(self):
        '''
        Load self.bookmarks from META-INF/calibre_bookmarks.txt in the
        extracted book if that file exists, otherwise from the value saved
        in self.config for this book.
        '''
        self.bookmarks = []
        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
        raw = ''
        if os.path.exists(bmfile):
            with open(bmfile, 'rb') as f:
                raw = f.read()
        else:
            saved = self.config['bookmarks_'+self.pathtoebook]
            if saved:
                raw = saved
        if not isinstance(raw, unicode):
            raw = raw.decode('utf-8')
        self.parse_bookmarks(raw)

    def save_bookmarks(self, bookmarks=None):
        '''
        Save `bookmarks` (self.bookmarks if None). For readable .epub files
        they are stored inside the book itself, and silently not saved if
        the book cannot be opened for writing. For everything else they are
        stored in self.config.
        '''
        if bookmarks is None:
            bookmarks = self.bookmarks
        dat = self.serialize_bookmarks(bookmarks)
        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
            os.access(self.pathtoebook, os.R_OK):
            try:
                zf = open(self.pathtoebook, 'r+b')
            except IOError:
                return
            # Make sure the handle on the book is released once it has been
            # updated
            with zf:
                safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
                        StringIO(dat.encode('utf-8')),
                        add_missing=True)
        else:
            self.config['bookmarks_'+self.pathtoebook] = dat

    def add_bookmark(self, bm):
        ''' Add the bookmark `bm`, replacing any existing bookmarks with the
        same title, and save the bookmarks. '''
        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
                bm['title']]
        self.bookmarks.append(bm)
        self.save_bookmarks()

    def set_bookmarks(self, bookmarks):
        ''' Replace self.bookmarks with `bookmarks`, without saving. '''
        self.bookmarks = bookmarks

    def __exit__(self, *args):
        ''' Exit the temporary directory context and remove any files
        registered in self.delete_on_exit that still exist. '''
        self._tdir.__exit__(*args)
        for x in self.delete_on_exit:
            if os.path.exists(x):
                os.remove(x)
|
|
||||||
|
|
||||||
def get_preprocess_html(path_to_ebook, output):
    '''
    Run only the input plugin on the book at `path_to_ebook`, pass the HTML
    of every spine item through HTMLPreProcessor and write the results,
    separated by a line of dashes, to the file `output` as UTF-8.
    '''
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    iterator = EbookIterator(path_to_ebook)
    iterator.__enter__(only_input_plugin=True)
    try:
        preprocessor = HTMLPreProcessor(None, False)
        with open(output, 'wb') as out:
            for path in iterator.spine:
                with open(path, 'rb') as f:
                    html = f.read().decode('utf-8', 'replace')
                html = preprocessor(html, get_preprocess_html=True)
                out.write(html.encode('utf-8'))
                out.write(b'\n\n' + b'-'*80 + b'\n\n')
    finally:
        # __enter__ was called by hand, so __exit__ must be as well,
        # otherwise the iterator's temporary directory is never cleaned up
        iterator.__exit__(None, None, None)
|
|
||||||
|
|
42
src/calibre/ebooks/oeb/iterator/__init__.py
Normal file
42
src/calibre/ebooks/oeb/iterator/__init__.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import os, re
|
||||||
|
|
||||||
|
from calibre.customize.ui import available_input_formats
|
||||||
|
|
||||||
|
def is_supported(path):
    '''
    Return True if the extension of `path` is one of the available input
    formats. The htm/html/xhtm/xhtml spellings are all treated as 'html'.
    '''
    suffix = os.path.splitext(path)[1]
    fmt = suffix.replace('.', '').lower()
    fmt = re.sub(r'(x{0,1})htm(l{0,1})', 'html', fmt)
    return fmt in available_input_formats()
|
||||||
|
|
||||||
|
class UnsupportedFormatError(Exception):
    '''
    Error whose message names (in upper case) the book format that is not
    supported.
    '''

    def __init__(self, fmt):
        message = _('%s format books are not supported') % fmt.upper()
        Exception.__init__(self, message)
|
||||||
|
|
||||||
|
def EbookIterator(*args, **kwargs):
    '''
    For backwards compatibility: create and return an EbookIterator from
    calibre.ebooks.oeb.iterator.book, which is imported only when this
    function is called.
    '''
    from calibre.ebooks.oeb.iterator import book
    return book.EbookIterator(*args, **kwargs)
|
||||||
|
|
||||||
|
def get_preprocess_html(path_to_ebook, output):
    '''
    Run only the input plugin on the book at `path_to_ebook`, pass the HTML
    of every spine item through HTMLPreProcessor and write the results,
    separated by a line of dashes, to the file `output` as UTF-8.
    '''
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    iterator = EbookIterator(path_to_ebook)
    iterator.__enter__(only_input_plugin=True, run_char_count=False,
            read_anchor_map=False)
    try:
        preprocessor = HTMLPreProcessor(None, False)
        with open(output, 'wb') as out:
            for path in iterator.spine:
                with open(path, 'rb') as f:
                    html = f.read().decode('utf-8', 'replace')
                html = preprocessor(html, get_preprocess_html=True)
                out.write(html.encode('utf-8'))
                out.write(b'\n\n' + b'-'*80 + b'\n\n')
    finally:
        # __enter__ was called by hand, so __exit__ must be as well,
        # otherwise the iterator's temporary directory is never cleaned up
        iterator.__exit__(None, None, None)
|
||||||
|
|
186
src/calibre/ebooks/oeb/iterator/book.py
Normal file
186
src/calibre/ebooks/oeb/iterator/book.py
Normal file
@ -0,0 +1,186 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
Iterate over the HTML files in an ebook. Useful for writing viewers.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import re, os, math
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
from calibre.ebooks.metadata.opf2 import OPF
|
||||||
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
|
from calibre.utils.config import DynamicConfig
|
||||||
|
from calibre.utils.logging import default_log
|
||||||
|
from calibre import (guess_type, prepare_string_for_xml,
|
||||||
|
xml_replace_entities)
|
||||||
|
from calibre.ebooks.oeb.transforms.cover import CoverManager
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.iterator.spine import (SpineItem, create_indexing_data)
|
||||||
|
from calibre.ebooks.oeb.iterator.bookmarks import BookmarksMixin
|
||||||
|
|
||||||
|
TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\
|
||||||
|
'__ar__', 'none').replace('__viewbox__', '0 0 600 800'
|
||||||
|
).replace('__width__', '600').replace('__height__', '800')
|
||||||
|
|
||||||
|
class FakeOpts(object):
    ''' Plain holder of option values, all defined as class attributes. '''

    input_encoding = None
    max_levels = 5
    breadth_first = False
    verbose = 0
|
||||||
|
|
||||||
|
|
||||||
|
def write_oebbook(oeb, path):
    '''
    Write `oeb` to the directory `path` using OEBWriter and return the path
    of the first .opf file found under `path` (None if there is none).
    '''
    from calibre import walk
    from calibre.ebooks.oeb.writer import OEBWriter
    writer = OEBWriter()
    writer(oeb, path)
    opf_files = (name for name in walk(path) if name.endswith('.opf'))
    return next(opf_files, None)
|
||||||
|
|
||||||
|
class EbookIterator(BookmarksMixin):
    '''
    Context manager that runs an ebook through its input plugin into a
    temporary directory and exposes the resulting spine files, table of
    contents, indexing data and bookmarks.
    '''

    CHARACTERS_PER_PAGE = 1000

    def __init__(self, pathtoebook, log=None):
        '''
        :param pathtoebook: Path to the ebook file
        :param log: Log object to use, default_log is used if not specified
        '''
        self.log = log or default_log
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext.replace('original_', '')

    def search(self, text, index, backwards=False):
        '''
        Return the index of the nearest spine item after `index` (before it
        if `backwards` is True) whose content contains `text`, compared case
        insensitively. Returns None if there is no such item.
        '''
        text = prepare_string_for_xml(text.lower())
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                try:
                    raw = xml_replace_entities(raw)
                except Exception:
                    # Best effort: search the text with entities unreplaced
                    pass
                if text in raw.lower():
                    return i

    def __enter__(self, processed=False, only_input_plugin=False,
            run_char_count=True, read_anchor_map=True):
        ''' Convert an ebook file into an exploded OEB book suitable for
        display in viewers/preprocessing etc.

        :param processed: If True, also run create_oebbook() on the output
            of the input plugin
        :param only_input_plugin: If True, never run create_oebbook()
        :param run_char_count: Passed on to every SpineItem created
        :param read_anchor_map: Passed on to every SpineItem created
        '''

        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook

        self.delete_on_exit = []
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base = self._tdir.__enter__()
        plumber = Plumber(self.pathtoebook, self.base, self.log)
        plumber.setup_options()
        if self.pathtoebook.lower().endswith('.opf'):
            plumber.opts.dont_package = True
        if hasattr(plumber.opts, 'no_process'):
            plumber.opts.no_process = True

        plumber.input_plugin.for_viewer = True
        with plumber.input_plugin, open(plumber.input, 'rb') as inf:
            self.pathtoopf = plumber.input_plugin(inf,
                plumber.opts, plumber.input_fmt, self.log,
                {}, self.base)

            if not only_input_plugin:
                # Run the HTML preprocess/parsing from the conversion pipeline as
                # well ('and' binds tighter than 'or': processed always
                # triggers this branch)
                if processed or (
                        plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'}
                        and not hasattr(self.pathtoopf, 'manifest')):
                    if hasattr(self.pathtoopf, 'manifest'):
                        self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
                    self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
                            plumber.opts)

            # The plugin result may be an object with a manifest rather than
            # a path, in which case it is written out to get a path
            if hasattr(self.pathtoopf, 'manifest'):
                self.pathtoopf = write_oebbook(self.pathtoopf, self.base)

        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
        if getattr(plumber.input_plugin, 'is_kf8', False):
            self.book_format = 'KF8'

        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
        if self.opf is None:
            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.language = self.opf.language
        if self.language:
            self.language = self.language.lower()
        # Linear spine items first, non-linear ones after them
        ordered = [i for i in self.opf.spine if i.is_linear] + \
                  [i for i in self.opf.spine if not i.is_linear]
        self.spine = []
        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
                run_char_count=run_char_count)
        for i in ordered:
            spath = i.path
            mt = None
            if i.idref is not None:
                mt = self.opf.manifest.type_for_id(i.idref)
            if mt is None:
                mt = guess_type(spath)[0]
            try:
                self.spine.append(Spiny(spath, mime_type=mt))
            except Exception:
                self.log.warn('Missing spine item:', repr(spath))

        cover = self.opf.cover
        if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2',
                'azw', 'azw3'}:
            # Generate a titlepage for the cover and make it the first
            # spine item
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [Spiny(cfile,
                mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
           self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(Spiny(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()

        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        start = 1

        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc
        create_indexing_data(self.spine, self.toc)

        self.read_bookmarks()

        return self

    def __exit__(self, *args):
        ''' Exit the temporary directory context and make a best effort
        attempt to remove the files registered in self.delete_on_exit. '''
        self._tdir.__exit__(*args)
        for x in self.delete_on_exit:
            try:
                os.remove(x)
            except Exception:
                # The file may already be gone, ignore failures to remove it
                pass
|
||||||
|
|
||||||
|
|
105
src/calibre/ebooks/oeb/iterator/bookmarks.py
Normal file
105
src/calibre/ebooks/oeb/iterator/bookmarks.py
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import os
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
from calibre.utils.zipfile import safe_replace
|
||||||
|
|
||||||
|
BM_FIELD_SEP = u'*|!|?|*'
|
||||||
|
BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'
|
||||||
|
|
||||||
|
class BookmarksMixin(object):
    '''
    Mixin implementing reading, writing and (de)serialization of bookmarks.

    The methods use the attributes self.bookmarks, self.base,
    self.pathtoebook and self.config, which must be provided by the class
    this is mixed into.
    '''

    def parse_bookmarks(self, raw):
        '''
        Parse serialized bookmarks, one per line, appending a dict for each
        valid one to self.bookmarks. Lines containing '^' are parsed as
        legacy bookmarks (title^spine#pos), lines containing BM_FIELD_SEP as
        cfi bookmarks. Lines that cannot be parsed are skipped.
        '''
        for line in raw.splitlines():
            bm = None
            if line.count('^') > 0:
                tokens = line.rpartition('^')
                title, ref = tokens[0], tokens[2]
                try:
                    spine, _, pos = ref.partition('#')
                    spine = int(spine.strip())
                except ValueError:
                    continue
                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
            elif BM_FIELD_SEP in line:
                try:
                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
                    spine = int(spine)
                except ValueError:
                    continue
                # Unescape from serialization
                pos = pos.replace(BM_LEGACY_ESC, u'^')
                # Check for pos being a scroll fraction
                try:
                    pos = float(pos)
                except ValueError:
                    pass
                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}

            if bm:
                self.bookmarks.append(bm)

    def serialize_bookmarks(self, bookmarks):
        '''
        Return `bookmarks` as a unicode string with one bookmark per line,
        in the format understood by parse_bookmarks().
        '''
        dat = []
        for bm in bookmarks:
            if bm['type'] == 'legacy':
                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
            else:
                pos = bm['pos']
                if isinstance(pos, (int, float)):
                    pos = unicode(pos)
                else:
                    # '^' would make the line parse as a legacy bookmark
                    pos = pos.replace(u'^', BM_LEGACY_ESC)
                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
            dat.append(rec)
        return (u'\n'.join(dat) +u'\n')

    def read_bookmarks(self):
        '''
        Load self.bookmarks from META-INF/calibre_bookmarks.txt in the
        extracted book if that file exists, otherwise from the value saved
        in self.config for this book.
        '''
        self.bookmarks = []
        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
        raw = ''
        if os.path.exists(bmfile):
            with open(bmfile, 'rb') as f:
                raw = f.read()
        else:
            saved = self.config['bookmarks_'+self.pathtoebook]
            if saved:
                raw = saved
        if not isinstance(raw, unicode):
            raw = raw.decode('utf-8')
        self.parse_bookmarks(raw)

    def save_bookmarks(self, bookmarks=None):
        '''
        Save `bookmarks` (self.bookmarks if None). For readable .epub files
        they are stored inside the book itself, and silently not saved if
        the book cannot be opened for writing. For everything else they are
        stored in self.config.
        '''
        if bookmarks is None:
            bookmarks = self.bookmarks
        dat = self.serialize_bookmarks(bookmarks)
        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
            os.access(self.pathtoebook, os.R_OK):
            try:
                zf = open(self.pathtoebook, 'r+b')
            except IOError:
                return
            # Make sure the handle on the book is released once it has been
            # updated
            with zf:
                safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
                        BytesIO(dat.encode('utf-8')),
                        add_missing=True)
        else:
            self.config['bookmarks_'+self.pathtoebook] = dat

    def add_bookmark(self, bm):
        ''' Add the bookmark `bm`, replacing any existing bookmarks with the
        same title, and save the bookmarks. '''
        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
                bm['title']]
        self.bookmarks.append(bm)
        self.save_bookmarks()

    def set_bookmarks(self, bookmarks):
        ''' Replace self.bookmarks with `bookmarks`, without saving. '''
        self.bookmarks = bookmarks
|
||||||
|
|
||||||
|
|
117
src/calibre/ebooks/oeb/iterator/spine.py
Normal file
117
src/calibre/ebooks/oeb/iterator/spine.py
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
from future_builtins import map
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re, os
|
||||||
|
from functools import partial
|
||||||
|
from operator import attrgetter
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
from calibre import guess_type
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
|
def character_count(html):
    ''' Return the number of "significant" text characters in a HTML string.

    Only text runs found between a closing ``>`` and the next ``<`` are
    counted, with every run of whitespace counted as a single character. '''
    squeeze = re.compile(r'\s+').sub
    total = 0
    for chunk in re.findall(r'>[^<]+<', html):
        # chunk still has its delimiting '>' and '<', hence the -2
        total += len(squeeze(' ', chunk)) - 2
    return total
|
||||||
|
|
||||||
|
def anchor_map(html):
    ''' Return map of all anchor names to their offsets in the html

    An anchor name is the quoted value of an ``id`` or ``name`` attribute,
    its offset is the position at which that attribute starts. If a name
    occurs more than once, the offset of its first occurrence is used. '''
    ans = {}
    for match in re.finditer(
            r'''(?:id|name)\s*=\s*['"]([^'"]+)['"]''', html):
        # group(1) is the anchor name itself, group(0) would be the whole
        # attribute (e.g. id="x"), which no lookup by anchor name can match
        anchor = match.group(1)
        ans.setdefault(anchor, match.start())
    return ans
|
||||||
|
|
||||||
|
class SpineItem(unicode):
    '''
    A path to a spine file (a unicode string) annotated with information
    read from that file.

    Attributes set on every instance: ``encoding`` (as reported by
    xml_to_unicode), ``character_count``, ``anchor_map``, ``mime_type``,
    ``index_entries`` (initially empty) and the paging fields
    ``start_page``, ``pages`` and ``max_page`` (all initialised to -1).
    '''

    def __new__(cls, path, mime_type=None, read_anchor_map=True,
            run_char_count=True):
        # If the path only exists once its '#fragment' is removed, use the
        # fragment-less path instead
        if not os.path.exists(path):
            without_fragment = path.partition('#')[0]
            if os.path.exists(without_fragment):
                path = without_fragment

        item = super(SpineItem, cls).__new__(cls, path)
        with open(path, 'rb') as stream:
            data = stream.read()
        text, item.encoding = xml_to_unicode(data)

        # Fixed count of 10000 when character counting is turned off
        if run_char_count:
            item.character_count = character_count(text)
        else:
            item.character_count = 10000
        if read_anchor_map:
            item.anchor_map = anchor_map(text)
        else:
            item.anchor_map = {}

        item.start_page = item.pages = item.max_page = -1
        item.index_entries = []
        item.mime_type = guess_type(item)[0] if mime_type is None else mime_type
        return item
|
||||||
|
|
||||||
|
class IndexEntry(object):
    '''
    Position data for a single table of contents entry.

    ``spine_pos`` is the index in the spine of the file the entry points to
    (-1 if that file is not in the spine), ``anchor_pos`` the offset of the
    entry's anchor in that file (0 if unknown) and ``depth`` the number of
    ancestors the entry has in the table of contents.
    '''

    def __init__(self, spine, toc_entry, num):
        '''
        :param spine: List of spine items, each with an anchor_map attribute
        :param toc_entry: Object with text, abspath, fragment and parent
            attributes
        :param num: Number stored as self.num
        '''
        self.num = num
        self.text = toc_entry.text or _('Unknown')
        self.key = toc_entry.abspath
        self.anchor = self.start_anchor = toc_entry.fragment or None
        try:
            self.spine_pos = spine.index(self.key)
        except ValueError:
            # list.index() raises instead of returning -1 when the file this
            # entry points to is not in the spine
            self.spine_pos = -1
        self.anchor_pos = 0
        if self.spine_pos > -1:
            self.anchor_pos = spine[self.spine_pos].anchor_map.get(self.anchor,
                    0)

        self.depth = 0
        p = toc_entry.parent
        while p is not None:
            self.depth += 1
            p = p.parent

        self.sort_key = (self.spine_pos, self.anchor_pos)
        self.spine_count = len(spine)

    def find_end(self, all_entries):
        '''
        Set self.end_spine_pos and self.end_anchor from the first entry in
        `all_entries` that is not deeper than this one and is positioned
        after it. If there is no such entry, this entry ends at the last
        spine item, with no end anchor.

        :param all_entries: All entries, sorted by their sort_key
        '''
        potential_enders = [i for i in all_entries if
                i.depth <= self.depth and
                (
                    (i.spine_pos == self.spine_pos and i.anchor_pos >
                        self.anchor_pos)
                    or
                    i.spine_pos > self.spine_pos
                )]
        if potential_enders:
            # potential_enders is sorted by (spine_pos, anchor_pos)
            end = potential_enders[0]
            self.end_spine_pos = end.spine_pos
            self.end_anchor = end.anchor
        else:
            self.end_spine_pos = self.spine_count - 1
            self.end_anchor = None
|
||||||
|
|
||||||
|
def create_indexing_data(spine, toc):
    '''
    Record on every spine item which Table of Contents entries touch it.

    An IndexEntry is created for every node yielded by ``toc.flat()`` other
    than ``toc`` itself. For each spine item that an entry spans, an
    ``(entry, start_anchor, end_anchor)`` named tuple is appended to that
    item's ``index_entries`` list.

    :param spine: Sequence of spine items, each with an ``index_entries``
        list.
    :param toc: Root of the TOC tree. Nothing is done if it is falsy.
    '''
    if not toc:
        return

    # A single pass over toc.flat(): entries are numbered by their position
    # in the flattened tree minus one, exactly as before.
    index_entries = [IndexEntry(spine, t, i - 1)
            for i, t in enumerate(toc.flat()) if t is not toc]
    index_entries.sort(key=attrgetter('sort_key'))
    # find_end() relies on the list already being sorted
    for entry in index_entries:
        entry.find_end(index_entries)

    ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')

    for spine_pos, spine_item in enumerate(spine):
        for i in index_entries:
            if i.end_spine_pos < spine_pos or i.spine_pos > spine_pos:
                continue  # Does not touch this file
            # The start anchor lives in the file the entry starts in
            start = i.anchor if i.spine_pos == spine_pos else None
            # The end anchor lives in the file the entry ends in. Testing
            # against i.spine_pos here attached it to the start file (where
            # it does not exist) and dropped it from the file that holds it.
            end = i.end_anchor if i.end_spine_pos == spine_pos else None
            spine_item.index_entries.append(ie(i, start, end))
|
||||||
|
|
@ -18,7 +18,7 @@ from calibre.gui2.widgets import ProgressIndicator
|
|||||||
from calibre.gui2.main_window import MainWindow
|
from calibre.gui2.main_window import MainWindow
|
||||||
from calibre.gui2 import (Application, ORG_NAME, APP_UID, choose_files,
|
from calibre.gui2 import (Application, ORG_NAME, APP_UID, choose_files,
|
||||||
info_dialog, error_dialog, open_url, available_height)
|
info_dialog, error_dialog, open_url, available_height)
|
||||||
from calibre.ebooks.oeb.iterator import EbookIterator
|
from calibre.ebooks.oeb.iterator.book import EbookIterator
|
||||||
from calibre.ebooks import DRMError
|
from calibre.ebooks import DRMError
|
||||||
from calibre.constants import islinux, isbsd, isosx, filesystem_encoding
|
from calibre.constants import islinux, isbsd, isosx, filesystem_encoding
|
||||||
from calibre.utils.config import Config, StringConfig, JSONConfig
|
from calibre.utils.config import Config, StringConfig, JSONConfig
|
||||||
@ -802,11 +802,13 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
|
|||||||
if not title:
|
if not title:
|
||||||
title = os.path.splitext(os.path.basename(pathtoebook))[0]
|
title = os.path.splitext(os.path.basename(pathtoebook))[0]
|
||||||
if self.iterator.toc:
|
if self.iterator.toc:
|
||||||
self.toc_model = TOC(self.iterator.toc)
|
self.toc_model = TOC(self.iterator.spine, self.iterator.toc)
|
||||||
self.toc.setModel(self.toc_model)
|
self.toc.setModel(self.toc_model)
|
||||||
if self.show_toc_on_open:
|
if self.show_toc_on_open:
|
||||||
self.action_table_of_contents.setChecked(True)
|
self.action_table_of_contents.setChecked(True)
|
||||||
else:
|
else:
|
||||||
|
self.toc_model = TOC(self.iterator.spine)
|
||||||
|
self.toc.setModel(self.toc_model)
|
||||||
self.action_table_of_contents.setChecked(False)
|
self.action_table_of_contents.setChecked(False)
|
||||||
if isbytestring(pathtoebook):
|
if isbytestring(pathtoebook):
|
||||||
pathtoebook = force_unicode(pathtoebook, filesystem_encoding)
|
pathtoebook = force_unicode(pathtoebook, filesystem_encoding)
|
||||||
|
@ -8,9 +8,10 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from PyQt4.Qt import QStandardItem, QStandardItemModel, Qt
|
from PyQt4.Qt import QStandardItem, QStandardItemModel, Qt
|
||||||
|
|
||||||
|
from calibre.ebooks.metadata.toc import TOC as MTOC
|
||||||
|
|
||||||
class TOCItem(QStandardItem):
|
class TOCItem(QStandardItem):
|
||||||
|
|
||||||
def __init__(self, toc):
|
def __init__(self, toc):
|
||||||
@ -30,8 +31,10 @@ class TOCItem(QStandardItem):
|
|||||||
|
|
||||||
class TOC(QStandardItemModel):
|
class TOC(QStandardItemModel):
|
||||||
|
|
||||||
def __init__(self, toc):
|
def __init__(self, spine, toc=None):
|
||||||
QStandardItemModel.__init__(self)
|
QStandardItemModel.__init__(self)
|
||||||
|
if toc is None:
|
||||||
|
toc = MTOC()
|
||||||
for t in toc:
|
for t in toc:
|
||||||
self.appendRow(TOCItem(t))
|
self.appendRow(TOCItem(t))
|
||||||
self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents')))
|
self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents')))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user