mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactor oeb iterator and implement logic to collect indexing data
This commit is contained in:
parent
820ba7ec7d
commit
d993fbc91f
@ -1,383 +0,0 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008 Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
'''
|
||||
Iterate over the HTML files in an ebook. Useful for writing viewers.
|
||||
'''
|
||||
|
||||
import re, os, math
|
||||
from cStringIO import StringIO
|
||||
|
||||
from PyQt4.Qt import QFontDatabase
|
||||
|
||||
from calibre.customize.ui import available_input_formats
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.zipfile import safe_replace
|
||||
from calibre.utils.config import DynamicConfig
|
||||
from calibre.utils.logging import Log
|
||||
from calibre import (guess_type, prints, prepare_string_for_xml,
|
||||
xml_replace_entities)
|
||||
from calibre.ebooks.oeb.transforms.cover import CoverManager
|
||||
from calibre.constants import filesystem_encoding
|
||||
|
||||
# Markup for the generated cover page: the cover SVG template with its
# aspect-ratio, viewbox, width and height placeholders filled in. One '%s'
# style slot is left for the (XML escaped) path to the cover image.
TITLEPAGE = (
    CoverManager.SVG_TEMPLATE.decode('utf-8')
    .replace('__ar__', 'none')
    .replace('__viewbox__', '0 0 600 800')
    .replace('__width__', '600')
    .replace('__height__', '800')
)

# Separator between the title, spine index and position fields of a
# serialized (non legacy) bookmark.
BM_FIELD_SEP = u'*|!|?|*'

# Stand-in for '^' inside a serialized bookmark position; '^' itself is the
# marker of legacy bookmark lines.
BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'
|
||||
|
||||
def character_count(html):
    '''
    Count the "significant" text characters in a HTML string.

    Only text sitting between a ``>`` and the next ``<`` is counted and every
    run of whitespace inside it is counted as a single character.
    '''
    collapse_ws = re.compile(r'\s+')
    return sum(
        # -2 discounts the enclosing '>' and '<' that are part of the match
        len(collapse_ws.sub(' ', text_run.group())) - 2
        for text_run in re.finditer(r'>[^<]+<', html))
|
||||
|
||||
class UnsupportedFormatError(Exception):
    '''
    Error whose message states that books in the given format are not
    supported.
    '''

    def __init__(self, fmt):
        # The format name is shown upper cased in the (translated) message
        message = _('%s format books are not supported') % fmt.upper()
        Exception.__init__(self, message)
|
||||
|
||||
class SpineItem(unicode):
    '''
    The path to a spine file (this object is the path itself, as a unicode
    string) carrying some extra data about the file:

    ``encoding``        the encoding reported by xml_to_unicode()
    ``character_count`` result of character_count() on the decoded file
    ``start_page``, ``pages``, ``max_page``
                        pagination data, -1 until filled in by the iterator
    ``mime_type``       as passed in or guessed from the path
    '''

    def __new__(cls, path, mime_type=None):
        # A path that carries a #fragment is reduced to the plain file path,
        # but only when that is what actually exists on disk
        ppath = path.partition('#')[0]
        if not os.path.exists(path) and os.path.exists(ppath):
            path = ppath
        obj = super(SpineItem, cls).__new__(cls, path)
        # Close the file deterministically instead of relying on the garbage
        # collector to do it
        with open(path, 'rb') as f:
            raw = f.read()
        raw, obj.encoding = xml_to_unicode(raw)
        obj.character_count = character_count(raw)
        obj.start_page = -1
        obj.pages = -1
        obj.max_page = -1
        if mime_type is None:
            mime_type = guess_type(obj)[0]
        obj.mime_type = mime_type
        return obj
|
||||
|
||||
class FakeOpts(object):
    '''
    Plain holder of four class level option defaults.

    NOTE(review): nothing in this file uses it; presumably it stands in for a
    conversion options object -- confirm against callers.
    '''
    input_encoding = None
    max_levels = 5
    breadth_first = False
    verbose = 0
|
||||
|
||||
def is_supported(path):
    '''
    Return True if the file extension of `path` is one of the available
    input formats. The htm/html/xhtm/xhtml extensions are all looked up as
    "html".
    '''
    suffix = os.path.splitext(path)[1]
    fmt = suffix.replace('.', '').lower()
    fmt = re.sub(r'(x{0,1})htm(l{0,1})', 'html', fmt)
    return fmt in available_input_formats()
|
||||
|
||||
|
||||
def write_oebbook(oeb, path):
    '''
    Write `oeb` into the directory `path` using OEBWriter and return the path
    of the first .opf file found under `path` (None if there is none).
    '''
    from calibre import walk
    from calibre.ebooks.oeb.writer import OEBWriter

    OEBWriter()(oeb, path)
    for candidate in walk(path):
        if candidate.endswith('.opf'):
            return candidate
|
||||
|
||||
class EbookIterator(object):
    '''
    Explode an ebook into a temporary directory and expose the HTML files
    that make up its spine, together with pagination data, the table of
    contents and the bookmarks of the book.

    Use it as a context manager: the book is unpacked in __enter__() and the
    temporary directory is removed again in __exit__().
    '''

    CHARACTERS_PER_PAGE = 1000

    def __init__(self, pathtoebook, log=None):
        '''
        :param pathtoebook: Path to the ebook file (surrounding whitespace is
                            stripped)
        :param log: Log object to use, a new Log() is created if None
        '''
        self.log = log
        if log is None:
            self.log = Log()
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext.replace('original_', '')

    def search(self, text, index, backwards=False):
        '''
        Case insensitive search for `text` in the spine files after (or, if
        `backwards` is True, before) the spine position `index`.

        :return: The index of the first matching spine item in search
                 direction, None if there is no match
        '''
        text = prepare_string_for_xml(text.lower())
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                try:
                    raw = xml_replace_entities(raw)
                except Exception:
                    # Best effort: search the text with unreplaced entities
                    pass
                if text in raw.lower():
                    return i

    def find_missing_css_files(self):
        ''' Yield the path of every .css file in the directory tree that
        contains the OPF file. '''
        for x in os.walk(os.path.dirname(self.pathtoopf)):
            for f in x[-1]:
                if f.endswith('.css'):
                    yield os.path.join(x[0], f)

    def find_declared_css_files(self):
        ''' Yield the path of every manifest item whose mime type contains
        "css". '''
        for item in self.opf.manifest:
            if item.mime_type and 'css' in item.mime_type.lower():
                yield item.path

    def find_embedded_fonts(self):
        '''
        This will become unnecessary once Qt WebKit supports the @font-face rule.

        Registers the fonts referenced by @font-face rules with
        QFontDatabase and rewrites, in place, the font-family declarations of
        the CSS files to name the family the loaded font actually has.
        '''
        css_files = set(self.find_declared_css_files())
        if not css_files:
            css_files = set(self.find_missing_css_files())
        bad_map = {}
        font_family_pat = re.compile(r'font-family\s*:\s*([^;]+)')
        font_face_pat = re.compile(r'@font-face\s*{([^}]+)}')
        url_pat = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL)
        for csspath in css_files:
            try:
                with open(csspath, 'rb') as f:
                    css = f.read().decode('utf-8', 'replace')
            except Exception:
                # Unreadable stylesheets are skipped
                continue
            for match in font_face_pat.finditer(css):
                block = match.group(1)
                family = font_family_pat.search(block)
                url = url_pat.search(block)
                if url:
                    path = url.group(1).split('/')
                    path = os.path.join(os.path.dirname(csspath), *path)
                    if not os.access(path, os.R_OK):
                        continue
                    font_id = QFontDatabase.addApplicationFont(path)
                    if font_id != -1:
                        families = [unicode(f) for f in
                                QFontDatabase.applicationFontFamilies(font_id)]
                        if family:
                            family = family.group(1)
                            specified_families = [x.strip().replace('"',
                                '').replace("'", '') for x in family.split(',')]
                            aliasing_ok = False
                            for f in specified_families:
                                bad_map[f] = families[0]
                                if not aliasing_ok and f in families:
                                    aliasing_ok = True

                            if not aliasing_ok:
                                prints('WARNING: Family aliasing not fully supported.')
                                prints('\tDeclared family: %r not in actual families: %r'
                                      % (family, families))
                            else:
                                prints('Loaded embedded font:', repr(family))
        if bad_map:
            def prepend_embedded_font(match):
                for bad, good in bad_map.items():
                    if bad in match.group(1):
                        prints('Substituting font family: %s -> %s'%(bad, good))
                        return match.group().replace(bad, '"%s"'%good)
                # Leave declarations that need no substitution untouched. If
                # None were returned here, re.sub() would replace the whole
                # declaration with nothing.
                return match.group()

            from calibre.ebooks.chardet import force_encoding
            for csspath in css_files:
                with open(csspath, 'r+b') as f:
                    css = f.read()
                    enc = force_encoding(css, False)
                    css = css.decode(enc, 'replace')
                    ncss = font_family_pat.sub(prepend_embedded_font, css)
                    if ncss != css:
                        f.seek(0)
                        f.truncate()
                        f.write(ncss.encode(enc))

    def __enter__(self, processed=False, only_input_plugin=False):
        '''
        Unpack the book into a temporary directory and build self.spine,
        self.pages, self.toc and self.bookmarks.

        :param processed: If True also run create_oebbook() on the output of
                          the input plugin
        :param only_input_plugin: If True never run create_oebbook()
        :return: self
        '''
        self.delete_on_exit = []
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base = self._tdir.__enter__()
        if not isinstance(self.base, unicode):
            self.base = self.base.decode(filesystem_encoding)
        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
        plumber = Plumber(self.pathtoebook, self.base, self.log)
        plumber.setup_options()
        if self.pathtoebook.lower().endswith('.opf'):
            plumber.opts.dont_package = True
        if hasattr(plumber.opts, 'no_process'):
            plumber.opts.no_process = True

        plumber.input_plugin.for_viewer = True
        with plumber.input_plugin:
            # Make sure the input file is closed once the plugin is done
            with open(plumber.input, 'rb') as inf:
                self.pathtoopf = plumber.input_plugin(inf,
                    plumber.opts, plumber.input_fmt, self.log,
                    {}, self.base)

        if not only_input_plugin:
            if processed or plumber.input_fmt.lower() in ('pdb', 'pdf', 'rb') and \
                    not hasattr(self.pathtoopf, 'manifest'):
                if hasattr(self.pathtoopf, 'manifest'):
                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
                self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
                        plumber.opts)

        # An object with a manifest is an OEB book, not a path: write it out
        if hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)

        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
        if getattr(plumber.input_plugin, 'is_kf8', False):
            self.book_format = 'KF8'

        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
        if self.opf is None:
            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.language = self.opf.language
        if self.language:
            self.language = self.language.lower()
        # Linear items first, then the non linear ones
        ordered = [i for i in self.opf.spine if i.is_linear] + \
                  [i for i in self.opf.spine if not i.is_linear]
        self.spine = []
        for i in ordered:
            spath = i.path
            mt = None
            if i.idref is not None:
                mt = self.opf.manifest.type_for_id(i.idref)
            if mt is None:
                mt = guess_type(spath)[0]
            try:
                self.spine.append(SpineItem(spath, mime_type=mt))
            except Exception:
                self.log.warn('Missing spine item:', repr(spath))

        cover = self.opf.cover
        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover:
            # Generate a cover page and make it the first spine item
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [SpineItem(cfile,
                mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
           self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(SpineItem(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()

        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        start = 1

        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc

        self.read_bookmarks()

        return self

    def parse_bookmarks(self, raw):
        '''
        Parse the serialized bookmarks in `raw` (one per line) and append
        them to self.bookmarks. Lines that cannot be parsed are ignored.
        '''
        for line in raw.splitlines():
            bm = None
            if line.count('^') > 0:
                # Legacy format: title^spine#pos
                tokens = line.rpartition('^')
                title, ref = tokens[0], tokens[2]
                spine, _sep, pos = ref.partition('#')
                try:
                    spine = int(spine.strip())
                except ValueError:
                    continue
                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
            elif BM_FIELD_SEP in line:
                try:
                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
                    spine = int(spine)
                except ValueError:
                    # Wrong number of fields or a non numeric spine index
                    continue
                # Unescape from serialization
                pos = pos.replace(BM_LEGACY_ESC, u'^')
                # Check for pos being a scroll fraction
                try:
                    pos = float(pos)
                except ValueError:
                    pass
                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}

            if bm:
                self.bookmarks.append(bm)

    def serialize_bookmarks(self, bookmarks):
        '''
        Return `bookmarks` as unicode text, one bookmark per line, in the
        format understood by parse_bookmarks().
        '''
        dat = []
        for bm in bookmarks:
            if bm['type'] == 'legacy':
                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
            else:
                pos = bm['pos']
                if isinstance(pos, (int, float)):
                    pos = unicode(pos)
                else:
                    # A literal ^ would make the line look like a legacy one
                    pos = pos.replace(u'^', BM_LEGACY_ESC)
                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
            dat.append(rec)
        return (u'\n'.join(dat) +u'\n')

    def read_bookmarks(self):
        '''
        (Re)load self.bookmarks, from META-INF/calibre_bookmarks.txt inside
        the exploded book if that file exists, otherwise from self.config.
        '''
        self.bookmarks = []
        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
        raw = ''
        if os.path.exists(bmfile):
            with open(bmfile, 'rb') as f:
                raw = f.read()
        else:
            saved = self.config['bookmarks_'+self.pathtoebook]
            if saved:
                raw = saved
        if not isinstance(raw, unicode):
            raw = raw.decode('utf-8')
        self.parse_bookmarks(raw)

    def save_bookmarks(self, bookmarks=None):
        '''
        Store `bookmarks` (self.bookmarks if None). For a readable .epub file
        they are written into the book itself, otherwise into self.config. If
        the EPUB cannot be opened for writing nothing is saved.
        '''
        if bookmarks is None:
            bookmarks = self.bookmarks
        dat = self.serialize_bookmarks(bookmarks)
        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
            os.access(self.pathtoebook, os.R_OK):
            try:
                zf = open(self.pathtoebook, 'r+b')
            except IOError:
                return
            try:
                safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
                        StringIO(dat.encode('utf-8')),
                        add_missing=True)
            finally:
                # Do not leave the book open after updating it
                zf.close()
        else:
            self.config['bookmarks_'+self.pathtoebook] = dat

    def add_bookmark(self, bm):
        ''' Add the bookmark `bm`, replacing any existing bookmark with the
        same title, and save the bookmarks. '''
        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
                bm['title']]
        self.bookmarks.append(bm)
        self.save_bookmarks()

    def set_bookmarks(self, bookmarks):
        ''' Replace self.bookmarks with `bookmarks` (nothing is saved). '''
        self.bookmarks = bookmarks

    def __exit__(self, *args):
        ''' Remove the temporary directory and any generated files. '''
        self._tdir.__exit__(*args)
        for x in self.delete_on_exit:
            if os.path.exists(x):
                os.remove(x)
|
||||
|
||||
def get_preprocess_html(path_to_ebook, output):
    '''
    Run only the input plugin on the book at `path_to_ebook`, pass every
    spine file through HTMLPreProcessor and write the results, separated by
    a line of dashes, to the file `output` as UTF-8.
    '''
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    iterator = EbookIterator(path_to_ebook)
    iterator.__enter__(only_input_plugin=True)
    try:
        preprocessor = HTMLPreProcessor(None, False)
        with open(output, 'wb') as out:
            for path in iterator.spine:
                with open(path, 'rb') as f:
                    html = f.read().decode('utf-8', 'replace')
                html = preprocessor(html, get_preprocess_html=True)
                out.write(html.encode('utf-8'))
                out.write(b'\n\n' + b'-'*80 + b'\n\n')
    finally:
        # The iterator was entered by hand, so it has to be exited by hand
        # as well or the exploded book is left behind in the temp directory
        iterator.__exit__(None, None, None)
|
||||
|
42
src/calibre/ebooks/oeb/iterator/__init__.py
Normal file
42
src/calibre/ebooks/oeb/iterator/__init__.py
Normal file
@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re
|
||||
|
||||
from calibre.customize.ui import available_input_formats
|
||||
|
||||
def is_supported(path):
    ''' Return True if the extension of `path` (with the htm, html, xhtm and
    xhtml variants all treated as "html") is an available input format. '''
    _base, dotted_ext = os.path.splitext(path)
    normalized = re.sub(r'(x{0,1})htm(l{0,1})', 'html',
            dotted_ext.replace('.', '').lower())
    return normalized in available_input_formats()
|
||||
|
||||
class UnsupportedFormatError(Exception):
    ''' Exception carrying a translated "format not supported" message for
    the given format. '''

    def __init__(self, fmt):
        template = _('%s format books are not supported')
        # The format is reported in upper case
        Exception.__init__(self, template % fmt.upper())
|
||||
|
||||
def EbookIterator(*args, **kwargs):
    ''' For backwards compatibility: create and return an EbookIterator from
    calibre.ebooks.oeb.iterator.book, passing all arguments through. '''
    # Imported on first use rather than when this package is imported
    from calibre.ebooks.oeb.iterator.book import EbookIterator as real_class
    return real_class(*args, **kwargs)
|
||||
|
||||
def get_preprocess_html(path_to_ebook, output):
    '''
    Write the preprocessed HTML of every spine file of the book at
    `path_to_ebook` to the file `output` (UTF-8 encoded, files separated by
    a line of 80 dashes).

    Only the input plugin is run on the book; character counting and anchor
    map generation are turned off.
    '''
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    iterator = EbookIterator(path_to_ebook)
    iterator.__enter__(only_input_plugin=True, run_char_count=False,
            read_anchor_map=False)
    try:
        preprocessor = HTMLPreProcessor(None, False)
        with open(output, 'wb') as out:
            for path in iterator.spine:
                with open(path, 'rb') as f:
                    html = f.read().decode('utf-8', 'replace')
                html = preprocessor(html, get_preprocess_html=True)
                out.write(html.encode('utf-8'))
                out.write(b'\n\n' + b'-'*80 + b'\n\n')
    finally:
        # __enter__() was called directly, so clean up the temporary
        # directory of the iterator explicitly
        iterator.__exit__(None, None, None)
|
||||
|
186
src/calibre/ebooks/oeb/iterator/book.py
Normal file
186
src/calibre/ebooks/oeb/iterator/book.py
Normal file
@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
'''
|
||||
Iterate over the HTML files in an ebook. Useful for writing viewers.
|
||||
'''
|
||||
|
||||
import re, os, math
|
||||
from functools import partial
|
||||
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.utils.config import DynamicConfig
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre import (guess_type, prepare_string_for_xml,
|
||||
xml_replace_entities)
|
||||
from calibre.ebooks.oeb.transforms.cover import CoverManager
|
||||
|
||||
from calibre.ebooks.oeb.iterator.spine import (SpineItem, create_indexing_data)
|
||||
from calibre.ebooks.oeb.iterator.bookmarks import BookmarksMixin
|
||||
|
||||
# Markup of the generated cover page. The placeholders of the cover SVG
# template are filled in one at a time; what remains is a single '%s' style
# slot for the path to the cover image.
TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8')
TITLEPAGE = TITLEPAGE.replace('__ar__', 'none')
TITLEPAGE = TITLEPAGE.replace('__viewbox__', '0 0 600 800')
TITLEPAGE = TITLEPAGE.replace('__width__', '600')
TITLEPAGE = TITLEPAGE.replace('__height__', '800')
|
||||
|
||||
class FakeOpts(object):
    ''' Holder of four option defaults, defined as class attributes.

    NOTE(review): unused in this module; presumably a stand-in for a
    conversion options object -- confirm before removing. '''
    breadth_first = False
    input_encoding = None
    verbose = 0
    max_levels = 5
|
||||
|
||||
|
||||
def write_oebbook(oeb, path):
    ''' Serialize the OEB book `oeb` into the directory `path` and return
    the path to the first OPF file found there, or None if there is none. '''
    from calibre import walk
    from calibre.ebooks.oeb.writer import OEBWriter

    writer = OEBWriter()
    writer(oeb, path)
    for fpath in walk(path):
        if not fpath.endswith('.opf'):
            continue
        return fpath
|
||||
|
||||
class EbookIterator(BookmarksMixin):
    '''
    Explode an ebook into a temporary directory and expose its spine (a list
    of SpineItem), pagination data, table of contents, indexing data and
    bookmarks.

    Use as a context manager: __enter__() does the unpacking and __exit__()
    removes the temporary directory again.
    '''

    CHARACTERS_PER_PAGE = 1000

    def __init__(self, pathtoebook, log=None):
        '''
        :param pathtoebook: Path to the ebook file (surrounding whitespace is
                            stripped)
        :param log: Log object to use, default_log if not specified
        '''
        self.log = log or default_log
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext.replace('original_', '')

    def search(self, text, index, backwards=False):
        '''
        Case insensitive search for `text` in the spine files after (or, if
        `backwards` is True, before) the spine position `index`.

        :return: The index of the first matching spine item in search
                 direction, None if there is no match
        '''
        text = prepare_string_for_xml(text.lower())
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                try:
                    raw = xml_replace_entities(raw)
                except Exception:
                    # Best effort: search the text with unreplaced entities
                    pass
                if text in raw.lower():
                    return i

    def __enter__(self, processed=False, only_input_plugin=False,
            run_char_count=True, read_anchor_map=True):
        ''' Convert an ebook file into an exploded OEB book suitable for
        display in viewers/preprocessing etc.

        :param processed: If True also run create_oebbook() on the output of
                          the input plugin
        :param only_input_plugin: If True never run create_oebbook()
        :param run_char_count: Passed on to every SpineItem
        :param read_anchor_map: Passed on to every SpineItem
        :return: self
        '''

        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook

        self.delete_on_exit = []
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base = self._tdir.__enter__()
        plumber = Plumber(self.pathtoebook, self.base, self.log)
        plumber.setup_options()
        if self.pathtoebook.lower().endswith('.opf'):
            plumber.opts.dont_package = True
        if hasattr(plumber.opts, 'no_process'):
            plumber.opts.no_process = True

        plumber.input_plugin.for_viewer = True
        with plumber.input_plugin, open(plumber.input, 'rb') as inf:
            self.pathtoopf = plumber.input_plugin(inf,
                plumber.opts, plumber.input_fmt, self.log,
                {}, self.base)

        if not only_input_plugin:
            # Run the HTML preprocess/parsing from the conversion pipeline as
            # well
            if (processed or plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'}
                    and not hasattr(self.pathtoopf, 'manifest')):
                if hasattr(self.pathtoopf, 'manifest'):
                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
                self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
                        plumber.opts)

        # An object with a manifest is an OEB book, not a path: write it out
        if hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)

        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
        if getattr(plumber.input_plugin, 'is_kf8', False):
            self.book_format = 'KF8'

        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
        if self.opf is None:
            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.language = self.opf.language
        if self.language:
            self.language = self.language.lower()
        # Linear items first, then the non linear ones
        ordered = [i for i in self.opf.spine if i.is_linear] + \
                  [i for i in self.opf.spine if not i.is_linear]
        self.spine = []
        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
                run_char_count=run_char_count)
        for i in ordered:
            spath = i.path
            mt = None
            if i.idref is not None:
                mt = self.opf.manifest.type_for_id(i.idref)
            if mt is None:
                mt = guess_type(spath)[0]
            try:
                self.spine.append(Spiny(spath, mime_type=mt))
            except Exception:
                self.log.warn('Missing spine item:', repr(spath))

        cover = self.opf.cover
        if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2',
                'azw', 'azw3'}:
            # Generate a cover page and make it the first spine item
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [Spiny(cfile,
                mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
           self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(Spiny(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()

        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        start = 1

        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc
        create_indexing_data(self.spine, self.toc)

        self.read_bookmarks()

        return self

    def __exit__(self, *args):
        ''' Remove the temporary directory and any generated files. '''
        self._tdir.__exit__(*args)
        for x in self.delete_on_exit:
            try:
                os.remove(x)
            except EnvironmentError:
                # Typically already removed along with the temp directory
                pass
|
||||
|
||||
|
105
src/calibre/ebooks/oeb/iterator/bookmarks.py
Normal file
105
src/calibre/ebooks/oeb/iterator/bookmarks.py
Normal file
@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.utils.zipfile import safe_replace
|
||||
|
||||
# Separator between the title, spine index and position fields of a
# serialized (non legacy) bookmark
BM_FIELD_SEP = u'*|!|?|*'
# Stand-in for '^' inside a serialized position, as '^' marks legacy lines
BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'

class BookmarksMixin(object):
    '''
    Bookmark parsing, serialization and storage.

    The methods use the following attributes of the object they are mixed
    into: ``bookmarks`` (list of bookmark dicts), ``base`` (directory of the
    exploded book), ``pathtoebook`` and ``config`` (a dict like store).
    '''

    def parse_bookmarks(self, raw):
        '''
        Parse the serialized bookmarks in `raw` (one per line) and append
        them to self.bookmarks. Lines that cannot be parsed are ignored.
        '''
        for line in raw.splitlines():
            bm = None
            if line.count('^') > 0:
                # Legacy format: title^spine#pos
                tokens = line.rpartition('^')
                title, ref = tokens[0], tokens[2]
                spine, _sep, pos = ref.partition('#')
                try:
                    spine = int(spine.strip())
                except ValueError:
                    continue
                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
            elif BM_FIELD_SEP in line:
                try:
                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
                    spine = int(spine)
                except ValueError:
                    # Wrong number of fields or a non numeric spine index
                    continue
                # Unescape from serialization
                pos = pos.replace(BM_LEGACY_ESC, u'^')
                # Check for pos being a scroll fraction
                try:
                    pos = float(pos)
                except ValueError:
                    pass
                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}

            if bm:
                self.bookmarks.append(bm)

    def serialize_bookmarks(self, bookmarks):
        '''
        Return `bookmarks` as unicode text, one bookmark per line, in the
        format understood by parse_bookmarks().
        '''
        dat = []
        for bm in bookmarks:
            if bm['type'] == 'legacy':
                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
            else:
                pos = bm['pos']
                if isinstance(pos, (int, float)):
                    pos = unicode(pos)
                else:
                    # A literal ^ would make the line look like a legacy one
                    pos = pos.replace(u'^', BM_LEGACY_ESC)
                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
            dat.append(rec)
        return (u'\n'.join(dat) +u'\n')

    def read_bookmarks(self):
        '''
        (Re)load self.bookmarks, from META-INF/calibre_bookmarks.txt inside
        the exploded book if that file exists, otherwise from self.config.
        '''
        self.bookmarks = []
        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
        raw = ''
        if os.path.exists(bmfile):
            with open(bmfile, 'rb') as f:
                raw = f.read()
        else:
            saved = self.config['bookmarks_'+self.pathtoebook]
            if saved:
                raw = saved
        if not isinstance(raw, unicode):
            raw = raw.decode('utf-8')
        self.parse_bookmarks(raw)

    def save_bookmarks(self, bookmarks=None):
        '''
        Store `bookmarks` (self.bookmarks if None). For a readable .epub file
        they are written into the book itself, otherwise into self.config. If
        the EPUB cannot be opened for writing nothing is saved.
        '''
        if bookmarks is None:
            bookmarks = self.bookmarks
        dat = self.serialize_bookmarks(bookmarks)
        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
            os.access(self.pathtoebook, os.R_OK):
            try:
                zf = open(self.pathtoebook, 'r+b')
            except IOError:
                return
            try:
                safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
                        BytesIO(dat.encode('utf-8')),
                        add_missing=True)
            finally:
                # Do not leave the book open after updating it
                zf.close()
        else:
            self.config['bookmarks_'+self.pathtoebook] = dat

    def add_bookmark(self, bm):
        ''' Add the bookmark `bm`, replacing any existing bookmark with the
        same title, and save the bookmarks. '''
        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
                bm['title']]
        self.bookmarks.append(bm)
        self.save_bookmarks()

    def set_bookmarks(self, bookmarks):
        ''' Replace self.bookmarks with `bookmarks` (nothing is saved). '''
        self.bookmarks = bookmarks
|
||||
|
||||
|
117
src/calibre/ebooks/oeb/iterator/spine.py
Normal file
117
src/calibre/ebooks/oeb/iterator/spine.py
Normal file
@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
from future_builtins import map
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, os
|
||||
from functools import partial
|
||||
from operator import attrgetter
|
||||
from collections import namedtuple
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
|
||||
def character_count(html):
    ''' Return the number of "significant" text characters in a HTML string:
    the characters between a > and the following <, with every run of
    whitespace counted as one character. '''
    whitespace_run = re.compile(r'\s+')
    total = 0
    for chunk in re.findall(r'>[^<]+<', html):
        # chunk still includes the enclosing > and <, hence the -2
        total += len(whitespace_run.sub(' ', chunk)) - 2
    return total
|
||||
|
||||
def anchor_map(html):
|
||||
''' Return map of all anchor names to their offsets in the html '''
|
||||
ans = {}
|
||||
for match in re.finditer(
|
||||
r'''(?:id|name)\s*=\s*['"]([^'"]+)['"]''', html):
|
||||
anchor = match.group(0)
|
||||
ans[anchor] = ans.get(anchor, match.start())
|
||||
return ans
|
||||
|
||||
class SpineItem(unicode):
    '''
    The path to a spine file (the object is the path, as a unicode string)
    with data about the file attached as attributes: ``encoding``,
    ``character_count``, ``anchor_map``, ``index_entries``, ``mime_type`` and
    the pagination fields ``start_page``, ``pages`` and ``max_page`` (all -1
    initially).
    '''

    def __new__(cls, path, mime_type=None, read_anchor_map=True,
            run_char_count=True):
        # Fall back to the path without its #fragment, but only if that is
        # what actually exists on disk
        bare_path = path.partition('#')[0]
        path_exists = os.path.exists(path)
        if not path_exists and os.path.exists(bare_path):
            path = bare_path
        item = super(SpineItem, cls).__new__(cls, path)
        with open(path, 'rb') as src:
            markup, item.encoding = xml_to_unicode(src.read())

        if run_char_count:
            item.character_count = character_count(markup)
        else:
            # Fixed value used when counting is turned off
            item.character_count = 10000
        if read_anchor_map:
            item.anchor_map = anchor_map(markup)
        else:
            item.anchor_map = {}

        item.start_page = item.pages = item.max_page = -1
        item.index_entries = []
        item.mime_type = guess_type(item)[0] if mime_type is None else mime_type
        return item
|
||||
|
||||
class IndexEntry(object):
    '''
    Position data for a single table of contents entry.

    ``spine_pos`` is the index in the spine of the file the entry points to
    (-1 if that file is not in the spine) and ``anchor_pos`` the offset of
    the anchor of the entry in that file (0 if unknown). find_end() must be
    called before ``end_spine_pos`` and ``end_anchor`` are available.
    '''

    def __init__(self, spine, toc_entry, num):
        '''
        :param spine: List of SpineItem
        :param toc_entry: The TOC node (text, abspath, fragment and parent
                          are read from it)
        :param num: Number of this entry, stored as self.num
        '''
        self.num = num
        self.text = toc_entry.text or _('Unknown')
        self.key = toc_entry.abspath
        self.anchor = self.start_anchor = toc_entry.fragment or None
        try:
            self.spine_pos = spine.index(self.key)
        except ValueError:
            # list.index() raises for a missing item, it never returns -1.
            # A TOC entry that points outside the spine must not abort
            # indexing of the whole book.
            self.spine_pos = -1
        self.anchor_pos = 0
        if self.spine_pos > -1:
            self.anchor_pos = spine[self.spine_pos].anchor_map.get(self.anchor,
                    0)

        # Nesting level: the number of ancestors of the TOC node
        self.depth = 0
        p = toc_entry.parent
        while p is not None:
            self.depth += 1
            p = p.parent

        self.sort_key = (self.spine_pos, self.anchor_pos)
        self.spine_count = len(spine)

    def find_end(self, all_entries):
        '''
        Set self.end_spine_pos and self.end_anchor from the first entry in
        `all_entries` that is at the same or a shallower depth and starts
        after this entry. If there is no such entry, this entry runs to the
        last spine item.

        :param all_entries: All index entries, sorted by their sort_key
        '''
        potential_enders = [i for i in all_entries if
                i.depth <= self.depth and
                (
                    (i.spine_pos == self.spine_pos and i.anchor_pos >
                        self.anchor_pos)
                    or
                    i.spine_pos > self.spine_pos
                )]
        if potential_enders:
            # potential_enders is sorted by (spine_pos, anchor_pos)
            end = potential_enders[0]
            self.end_spine_pos = end.spine_pos
            self.end_anchor = end.anchor
        else:
            self.end_spine_pos = self.spine_count - 1
            self.end_anchor = None
|
||||
|
||||
def create_indexing_data(spine, toc):
    '''
    For every item in `spine`, fill its index_entries list with one
    (entry, start_anchor, end_anchor) tuple per table of contents entry that
    touches that file. Does nothing if `toc` is empty.
    '''
    if not toc:
        return

    # One IndexEntry per TOC node. The root node itself is skipped and the
    # numbering is shifted down by one.
    entries = [IndexEntry(spine, node, pos - 1)
               for pos, node in enumerate(toc.flat()) if node is not toc]
    entries = sorted(entries, key=attrgetter('sort_key'))
    for entry in entries:
        entry.find_end(entries)

    ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')

    for spine_pos, spine_item in enumerate(spine):
        for entry in entries:
            touches_file = entry.spine_pos <= spine_pos <= entry.end_spine_pos
            if not touches_file:
                continue
            starts_here = entry.spine_pos == spine_pos
            start = entry.anchor if starts_here else None
            # NOTE(review): the end anchor is reported for the file the entry
            # starts in, even when the entry ends in a later file
            # (end_spine_pos) -- confirm that this is intended.
            end = entry.end_anchor if starts_here else None
            spine_item.index_entries.append(ie(entry, start, end))
|
||||
|
@ -18,7 +18,7 @@ from calibre.gui2.widgets import ProgressIndicator
|
||||
from calibre.gui2.main_window import MainWindow
|
||||
from calibre.gui2 import (Application, ORG_NAME, APP_UID, choose_files,
|
||||
info_dialog, error_dialog, open_url, available_height)
|
||||
from calibre.ebooks.oeb.iterator import EbookIterator
|
||||
from calibre.ebooks.oeb.iterator.book import EbookIterator
|
||||
from calibre.ebooks import DRMError
|
||||
from calibre.constants import islinux, isbsd, isosx, filesystem_encoding
|
||||
from calibre.utils.config import Config, StringConfig, JSONConfig
|
||||
@ -802,11 +802,13 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
|
||||
if not title:
|
||||
title = os.path.splitext(os.path.basename(pathtoebook))[0]
|
||||
if self.iterator.toc:
|
||||
self.toc_model = TOC(self.iterator.toc)
|
||||
self.toc_model = TOC(self.iterator.spine, self.iterator.toc)
|
||||
self.toc.setModel(self.toc_model)
|
||||
if self.show_toc_on_open:
|
||||
self.action_table_of_contents.setChecked(True)
|
||||
else:
|
||||
self.toc_model = TOC(self.iterator.spine)
|
||||
self.toc.setModel(self.toc_model)
|
||||
self.action_table_of_contents.setChecked(False)
|
||||
if isbytestring(pathtoebook):
|
||||
pathtoebook = force_unicode(pathtoebook, filesystem_encoding)
|
||||
|
@ -8,9 +8,10 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from PyQt4.Qt import QStandardItem, QStandardItemModel, Qt
|
||||
|
||||
from calibre.ebooks.metadata.toc import TOC as MTOC
|
||||
|
||||
class TOCItem(QStandardItem):
|
||||
|
||||
def __init__(self, toc):
|
||||
@ -30,8 +31,10 @@ class TOCItem(QStandardItem):
|
||||
|
||||
class TOC(QStandardItemModel):
|
||||
|
||||
def __init__(self, toc):
|
||||
def __init__(self, spine, toc=None):
|
||||
QStandardItemModel.__init__(self)
|
||||
if toc is None:
|
||||
toc = MTOC()
|
||||
for t in toc:
|
||||
self.appendRow(TOCItem(t))
|
||||
self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents')))
|
||||
|
Loading…
x
Reference in New Issue
Block a user