mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Python parser for EPUB CFI
This commit is contained in:
parent
63db9cbddb
commit
caad45e8d8
0
src/calibre/ebooks/epub/cfi/__init__.py
Normal file
0
src/calibre/ebooks/epub/cfi/__init__.py
Normal file
41
src/calibre/ebooks/epub/cfi/epubcfi.ebnf
Normal file
41
src/calibre/ebooks/epub/cfi/epubcfi.ebnf
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
(*
    Adapted from http://www.idpf.org/epub/linking/cfi/epub-cfi.html

    Changes from spec:

    1) Text location assertion is only allowed after a text offset instead of after any kind of offset

    2) An offset is not allowed immediately after a redirect as it makes no sense

    Intended to be used with grako, like this

    grako -n -w "" epubcfi.ebnf -o epubcfi.py
*)

(* A full CFI: a parent path, optionally followed by the start and end paths of a range *)
fragment = "epubcfi(" parent:path [ "," start:path "," end:path ] ")";

(* One or more steps, terminated by either a redirect into another path or an offset *)
path = steps:( { step }+ ) [ ( "!" redirect:path ) | offset:offset ];

(* A single step, with an optional id assertion *)
step = "/" num:integer [ "[" id_assertion:characters "]" ];

text_offset = ":" char_offset:integer [ "[" text_assertion:text_assertion "]" ];

spatial_offset = "@" x:number ":" y:number;

temporal_offset = "~" t:number;

(* The combined spatio-temporal form must be tried before the plain temporal form, as both start with "~" *)
offset = (text_offset:text_offset) | (spatio_temporal_offset:(temporal_offset spatial_offset)) | (temporal_offset:temporal_offset) | (spatial_offset:spatial_offset);

text_assertion = [ ( ( before:characters [ "," after:characters ] ) | ( "," after:characters ) ) ] [ parameters:{parameter} ];

parameter = ";" name:characters_no_space "=" { value+:characters [","] }+;

(* No leading zeros allowed in integers *)
integer = /0|(?:[1-9][0-9]*)/;

(* No leading zeros, except for numbers in [0, 1) and no trailing zeros for the fractional part.
   A bare 0 is accepted, matching the number pattern used by parse.py (e.g. the offset ~0) *)
number = /(?:[1-9][0-9]*(?:[.][0-9]*[1-9]){0,1})|(?:0(?:[.][0-9]*[1-9]){0,1})/;

(* All valid unicode characters, except for the special chars which are preceded by a ^ *)
characters = /(?:[-\u0009\u000a\u000d\u0020-\u0027\u002a\u002b\u002e-\u003a\u003c\u003e-\u005a\u005c\u005f-\ud7ff\ue000-\ufffd\U00010000-\U0010FFFF]|(?:\^[[\](),;=^]))+/;

(* Same as characters, but without the space character (U+0020) *)
characters_no_space = /(?:[-\u0009\u000a\u000d\u0021-\u0027\u002a\u002b\u002e-\u003a\u003c\u003e-\u005a\u005c\u005f-\ud7ff\ue000-\ufffd\U00010000-\U0010FFFF]|(?:\^[[\](),;=^]))+/;
|
||||||
|
|
169
src/calibre/ebooks/epub/cfi/parse.py
Normal file
169
src/calibre/ebooks/epub/cfi/parse.py
Normal file
@ -0,0 +1,169 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
import regex
|
||||||
|
from future_builtins import map, zip
|
||||||
|
|
||||||
|
class Parser(object):

    ''' Hand written parser for EPUB CFIs.

    See epubcfi.ebnf for the specification that this parser tries to
    follow. I have implemented it manually, since I don't want to depend on
    grako, and the grammar is pretty simple. This parser is thread-safe, i.e.
    it can be used from multiple threads simultaneously.

    A parsed path is a dict with a 'steps' list and, optionally, a 'redirect'
    key holding the nested path that follows a "!". Every step is a dict with
    a 'num' key and optional 'id', 'text_offset', 'temporal_offset',
    'spatial_offset' and 'text_assertion' keys. '''

    def __init__(self):
        ' Compile all the patterns used by the parse_* methods. '
        # All allowed unicode characters + escaped special characters
        special_char = r'[\[\](),;=^]'
        # The first %s is the lowest printable code point of the class. Using
        # a template lets us build the variant that excludes the space
        # character (U+0020) by starting the range at "!" (U+0021) instead.
        unescaped_template = '[[\t\n\r%s-\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]--%s]'
        escaped_char = r'\^' + special_char
        chars_template = r'(?:%s|(?:%s))+'
        chars = chars_template % (
            unescaped_template % (' ', special_char), escaped_char)
        chars_no_space = chars_template % (
            unescaped_template % ('!', special_char), escaped_char)
        # No leading zeros allowed for integers
        integer = r'(?:[1-9][0-9]*)|0'
        # No leading zeros, except for numbers in [0, 1) and no trailing zeros for the fractional part
        frac = r'\.[0-9]*[1-9]'
        number = r'(?:[1-9][0-9]*(?:{0})?)|(?:0{0})|(?:0)'.format(frac)

        def c(pat):
            # VERSION1 is needed for the [[...]--[...]] set difference syntax
            return regex.compile(pat, flags=regex.VERSION1)

        # A step of the form /integer
        self.step_pat = c(r'/(%s)' % integer)
        # An id assertion of the form [characters]
        self.id_assertion_pat = c(r'\[(%s)\]' % chars)

        # A text offset of the form :integer
        self.text_offset_pat = c(r':(%s)' % integer)
        # A temporal offset of the form ~number
        self.temporal_offset_pat = c(r'~(%s)' % number)
        # A spatial offset of the form @number:number
        self.spatial_offset_pat = c(r'@({0}):({0})'.format(number))
        # A spatio-temporal offset of the form ~number@number:number
        self.st_offset_pat = c(r'~({0})@({0}):({0})'.format(number))

        # Text assertion patterns
        self.ta1_pat = c(r'({0})(?:,({0})){{0,1}}'.format(chars))
        self.ta2_pat = c(r',(%s)' % chars)
        self.parameters_pat = c(r'(?:;(%s)=((?:%s,?)+))+' % (chars_no_space, chars))
        self.csv_pat = c(r'(?:(%s),?)+' % chars)

        # Unescape characters: drop the ^ in front of an escaped special char
        unescape_pat = c(r'%s(%s)' % (escaped_char[:2], escaped_char[2:]))
        self.unescape = lambda x: unescape_pat.sub(r'\1', x)

    def parse_epubcfi(self, raw):
        ''' Parse a full epubcfi of the form epubcfi(path [ , path , path ])

        Returns the tuple (parent_cfi, start_cfi, end_cfi, leftover) where
        leftover is whatever follows the closing parenthesis. start_cfi and
        end_cfi are empty dicts when the CFI is not a range. If raw is not a
        valid CFI, ({}, {}, {}, raw) is returned with raw unchanged. '''
        null = {}, {}, {}, raw
        if not raw.startswith('epubcfi('):
            return null
        raw = raw[len('epubcfi('):]
        parent_cfi, raw = self.parse_path(raw)
        if not parent_cfi:
            return null
        start_cfi, end_cfi = {}, {}
        if raw.startswith(','):
            start_cfi, raw = self.parse_path(raw[1:])
            if raw.startswith(','):
                end_cfi, raw = self.parse_path(raw[1:])
            # A range needs both of its end points
            if not start_cfi or not end_cfi:
                return null
        if not raw.startswith(')'):
            return null
        return parent_cfi, start_cfi, end_cfi, raw[1:]

    def parse_path(self, raw):
        ''' Parse the path component of an epubcfi of the form /step...

        Returns (path, leftover). path is an empty dict if raw does not start
        with at least one step. '''
        path = {'steps':[]}
        raw = self._parse_path(raw, path)
        if not path['steps']:
            path = {}
        return path, raw

    def do_match(self, pat, raw):
        ''' Match pat at the start of raw. Returns (match, remaining) where
        remaining is raw with the matched text removed, or raw itself if
        there was no match (in which case match is None). '''
        m = pat.match(raw)
        if m is not None:
            raw = raw[len(m.group()):]
        return m, raw

    def _parse_path(self, raw, ans):
        ''' Parse as many steps as possible from raw into ans, following
        redirects, and return the unparsed remainder. Implemented as a loop
        so that long paths cannot exhaust the interpreter recursion limit. '''
        while True:
            m, raw = self.do_match(self.step_pat, raw)
            if m is None:
                return raw
            step = {'num':int(m.group(1))}
            ans['steps'].append(step)
            m, raw = self.do_match(self.id_assertion_pat, raw)
            if m is not None:
                step['id'] = self.unescape(m.group(1))
            if raw.startswith('!'):
                # Everything after the ! belongs to the redirected path
                redirect = {'steps':[]}
                ans['redirect'] = redirect
                ans = redirect
                raw = raw[1:]
                continue
            remaining_raw = self.parse_offset(raw, step)
            if remaining_raw is not None:
                # An offset terminates the path
                return remaining_raw

    def parse_offset(self, raw, ans):
        ''' Parse an optional offset at the start of raw, storing it in the
        step dict ans. Returns the unparsed remainder, or None if raw does
        not start with an offset. '''
        m, raw = self.do_match(self.text_offset_pat, raw)
        if m is not None:
            ans['text_offset'] = int(m.group(1))
            return self.parse_text_assertion(raw, ans)
        # Must be tried before the temporal pattern, which matches a prefix
        # of a spatio-temporal offset
        m, raw = self.do_match(self.st_offset_pat, raw)
        if m is not None:
            t, x, y = m.groups()
            ans['temporal_offset'] = float(t)
            ans['spatial_offset'] = tuple(map(float, (x, y)))
            return raw
        m, raw = self.do_match(self.temporal_offset_pat, raw)
        if m is not None:
            ans['temporal_offset'] = float(m.group(1))
            return raw
        m, raw = self.do_match(self.spatial_offset_pat, raw)
        if m is not None:
            ans['spatial_offset'] = tuple(map(float, m.groups()))
            return raw
        return None

    def parse_text_assertion(self, raw, ans):
        ''' Parse an optional [text assertion] at the start of raw, storing
        it in the step dict ans. Returns the unparsed remainder. If the
        assertion is missing or malformed, raw is returned unchanged and
        nothing is stored. '''
        oraw = raw
        if not raw.startswith('['):
            return oraw
        raw = raw[1:]
        ta = {}
        m, raw = self.do_match(self.ta1_pat, raw)
        if m is not None:
            before, after = m.groups()
            ta['before'] = self.unescape(before)
            if after is not None:
                ta['after'] = self.unescape(after)
        else:
            m, raw = self.do_match(self.ta2_pat, raw)
            if m is not None:
                ta['after'] = self.unescape(m.group(1))

        # parse parameters
        m, raw = self.do_match(self.parameters_pat, raw)
        if m is not None:
            params = {}
            for name, value in zip(m.captures(1), m.captures(2)):
                # Names can contain ^ escaped special characters, just like
                # values, so they have to be unescaped as well
                params[self.unescape(name)] = tuple(map(
                    self.unescape, self.csv_pat.match(value).captures(1)))
            if params:
                ta['params'] = params

        if not raw.startswith(']'):
            return oraw  # no closing ] or extra content in the assertion

        if ta:
            ans['text_assertion'] = ta
        return raw[1:]
|
||||||
|
|
||||||
|
# Module level Parser instance, created lazily by parser()
_parser = None


def parser():
    ' Return the shared Parser instance, creating it on first use. '
    global _parser
    instance = _parser
    if instance is None:
        instance = _parser = Parser()
    return instance
|
||||||
|
|
85
src/calibre/ebooks/epub/cfi/tests.py
Normal file
85
src/calibre/ebooks/epub/cfi/tests.py
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from future_builtins import map
|
||||||
|
|
||||||
|
from calibre.ebooks.epub.cfi.parse import parser
|
||||||
|
|
||||||
|
class Tests(unittest.TestCase):

    ' Tests for the EPUB CFI parser. '

    def test_parsing(self):
        ' Check that parse_path() produces the expected path and leftover text. '
        p = parser()

        def step(x):
            # A step is given either as a bare number or as a (number, id) pair
            if not isinstance(x, int):
                return {'num':x[0], 'id':x[1]}
            return {'num': x}

        def s(*args):
            # A path made up only of steps
            return {'steps':[step(x) for x in args]}

        def r(*args):
            # A path with a redirect, the '!' argument marks the split
            split_at = args.index('!')
            before, after = args[:split_at], args[split_at+1:]
            path = s(*before)
            path['redirect'] = s(*after)
            return path

        def o(*args):
            # The path /1 with one or two offsets on its only step
            path = s(1)
            last_step = path['steps'][-1]
            first_keys = {'@':'spatial_offset', '~':'temporal_offset', ':':'text_offset'}
            second_keys = {'@':'spatial_offset', '~':'temporal_offset'}
            typ, val = args[:2]
            last_step[first_keys[typ]] = val
            if len(args) == 4:
                typ, val = args[2:]
                last_step[second_keys[typ]] = val
            return path

        def a(before=None, after=None, **params):
            # The path /1:3 with a text assertion on its only step
            path = o(':', 3)
            assertion = {}
            if before is not None:
                assertion['before'] = before
            if after is not None:
                assertion['after'] = after
            if params:
                # Single valued parameters are written as plain strings in
                # the table below, the parser always returns tuples
                assertion['params'] = {
                    unicode(k):(v,) if isinstance(v, unicode) else v
                    for k, v in params.iteritems()}
            if assertion:
                path['steps'][-1]['text_assertion'] = assertion
            return path

        cases = [
            # Test parsing of steps
            ('/2', s(2), ''),
            ('/2/3/4', s(2, 3, 4), ''),
            ('/1/2[some^,^^id]/3', s(1, (2, 'some,^id'), 3), ''),
            ('/1/2!/3/4', r(1, 2, '!', 3, 4), ''),
            ('/1/2[id]!/3/4', r(1, (2, 'id'), '!', 3, 4), ''),
            ('/1!/2[id]/3/4', r(1, '!', (2, 'id'), 3, 4), ''),

            # Test parsing of offsets
            ('/1~0', o('~', 0), ''),
            ('/1~7', o('~', 7), ''),
            ('/1~43.1', o('~', 43.1), ''),
            ('/1~0.01', o('~', 0.01), ''),
            ('/1~1.301', o('~', 1.301), ''),
            ('/1@23:34.1', o('@', (23, 34.1)), ''),
            ('/1~3@3.1:2.3', o('~', 3.0, '@', (3.1, 2.3)), ''),
            ('/1:0', o(':', 0), ''),
            ('/1:3', o(':', 3), ''),

            # Test parsing of text assertions
            ('/1:3[aa^,b]', a('aa,b'), ''),
            ('/1:3[aa^,b,c1]', a('aa,b', 'c1'), ''),
            ('/1:3[,aa^,b]', a(after='aa,b'), ''),
            ('/1:3[;s=a]', a(s='a'), ''),
            ('/1:3[a;s=a]', a('a', s='a'), ''),
            ('/1:3[a;s=a^,b,c^;d;x=y]', a('a', s=('a,b', 'c;d'), x='y'), ''),

        ]
        for raw, path, leftover in cases:
            self.assertEqual(p.parse_path(raw), (path, leftover))
|
||||||
|
|
||||||
|
# Run the tests verbosely when this module is executed as a script
if __name__ == '__main__':
    loader = unittest.TestLoader()
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(loader.loadTestsFromTestCase(Tests))
|
Loading…
x
Reference in New Issue
Block a user