#!/usr/bin/env python
# vim:fileencoding=utf-8
#
# Python parser for EPUB CFI (Canonical Fragment Identifiers).
#
# Provenance: commit caad45e8d88a1027f603356f11d2e6ef8528a946
# Author: Kovid Goyal, Sun, 10 Aug 2014 07:48:09 +0530
# Subject: [PATCH] Python parser for EPUB CFI
# The commit added these files:
#   src/calibre/ebooks/epub/cfi/__init__.py   (empty)
#   src/calibre/ebooks/epub/cfi/epubcfi.ebnf  (the grammar, reproduced below)
#   src/calibre/ebooks/epub/cfi/parse.py      (Parser and parser())
#   src/calibre/ebooks/epub/cfi/tests.py      (Tests)
#
# ----------------------------------------------------------------------------
# epubcfi.ebnf
# ----------------------------------------------------------------------------
# (*
# Adapted from http://www.idpf.org/epub/linking/cfi/epub-cfi.html
#
# Changes from spec:
#
# 1) Text location assertion is only allowed after a text offset instead of
#    after any kind of offset
#
# 2) An offset is not allowed immediately after a redirect as it makes no sense
#
# Intended to be used with grako, like this
# grako -n -w "" epubcfi.ebnf -o epubcfi.py
# *)
#
# fragment = "epubcfi(" parent:path [ "," start:path "," end:path ] ")";
#
# path = steps:( { step }+ ) [ ( "!" redirect:path ) | offset:offset ];
#
# step = "/" num:integer [ "[" id_assertion:characters "]" ];
#
# text_offset = ":" char_offset:integer [ "[" text_assertion:text_assertion "]" ];
#
# spatial_offset = "@" x:number ":" y:number;
#
# temporal_offset = "~" t:number;
#
# offset = (text_offset:text_offset) | (spatio_temporal_offset:(temporal_offset spatial_offset)) | (temporal_offset:temporal_offset) | (spatial_offset:spatial_offset);
#
# text_assertion = [ ( ( before:characters [ "," after:characters ] ) | ( "," after:characters ) ) ] [ parameters:{parameter} ];
#
# parameter = ";" name:characters_no_space "=" { value+:characters [","] }+;
#
# (* No leading zeros allowed in integers *)
# integer = /0|(?:[1-9][0-9]*)/;
#
# (* No leading zeros, except for numbers in (0, 1) and no trailing zeros for the fractional part *)
# number = /(?:[1-9][0-9]*(?:[.][0-9]*[1-9]){0,1})|(?:0[.][0-9]*[1-9])/;
#
# (* All valid unicode characters, except for the special chars which are preceded by a ^ *)
# characters = /(?:[-\u0009\u000a\u000d\u0020-\u0027\u002a\u002b\u002e-\u003a\u003c\u003e-\u005a\u005c\u005f-\ud7ff\ue000-\ufffd\U00010000-\U0010FFFF]|(?:\^[[\](),;=^]))+/;
# characters_no_space = /(?:[-\u0009\u000a\u000d\u0021-\u0027\u002a\u002b\u002e-\u003a\u003c\u003e-\u005a\u005c\u005f-\ud7ff\ue000-\ufffd\U00010000-\U0010FFFF]|(?:\^[[\](),;=^]))+/;
# ----------------------------------------------------------------------------

__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal '

import re
import unittest


class Parser(object):

    ''' Parser for EPUB CFIs. See the epubcfi.ebnf grammar reproduced at the
    top of this file for the specification that this parser tries to follow.
    It is implemented by hand, using only the standard library re module,
    since the grammar is pretty simple and this avoids depending on grako.

    The parse methods only read the patterns compiled in __init__ and never
    modify the Parser instance, so a single instance can be used from
    multiple threads simultaneously.

    Parsed paths are plain dicts of the form::

        {'steps': [{'num': int, 'id': text, 'text_offset': int, ...}, ...],
         'redirect': {<another path dict>}}

    where every key other than 'steps' and 'num' is optional. '''

    def __init__(self):
        ''' Compile all the patterns used by the parse methods. '''
        # Characters that have to be escaped with a ^ to be used literally
        special_char = r'[\[\](),;=^]'
        escaped_char = r'\^' + special_char

        def chars_pattern(first):
            # One or more allowed unicode characters, where the printable
            # range starts at the character `first`. The special characters
            # are excluded with a negative lookahead and are only accepted
            # in their escaped form.
            unescaped_char = (
                u'(?!%s)[\t\n\r%s-\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]' % (
                    special_char, first))
            return r'(?:%s|(?:%s))+' % (unescaped_char, escaped_char)

        # All allowed unicode characters + escaped special characters
        chars = chars_pattern(' ')
        # Same as chars, but the space character is not allowed. The range
        # starts at '!' (U+0021), the character just after the space.
        chars_no_space = chars_pattern('!')
        # No leading zeros allowed for integers
        integer = r'(?:[1-9][0-9]*)|0'
        # No leading zeros, except for numbers in (0, 1) and no trailing zeros
        # for the fractional part
        frac = r'\.[0-9]*[1-9]'
        number = r'(?:[1-9][0-9]*(?:{0})?)|(?:0{0})|(?:0)'.format(frac)
        c = re.compile

        # A step of the form /integer
        self.step_pat = c(r'/(%s)' % integer)
        # An id assertion of the form [characters]
        self.id_assertion_pat = c(r'\[(%s)\]' % chars)

        # A text offset of the form :integer
        self.text_offset_pat = c(r':(%s)' % integer)
        # A temporal offset of the form ~number
        self.temporal_offset_pat = c(r'~(%s)' % number)
        # A spatial offset of the form @number:number
        self.spatial_offset_pat = c(r'@({0}):({0})'.format(number))
        # A spatio-temporal offset of the form ~number@number:number
        self.st_offset_pat = c(r'~({0})@({0}):({0})'.format(number))

        # Text assertion patterns
        # before[,after]
        self.ta1_pat = c(r'(%s)(?:,(%s))?' % (chars, chars))
        # ,after
        self.ta2_pat = c(r',(%s)' % chars)
        # A single parameter of the form ;name=value[,value...]
        parameter = r';(%s)=((?:%s,?)+)' % (chars_no_space, chars)
        self.parameter_pat = c(parameter)
        # A run of one or more parameters, used to find where they end
        self.parameters_pat = c(r'(?:%s)+' % parameter)
        # A single item of a comma separated list of values
        self.csv_pat = c(r'(%s),?' % chars)

        # Unescape characters
        unescape_pat = c(r'\^(%s)' % special_char)

        def unescape(x):
            # Replace every ^X escape sequence with the bare X
            return unescape_pat.sub(r'\1', x)
        self.unescape = unescape

    def parse_epubcfi(self, raw):
        ''' Parse a full epubcfi of the form epubcfi(path [ , path , path ])

        Returns the tuple (parent_cfi, start_cfi, end_cfi, leftover). The
        three cfis are path dicts, start_cfi and end_cfi are empty dicts
        when there is no range. If raw is not a valid epubcfi, the tuple
        ({}, {}, {}, raw) is returned, with raw unchanged. '''
        null = {}, {}, {}, raw
        if not raw.startswith('epubcfi('):
            return null
        raw = raw[len('epubcfi('):]
        parent_cfi, raw = self.parse_path(raw)
        if not parent_cfi:
            return null
        start_cfi, end_cfi = {}, {}
        if raw.startswith(','):
            start_cfi, raw = self.parse_path(raw[1:])
            if raw.startswith(','):
                end_cfi, raw = self.parse_path(raw[1:])
            # A range needs both a start and an end
            if not start_cfi or not end_cfi:
                return null
        if not raw.startswith(')'):
            return null
        return parent_cfi, start_cfi, end_cfi, raw[1:]

    def parse_path(self, raw):
        ''' Parse the path component of an epubcfi of the form /step...

        Returns (path, leftover) where path is a path dict, or an empty
        dict if raw does not start with a step. '''
        path = {'steps': []}
        raw = self._parse_path(raw, path)
        if not path['steps']:
            path = {}
        return path, raw

    def do_match(self, pat, raw):
        ''' Match pat at the start of raw. Returns (match, rest), where rest
        is raw with the matched text removed. If there is no match, returns
        (None, raw). '''
        m = pat.match(raw)
        if m is not None:
            raw = raw[m.end():]
        return m, raw

    def _parse_path(self, raw, ans):
        ''' Parse steps from raw into ans['steps'], following redirects into
        nested dicts stored under the 'redirect' key. Returns the unparsed
        remainder of raw. Implemented as a loop rather than with recursion,
        so that long paths cannot exhaust the recursion limit. '''
        while True:
            m, raw = self.do_match(self.step_pat, raw)
            if m is None:
                return raw
            step = {'num': int(m.group(1))}
            ans['steps'].append(step)
            m, raw = self.do_match(self.id_assertion_pat, raw)
            if m is not None:
                step['id'] = self.unescape(m.group(1))
            if raw.startswith('!'):
                # The steps after the ! belong to the redirected path
                redirect = {'steps': []}
                ans['redirect'] = redirect
                ans = redirect
                raw = raw[1:]
                continue
            remaining_raw = self.parse_offset(raw, step)
            if remaining_raw is not None:
                # An offset terminates the path
                return remaining_raw

    def parse_offset(self, raw, ans):
        ''' Parse an optional offset at the start of raw, storing it in the
        step dict ans. Returns the unparsed remainder of raw if an offset
        was found and None otherwise. '''
        m, raw = self.do_match(self.text_offset_pat, raw)
        if m is not None:
            ans['text_offset'] = int(m.group(1))
            return self.parse_text_assertion(raw, ans)
        # The spatio-temporal form must be tried before the plain temporal
        # form, as the latter matches a prefix of the former
        m, raw = self.do_match(self.st_offset_pat, raw)
        if m is not None:
            t, x, y = m.groups()
            ans['temporal_offset'] = float(t)
            ans['spatial_offset'] = (float(x), float(y))
            return raw
        m, raw = self.do_match(self.temporal_offset_pat, raw)
        if m is not None:
            ans['temporal_offset'] = float(m.group(1))
            return raw
        m, raw = self.do_match(self.spatial_offset_pat, raw)
        if m is not None:
            x, y = m.groups()
            ans['spatial_offset'] = (float(x), float(y))
            return raw
        return None

    def parse_text_assertion(self, raw, ans):
        ''' Parse an optional text assertion of the form
        [before,after;name=value,value...] at the start of raw, storing it
        in ans['text_assertion']. Returns the unparsed remainder of raw. If
        the assertion is malformed nothing is stored and raw is returned
        unchanged. '''
        oraw = raw
        if not raw.startswith('['):
            return oraw
        raw = raw[1:]
        ta = {}
        m, raw = self.do_match(self.ta1_pat, raw)
        if m is not None:
            before, after = m.groups()
            ta['before'] = self.unescape(before)
            if after is not None:
                ta['after'] = self.unescape(after)
        else:
            m, raw = self.do_match(self.ta2_pat, raw)
            if m is not None:
                ta['after'] = self.unescape(m.group(1))

        # parse parameters
        m, raw = self.do_match(self.parameters_pat, raw)
        if m is not None:
            params = {}
            for pm in self.parameter_pat.finditer(m.group()):
                name, value = pm.groups()
                params[name] = tuple(
                    self.unescape(v.group(1))
                    for v in self.csv_pat.finditer(value))
            if params:
                ta['params'] = params

        if not raw.startswith(']'):
            return oraw  # no closing ] or extra content in the assertion

        if ta:
            ans['text_assertion'] = ta
        return raw[1:]


_parser = None


def parser():
    ''' Return a shared Parser instance, creating it on first use. '''
    global _parser
    if _parser is None:
        _parser = Parser()
    return _parser


class Tests(unittest.TestCase):

    def test_parsing(self):
        p = parser()

        def step(x):
            if isinstance(x, int):
                return {'num': x}
            return {'num': x[0], 'id': x[1]}

        def s(*args):
            # A path made up of only steps
            return {'steps': [step(x) for x in args]}

        def r(*args):
            # A path with a redirect at the position of the '!'
            idx = args.index('!')
            ans = s(*args[:idx])
            ans['redirect'] = s(*args[idx+1:])
            return ans

        def o(*args):
            # The path /1 with one or two offsets on its only step
            ans = s(1)
            last = ans['steps'][-1]
            typ, val = args[:2]
            last[{'@': 'spatial_offset', '~': 'temporal_offset',
                  ':': 'text_offset'}[typ]] = val
            if len(args) == 4:
                typ, val = args[2:]
                last[{'@': 'spatial_offset',
                      '~': 'temporal_offset'}[typ]] = val
            return ans

        def a(before=None, after=None, **params):
            # The path /1:3 with a text assertion
            ans = o(':', 3)
            last = ans['steps'][-1]
            ta = {}
            if before is not None:
                ta['before'] = before
            if after is not None:
                ta['after'] = after
            if params:
                ta['params'] = {
                    k: v if isinstance(v, tuple) else (v,)
                    for k, v in params.items()}
            if ta:
                last['text_assertion'] = ta
            return ans

        for raw, path, leftover in [
            # Test parsing of steps
            ('/2', s(2), ''),
            ('/2/3/4', s(2, 3, 4), ''),
            ('/1/2[some^,^^id]/3', s(1, (2, 'some,^id'), 3), ''),
            ('/1/2!/3/4', r(1, 2, '!', 3, 4), ''),
            ('/1/2[id]!/3/4', r(1, (2, 'id'), '!', 3, 4), ''),
            ('/1!/2[id]/3/4', r(1, '!', (2, 'id'), 3, 4), ''),

            # Test parsing of offsets
            ('/1~0', o('~', 0), ''),
            ('/1~7', o('~', 7), ''),
            ('/1~43.1', o('~', 43.1), ''),
            ('/1~0.01', o('~', 0.01), ''),
            ('/1~1.301', o('~', 1.301), ''),
            ('/1@23:34.1', o('@', (23, 34.1)), ''),
            ('/1~3@3.1:2.3', o('~', 3.0, '@', (3.1, 2.3)), ''),
            ('/1:0', o(':', 0), ''),
            ('/1:3', o(':', 3), ''),

            # Test parsing of text assertions
            ('/1:3[aa^,b]', a('aa,b'), ''),
            ('/1:3[aa^,b,c1]', a('aa,b', 'c1'), ''),
            ('/1:3[,aa^,b]', a(after='aa,b'), ''),
            ('/1:3[;s=a]', a(s='a'), ''),
            ('/1:3[a;s=a]', a('a', s='a'), ''),
            ('/1:3[a;s=a^,b,c^;d;x=y]', a('a', s=('a,b', 'c;d'), x='y'), ''),

            # Spaces are not allowed in parameter names, so the whole
            # assertion is rejected and left unparsed
            ('/1:3[a;s x=y]', o(':', 3), '[a;s x=y]'),

        ]:
            self.assertEqual(p.parse_path(raw), (path, leftover))

    def test_long_path(self):
        # Long paths must not run into the recursion limit
        path, leftover = parser().parse_path('/2' * 2000)
        self.assertEqual(len(path['steps']), 2000)
        self.assertEqual(leftover, '')

    def test_epubcfi(self):
        p = parser()
        self.assertEqual(
            p.parse_epubcfi('epubcfi(/6/4[chap01ref]!/4[body01],/2/1:1,/3:4)'),
            (
                {'steps': [{'num': 6}, {'num': 4, 'id': 'chap01ref'}],
                 'redirect': {'steps': [{'num': 4, 'id': 'body01'}]}},
                {'steps': [{'num': 2}, {'num': 1, 'text_offset': 1}]},
                {'steps': [{'num': 3, 'text_offset': 4}]},
                '',
            ))
        for bad in ('/1/2', 'epubcfi(/1,/2)', 'epubcfi(/1'):
            self.assertEqual(p.parse_epubcfi(bad), ({}, {}, {}, bad))


if __name__ == '__main__':
    suite = unittest.TestLoader().loadTestsFromTestCase(Tests)
    unittest.TextTestRunner(verbosity=2).run(suite)