diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py index 01f893378c..7cd9035775 100644 --- a/src/calibre/ebooks/oeb/polish/parsing.py +++ b/src/calibre/ebooks/oeb/polish/parsing.py @@ -8,10 +8,11 @@ __copyright__ = '2013, Kovid Goyal ' import copy, re, warnings from functools import partial +from bisect import bisect from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase -from html5lib.constants import namespaces, tableInsertModeElements +from html5lib.constants import namespaces, tableInsertModeElements, EOF from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder from html5lib.ihatexml import InfosetFilter, DataLossWarning from html5lib.html5parser import HTMLParser @@ -400,11 +401,76 @@ class NoNamespaceTreeBuilder(TreeBuilder): except ValueError: html.set(to_xml_name(k), v) -def parse(raw, decoder=None, log=None, discard_namespaces=False): +_regex_cache = {} + +class FastStream(object): + + __slots__ = ('raw', 'pos', 'errors', 'new_lines', 'track_position', 'charEncoding') + + def __init__(self, raw, track_position=False): + self.raw = raw + self.pos = 0 + self.errors = [] + self.charEncoding = ("utf-8", "certain") + self.track_position = track_position + if track_position: + self.new_lines = tuple(m.start() for m in re.finditer(r'\n', raw)) + + def reset(self): + self.pos = 0 + + def char(self): + try: + ans = self.raw[self.pos] + except IndexError: + return EOF + self.pos += 1 + return ans + + def unget(self, char): + if char is not None: + self.pos = max(0, self.pos - 1) + + def charsUntil(self, characters, opposite=False): + # Use a cache of regexps to find the required characters + try: + chars = _regex_cache[(characters, opposite)] + except KeyError: + regex = "".join(["\\x%02x" % ord(c) for c in characters]) + if not opposite: + regex = "^%s" % regex + chars = _regex_cache[(characters, opposite)] = re.compile("[%s]+" % regex) + + # Find the longest matching prefix + m = chars.match(self.raw, self.pos) + if m is None: + return '' + self.pos = m.end() + return m.group() + + def position(self): + if not self.track_position: + return (-1, -1) + lnum = bisect(self.new_lines, self.pos) + if lnum == 0: + return (1, self.pos) + return (lnum, self.pos - self.new_lines[lnum - 1]) + +if len("\U0010FFFF") == 1: # UCS4 build + replace_chars = re.compile("[\uD800-\uDFFF]") +else: + replace_chars = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?", b"<"]) -invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") +invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") # noqa non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -118,6 +118,10 @@ class BufferedStream(object): def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): + if (hasattr(source, 'unget') and hasattr(source, 'charsUntil') and + hasattr(source, 'position') and hasattr(source, 'char') and + hasattr(source, 'reset') and hasattr(source, 'errors')): + return source if hasattr(source, "read"): isUnicode = isinstance(source.read(0), text_type) else: