mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speed up parsing some more by using a faster stream class
This commit is contained in:
parent
d93467b3a3
commit
0d1c917281
@ -8,10 +8,11 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import copy, re, warnings
|
||||
from functools import partial
|
||||
from bisect import bisect
|
||||
|
||||
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
|
||||
|
||||
from html5lib.constants import namespaces, tableInsertModeElements
|
||||
from html5lib.constants import namespaces, tableInsertModeElements, EOF
|
||||
from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
|
||||
from html5lib.ihatexml import InfosetFilter, DataLossWarning
|
||||
from html5lib.html5parser import HTMLParser
|
||||
@ -400,11 +401,76 @@ class NoNamespaceTreeBuilder(TreeBuilder):
|
||||
except ValueError:
|
||||
html.set(to_xml_name(k), v)
|
||||
|
||||
def parse(raw, decoder=None, log=None, discard_namespaces=False):
|
||||
_regex_cache = {}
|
||||
|
||||
class FastStream(object):
|
||||
|
||||
__slots__ = ('raw', 'pos', 'errors', 'new_lines', 'track_position', 'charEncoding')
|
||||
|
||||
def __init__(self, raw, track_position=False):
|
||||
self.raw = raw
|
||||
self.pos = 0
|
||||
self.errors = []
|
||||
self.charEncoding = ("utf-8", "certain")
|
||||
self.track_position = track_position
|
||||
if track_position:
|
||||
self.new_lines = tuple(m.start() for m in re.finditer(r'\n', raw))
|
||||
|
||||
def reset(self):
|
||||
self.pos = 0
|
||||
|
||||
def char(self):
|
||||
try:
|
||||
ans = self.raw[self.pos]
|
||||
except IndexError:
|
||||
return EOF
|
||||
self.pos += 1
|
||||
return ans
|
||||
|
||||
def unget(self, char):
|
||||
if char is not None:
|
||||
self.pos = max(0, self.pos - 1)
|
||||
|
||||
def charsUntil(self, characters, opposite=False):
|
||||
# Use a cache of regexps to find the required characters
|
||||
try:
|
||||
chars = _regex_cache[(characters, opposite)]
|
||||
except KeyError:
|
||||
regex = "".join(["\\x%02x" % ord(c) for c in characters])
|
||||
if not opposite:
|
||||
regex = "^%s" % regex
|
||||
chars = _regex_cache[(characters, opposite)] = re.compile("[%s]+" % regex)
|
||||
|
||||
# Find the longest matching prefix
|
||||
m = chars.match(self.raw, self.pos)
|
||||
if m is None:
|
||||
return ''
|
||||
self.pos = m.end()
|
||||
return m.group()
|
||||
|
||||
def position(self):
|
||||
if not self.track_position:
|
||||
return (-1, -1)
|
||||
lnum = bisect(self.new_lines, self.pos)
|
||||
if lnum == 0:
|
||||
return (1, self.pos)
|
||||
return (lnum, self.pos - self.new_lines[lnum - 1])
|
||||
|
||||
if len("\U0010FFFF") == 1: # UCS4 build
|
||||
replace_chars = re.compile("[\uD800-\uDFFF]")
|
||||
else:
|
||||
replace_chars = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
|
||||
|
||||
def parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True):
|
||||
if isinstance(raw, bytes):
|
||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
|
||||
raw = xml_replace_entities(raw)
|
||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||
raw = replace_chars.sub('', raw)
|
||||
|
||||
stream_class = partial(FastStream, track_position=line_numbers)
|
||||
stream = stream_class(raw)
|
||||
builder = NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder
|
||||
while True:
|
||||
try:
|
||||
@ -412,11 +478,12 @@ def parse(raw, decoder=None, log=None, discard_namespaces=False):
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter('ignore', category=DataLossWarning)
|
||||
try:
|
||||
parser.parse(raw, parseMeta=False, useChardet=False)
|
||||
parser.parse(stream, parseMeta=False, useChardet=False)
|
||||
finally:
|
||||
parser.tree.proxy_cache = None
|
||||
except NamespacedHTMLPresent as err:
|
||||
raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
|
||||
stream = stream_class(raw)
|
||||
continue
|
||||
break
|
||||
root = parser.tree.getDocument()
|
||||
|
@ -27,7 +27,7 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
|
||||
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
|
||||
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
||||
|
||||
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
|
||||
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") # noqa
|
||||
|
||||
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||
@ -118,6 +118,10 @@ class BufferedStream(object):
|
||||
|
||||
|
||||
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
|
||||
if (hasattr(source, 'unget') and hasattr(source, 'charsUntil') and
|
||||
hasattr(source, 'position') and hasattr(source, 'char') and
|
||||
hasattr(source, 'reset') and hasattr(source, 'errors')):
|
||||
return source
|
||||
if hasattr(source, "read"):
|
||||
isUnicode = isinstance(source.read(0), text_type)
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user