Speed up parsing some more by using a faster stream class

This commit is contained in:
Kovid Goyal 2013-10-26 11:01:30 +05:30
parent d93467b3a3
commit 0d1c917281
2 changed files with 75 additions and 4 deletions

View File

@ -8,10 +8,11 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy, re, warnings import copy, re, warnings
from functools import partial from functools import partial
from bisect import bisect
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
from html5lib.constants import namespaces, tableInsertModeElements from html5lib.constants import namespaces, tableInsertModeElements, EOF
from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
from html5lib.ihatexml import InfosetFilter, DataLossWarning from html5lib.ihatexml import InfosetFilter, DataLossWarning
from html5lib.html5parser import HTMLParser from html5lib.html5parser import HTMLParser
@ -400,11 +401,76 @@ class NoNamespaceTreeBuilder(TreeBuilder):
except ValueError: except ValueError:
html.set(to_xml_name(k), v) html.set(to_xml_name(k), v)
def parse(raw, decoder=None, log=None, discard_namespaces=False): _regex_cache = {}
class FastStream(object):
__slots__ = ('raw', 'pos', 'errors', 'new_lines', 'track_position', 'charEncoding')
def __init__(self, raw, track_position=False):
self.raw = raw
self.pos = 0
self.errors = []
self.charEncoding = ("utf-8", "certain")
self.track_position = track_position
if track_position:
self.new_lines = tuple(m.start() for m in re.finditer(r'\n', raw))
def reset(self):
self.pos = 0
def char(self):
try:
ans = self.raw[self.pos]
except IndexError:
return EOF
self.pos += 1
return ans
def unget(self, char):
if char is not None:
self.pos = max(0, self.pos - 1)
def charsUntil(self, characters, opposite=False):
# Use a cache of regexps to find the required characters
try:
chars = _regex_cache[(characters, opposite)]
except KeyError:
regex = "".join(["\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = "^%s" % regex
chars = _regex_cache[(characters, opposite)] = re.compile("[%s]+" % regex)
# Find the longest matching prefix
m = chars.match(self.raw, self.pos)
if m is None:
return ''
self.pos = m.end()
return m.group()
def position(self):
if not self.track_position:
return (-1, -1)
lnum = bisect(self.new_lines, self.pos)
if lnum == 0:
return (1, self.pos)
return (lnum, self.pos - self.new_lines[lnum - 1])
if len("\U0010FFFF") == 1: # UCS4 build
replace_chars = re.compile("[\uD800-\uDFFF]")
else:
replace_chars = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
def parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True):
if isinstance(raw, bytes): if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
raw = xml_replace_entities(raw) raw = xml_replace_entities(raw)
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
raw = replace_chars.sub('', raw)
stream_class = partial(FastStream, track_position=line_numbers)
stream = stream_class(raw)
builder = NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder builder = NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder
while True: while True:
try: try:
@ -412,11 +478,12 @@ def parse(raw, decoder=None, log=None, discard_namespaces=False):
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter('ignore', category=DataLossWarning) warnings.simplefilter('ignore', category=DataLossWarning)
try: try:
parser.parse(raw, parseMeta=False, useChardet=False) parser.parse(stream, parseMeta=False, useChardet=False)
finally: finally:
parser.tree.proxy_cache = None parser.tree.proxy_cache = None
except NamespacedHTMLPresent as err: except NamespacedHTMLPresent as err:
raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I) raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
stream = stream_class(raw)
continue continue
break break
root = parser.tree.getDocument() root = parser.tree.getDocument()

View File

@ -27,7 +27,7 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase]) asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") # noqa
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@ -118,6 +118,10 @@ class BufferedStream(object):
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
if (hasattr(source, 'unget') and hasattr(source, 'charsUntil') and
hasattr(source, 'position') and hasattr(source, 'char') and
hasattr(source, 'reset') and hasattr(source, 'errors')):
return source
if hasattr(source, "read"): if hasattr(source, "read"):
isUnicode = isinstance(source.read(0), text_type) isUnicode = isinstance(source.read(0), text_type)
else: else: