2015-11-12 14:56:33 +05:30

469 lines
23 KiB
Plaintext

# vim:fileencoding=utf-8
# License: BSD
# Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
# Copyright: 2013, Alexander Tsepkov
# globals: _$rapyd$_iterator_symbol, _$rapyd$_list_decorate
# basic implementation of Python's 're' library
# Alias DB from http://www.unicode.org/Public/8.0.0/ucd/NameAliases.txt {{{
_ALIAS_MAP = {"null":0,"nul":0,"start of heading":1,"soh":1,"start of text":2,"stx":2,"end of text":3,"etx":3,"end of transmission":4,"eot":4,"enquiry":5,"enq":5,"acknowledge":6,"ack":6,"alert":7,"bel":7,"backspace":8,"bs":8,"character tabulation":9,"horizontal tabulation":9,"ht":9,"tab":9,"line feed":10,"new line":10,"end of line":10,"lf":10,"nl":10,"eol":10,"line tabulation":11,"vertical tabulation":11,"vt":11,"form feed":12,"ff":12,"carriage return":13,"cr":13,"shift out":14,"locking-shift one":14,"so":14,"shift in":15,"locking-shift zero":15,"si":15,"data link escape":16,"dle":16,"device control one":17,"dc1":17,"device control two":18,"dc2":18,"device control three":19,"dc3":19,"device control four":20,"dc4":20,"negative acknowledge":21,"nak":21,"synchronous idle":22,"syn":22,"end of transmission block":23,"etb":23,"cancel":24,"can":24,"end of medium":25,"eom":25,"substitute":26,"sub":26,"escape":27,"esc":27,"information separator four":28,"file separator":28,"fs":28,"information separator three":29,"group separator":29,"gs":29,"information separator two":30,"record separator":30,"rs":30,"information separator one":31,"unit separator":31,"us":31,"sp":32,"delete":127,"del":127,"padding character":128,"pad":128,"high octet preset":129,"hop":129,"break permitted here":130,"bph":130,"no break here":131,"nbh":131,"index":132,"ind":132,"next line":133,"nel":133,"start of selected area":134,"ssa":134,"end of selected area":135,"esa":135,"character tabulation set":136,"horizontal tabulation set":136,"hts":136,"character tabulation with justification":137,"horizontal tabulation with justification":137,"htj":137,"line tabulation set":138,"vertical tabulation set":138,"vts":138,"partial line forward":139,"partial line down":139,"pld":139,"partial line backward":140,"partial line up":140,"plu":140,"reverse line feed":141,"reverse index":141,"ri":141,"single shift two":142,"single-shift-2":142,"ss2":142,"single shift three":143,"single-shift-3":143,"ss3":143,"device 
control string":144,"dcs":144,"private use one":145,"private use-1":145,"pu1":145,"private use two":146,"private use-2":146,"pu2":146,"set transmit state":147,"sts":147,"cancel character":148,"cch":148,"message waiting":149,"mw":149,"start of guarded area":150,"start of protected area":150,"spa":150,"end of guarded area":151,"end of protected area":151,"epa":151,"start of string":152,"sos":152,"single graphic character introducer":153,"sgc":153,"single character introducer":154,"sci":154,"control sequence introducer":155,"csi":155,"string terminator":156,"st":156,"operating system command":157,"osc":157,"privacy message":158,"pm":158,"application program command":159,"apc":159,"nbsp":160,"shy":173,"latin capital letter gha":418,"latin small letter gha":419,"cgj":847,"alm":1564,"syriac sublinear colon skewed left":1801,"kannada letter llla":3294,"lao letter fo fon":3741,"lao letter fo fay":3743,"lao letter ro":3747,"lao letter lo":3749,"tibetan mark bka- shog gi mgo rgyan":4048,"fvs1":6155,"fvs2":6156,"fvs3":6157,"mvs":6158,"zwsp":8203,"zwnj":8204,"zwj":8205,"lrm":8206,"rlm":8207,"lre":8234,"rle":8235,"pdf":8236,"lro":8237,"rlo":8238,"nnbsp":8239,"mmsp":8287,"wj":8288,"lri":8294,"rli":8295,"fsi":8296,"pdi":8297,"weierstrass elliptic function":8472,"micr on us symbol":9288,"micr dash symbol":9289,"leftwards triangle-headed arrow with double vertical stroke":11130,"rightwards triangle-headed arrow with double vertical stroke":11132,"yi syllable iteration mark":40981,"presentation form for vertical right white lenticular bracket":65048,"vs1":65024,"vs2":65025,"vs3":65026,"vs4":65027,"vs5":65028,"vs6":65029,"vs7":65030,"vs8":65031,"vs9":65032,"vs10":65033,"vs11":65034,"vs12":65035,"vs13":65036,"vs14":65037,"vs15":65038,"vs16":65039,"byte order mark":65279,"bom":65279,"zwnbsp":65279,"cuneiform sign nu11 tenu":74452,"cuneiform sign nu11 over nu11 bur over bur":74453,"byzantine musical symbol fthora skliron chroma 
vasis":118981,"vs17":917760,"vs18":917761,"vs19":917762,"vs20":917763,"vs21":917764,"vs22":917765,"vs23":917766,"vs24":917767,"vs25":917768,"vs26":917769,"vs27":917770,"vs28":917771,"vs29":917772,"vs30":917773,"vs31":917774,"vs32":917775,"vs33":917776,"vs34":917777,"vs35":917778,"vs36":917779,"vs37":917780,"vs38":917781,"vs39":917782,"vs40":917783,"vs41":917784,"vs42":917785,"vs43":917786,"vs44":917787,"vs45":917788,"vs46":917789,"vs47":917790,"vs48":917791,"vs49":917792,"vs50":917793,"vs51":917794,"vs52":917795,"vs53":917796,"vs54":917797,"vs55":917798,"vs56":917799,"vs57":917800,"vs58":917801,"vs59":917802,"vs60":917803,"vs61":917804,"vs62":917805,"vs63":917806,"vs64":917807,"vs65":917808,"vs66":917809,"vs67":917810,"vs68":917811,"vs69":917812,"vs70":917813,"vs71":917814,"vs72":917815,"vs73":917816,"vs74":917817,"vs75":917818,"vs76":917819,"vs77":917820,"vs78":917821,"vs79":917822,"vs80":917823,"vs81":917824,"vs82":917825,"vs83":917826,"vs84":917827,"vs85":917828,"vs86":917829,"vs87":917830,"vs88":917831,"vs89":917832,"vs90":917833,"vs91":917834,"vs92":917835,"vs93":917836,"vs94":917837,"vs95":917838,"vs96":917839,"vs97":917840,"vs98":917841,"vs99":917842,"vs100":917843,"vs101":917844,"vs102":917845,"vs103":917846,"vs104":917847,"vs105":917848,"vs106":917849,"vs107":917850,"vs108":917851,"vs109":917852,"vs110":917853,"vs111":917854,"vs112":917855,"vs113":917856,"vs114":917857,"vs115":917858,"vs116":917859,"vs117":917860,"vs118":917861,"vs119":917862,"vs120":917863,"vs121":917864,"vs122":917865,"vs123":917866,"vs124":917867,"vs125":917868,"vs126":917869,"vs127":917870,"vs128":917871,"vs129":917872,"vs130":917873,"vs131":917874,"vs132":917875,"vs133":917876,"vs134":917877,"vs135":917878,"vs136":917879,"vs137":917880,"vs138":917881,"vs139":917882,"vs140":917883,"vs141":917884,"vs142":917885,"vs143":917886,"vs144":917887,"vs145":917888,"vs146":917889,"vs147":917890,"vs148":917891,"vs149":917892,"vs150":917893,"vs151":917894,"vs152":917895,"vs153":917896,"vs154":917897
,"vs155":917898,"vs156":917899,"vs157":917900,"vs158":917901,"vs159":917902,"vs160":917903,"vs161":917904,"vs162":917905,"vs163":917906,"vs164":917907,"vs165":917908,"vs166":917909,"vs167":917910,"vs168":917911,"vs169":917912,"vs170":917913,"vs171":917914,"vs172":917915,"vs173":917916,"vs174":917917,"vs175":917918,"vs176":917919,"vs177":917920,"vs178":917921,"vs179":917922,"vs180":917923,"vs181":917924,"vs182":917925,"vs183":917926,"vs184":917927,"vs185":917928,"vs186":917929,"vs187":917930,"vs188":917931,"vs189":917932,"vs190":917933,"vs191":917934,"vs192":917935,"vs193":917936,"vs194":917937,"vs195":917938,"vs196":917939,"vs197":917940,"vs198":917941,"vs199":917942,"vs200":917943,"vs201":917944,"vs202":917945,"vs203":917946,"vs204":917947,"vs205":917948,"vs206":917949,"vs207":917950,"vs208":917951,"vs209":917952,"vs210":917953,"vs211":917954,"vs212":917955,"vs213":917956,"vs214":917957,"vs215":917958,"vs216":917959,"vs217":917960,"vs218":917961,"vs219":917962,"vs220":917963,"vs221":917964,"vs222":917965,"vs223":917966,"vs224":917967,"vs225":917968,"vs226":917969,"vs227":917970,"vs228":917971,"vs229":917972,"vs230":917973,"vs231":917974,"vs232":917975,"vs233":917976,"vs234":917977,"vs235":917978,"vs236":917979,"vs237":917980,"vs238":917981,"vs239":917982,"vs240":917983,"vs241":917984,"vs242":917985,"vs243":917986,"vs244":917987,"vs245":917988,"vs246":917989,"vs247":917990,"vs248":917991,"vs249":917992,"vs250":917993,"vs251":917994,"vs252":917995,"vs253":917996,"vs254":917997,"vs255":917998,"vs256":917999}
# }}}
# Single-letter backslash escapes understood in replacement templates, mapped
# to the character code they stand for (e.g. 'n' -> 10, line feed).
_ASCII_CONTROL_CHARS = {'a':7, 'b':8, 'f': 12, 'n': 10, 'r': 13, 't': 9, 'v': 11}
# One-character classifiers used while scanning a replacement template. Each
# is anchored with ^ so it tests only the first character of its argument.
_HEX_PAT = /^[a-fA-F0-9]/
_NUM_PAT = /^[0-9]/
# Extracts the text between < and > in a \g<...> group reference. Note that
# this pattern is NOT anchored to the start of its input.
_GROUP_PAT = /<([^>]+)>/
# Characters accepted inside a \N{...} character name: ASCII letters and space.
_NAME_PAT = /^[a-zA-Z ]/
# Flag bits. Each flag has a one-letter alias and a long name bound to the same
# value; the values are distinct powers of two so they can be OR-ed together.
# NOTE(review): Python's re module spells the DOTALL alias ``S``; here it is
# ``D`` -- confirm whether an ``S`` alias is wanted.
I = IGNORECASE = 2
L = LOCALE = 4
M = MULTILINE = 8
D = DOTALL = 16
U = UNICODE = 32
X = VERBOSE = 64
DEBUG = 128
A = ASCII = 256
# Feature test: true when this JavaScript engine exposes the ``unicode``
# property on RegExp.prototype. RegexObject only adds the 'u' modifier when
# this is true.
supports_unicode = RegExp.prototype.unicode is not undefined
# Matches every character that escape() prefixes with a backslash.
_RE_ESCAPE = /[-\/\\^$*+?.()|[\]{}]/g
# Cache of compiled patterns: key -> RegexObject, plus the keys in insertion
# order so that the oldest entry can be evicted first.
_re_cache_map = {}
_re_cache_items = v'[]'
error = SyntaxError # This is the error JS throws for invalid regexps
def _expand(groups, repl, group_name_map):
    # Expand the backslash escapes in the replacement template ``repl``.
    #
    # groups:         array-like of captured strings (index 0 is the whole
    #                 match), as returned by RegExp.prototype.exec
    # repl:           the template string
    # group_name_map: maps a group name to the list of group numbers that
    #                 carry that name
    #
    # Returns the expanded string. Supported escapes: \\ \' \" the control
    # letters in _ASCII_CONTROL_CHARS, \<number>, \g<name-or-number>, \xHH,
    # \uHHHH, \UHHHHHHHH and \N{alias}. Anything unrecognised is emitted
    # unchanged, backslash included. References to groups that do not exist or
    # did not participate in the match expand to the empty string.
    i = 0

    def next():
        nonlocal i
        return v'repl[i++]'

    def peek():
        return repl[i]

    def at_end():
        # peek() yields undefined past the end of the template, and
        # RegExp.test() would coerce that to the string "undefined" -- which
        # some of the classifier patterns accept. Always check this first.
        return i >= repl.length

    def read_digits(count, pat, base, maxval, prefix):
        # Read up to ``count`` characters accepted by ``pat``. Returns the
        # parsed number, or the raw text consumed so far (a string) when too
        # few digits were present or the value exceeds ``maxval``.
        ans = prefix or ''
        greedy = count == Number.MAX_VALUE
        while count > 0:
            count -= 1
            if at_end() or not pat.test(peek()):
                if greedy:
                    break
                return ans
            ans += next()
        nval = parseInt(ans, base)
        if nval > maxval:
            return ans
        return nval

    def read_escape_sequence():
        nonlocal i
        q = next()
        if not q or q == '\\':
            return '\\'
        if '"\''.indexOf(q) != -1:
            return q
        if _ASCII_CONTROL_CHARS.hasOwnProperty(q):
            return String.fromCharCode(_ASCII_CONTROL_CHARS[q])
        if '0' <= q <= '9':
            ans = read_digits(Number.MAX_VALUE, _NUM_PAT, 10, Number.MAX_VALUE, q)
            if type(ans) == 'number':
                return groups[ans] or ''
            return '\\' + ans
        if q == 'g':
            m = _GROUP_PAT.exec(repl[i:])
            # _GROUP_PAT is not anchored, so insist that the <...> starts
            # right here; otherwise a bracket later in the template would be
            # picked up and the cursor advanced by the wrong amount.
            if m is not None and m.index == 0:
                i += m[0].length
                gn = m[1]
                if isNaN(parseInt(gn, 10)):
                    if not Object.prototype.hasOwnProperty.call(group_name_map, gn):
                        return ''
                    gn = group_name_map[gn][-1]
                return groups[gn] or ''
        if q == 'x':
            code = read_digits(2, _HEX_PAT, 16, 0x10FFFF)
            if type(code) == 'number':
                return String.fromCharCode(code)
            return '\\x' + code
        if q == 'u':
            code = read_digits(4, _HEX_PAT, 16, 0x10FFFF)
            if type(code) == 'number':
                return String.fromCharCode(code)
            return '\\u' + code
        if q == 'U':
            code = read_digits(8, _HEX_PAT, 16, 0x10FFFF)
            if type(code) == 'number':
                if code <= 0xFFFF:
                    return String.fromCharCode(code)
                # Encode astral code points as a UTF-16 surrogate pair
                code -= 0x10000
                return String.fromCharCode(0xD800+(code>>10), 0xDC00+(code&0x3FF))
            return '\\U' + code
        if q == 'N' and peek() == '{':
            next()
            name = ''
            # Alias names contain letters, spaces, digits and hyphens
            # (e.g. "vs1", "single-shift-2"). Stop at the end of the template
            # instead of reading past it.
            while not at_end():
                c = peek()
                if not (_NAME_PAT.test(c) or _NUM_PAT.test(c) or c == '-'):
                    break
                name += next()
            if peek() != '}':
                return '\\N{' + name
            next()
            key = (name or '').toLowerCase()
            if not name or not Object.prototype.hasOwnProperty.call(_ALIAS_MAP, key):
                return '\\N{' + name + '}'
            code = _ALIAS_MAP[key]
            if code <= 0xFFFF:
                return String.fromCharCode(code)
            code -= 0x10000
            return String.fromCharCode(0xD800+(code>>10), 0xDC00+(code&0x3FF))
        return '\\' + q

    ans = ch = ''
    while (ch = next()):
        if ch == '\\':
            ans += read_escape_sequence()
        else:
            ans += ch
    return ans
def transform_regex(source, flags):
    # Translate a Python-syntax regular expression into JavaScript syntax.
    #
    # source: the pattern text
    # flags:  initial flag bits (may be undefined/0); inline flag groups such
    #         as (?im) found in the pattern are OR-ed into it
    #
    # Returns [js_source, flags, group_map] where group_map maps each group
    # name to the list of group numbers defined with that name.
    #
    # Raises SyntaxError for unterminated constructs, unknown inline flags,
    # references to undefined named groups and for Python features that have
    # no JavaScript equivalent (look-behind, group existence tests).
    pos = 0
    previous_backslash = in_class = False
    ans = ''
    group_map = {}
    flags = flags or 0
    group_count = 0
    while pos < source.length:
        ch = v'source[pos++]'
        if previous_backslash:
            ans += '\\' + ch
            previous_backslash = False
            continue
        # Escapes must be recognised before the character-class check,
        # otherwise the ] in [\]] is taken as the end of the class and
        # everything after it is mis-tracked (groups counted, dots rewritten).
        if ch == '\\':
            previous_backslash = True
            continue
        if in_class:
            if ch == ']':
                in_class = False
            ans += ch
            continue
        if ch == '[':
            in_class = True
            if source[pos] == ']':  # in python the empty set is not allowed, instead []] is the same as [\]]
                pos += 1
                ch = r'[\]'
        elif ch == '(':
            if source[pos] == '?':
                extension = source[pos + 1]
                if extension == '#':
                    # (?#comment) -- dropped from the output entirely
                    close = source.indexOf(')', pos + 1)
                    if close == -1:
                        # SyntaxError, like every other malformed-pattern
                        # report in this function (and the module's ``error``)
                        raise SyntaxError('Expecting a closing )')
                    pos = close + 1
                    continue
                if 'aiLmsux'.indexOf(extension) != -1:
                    # (?flags) -- folded into the returned flags
                    flag_map = {'a':ASCII, 'i':IGNORECASE, 'L':LOCALE, 'm':MULTILINE, 's':DOTALL, 'u':UNICODE, 'x':VERBOSE}
                    close = source.indexOf(')', pos + 1)
                    if close == -1:
                        raise SyntaxError('Expecting a closing )')
                    flgs = source[pos+1:close]
                    for v'var i = 0; i < flgs.length; i++':
                        q = flgs[i]  # noqa:undef
                        if not flag_map.hasOwnProperty(q):
                            raise SyntaxError('Invalid flag: ' + q)
                        flags |= flag_map[q]
                    pos = close + 1
                    continue
                if extension == '<':
                    raise SyntaxError('Look behind assertions are not supported in JavaScript')
                if extension == '(':
                    raise SyntaxError('Group existence assertions are not supported in JavaScript')
                if extension == 'P':
                    pos += 2
                    q = source[pos]
                    if q == '<':
                        # (?P<name>...) becomes a plain capturing group; the
                        # name -> number association is recorded in group_map
                        close = source.indexOf('>', pos)
                        if close == -1:
                            raise SyntaxError('Named group not closed, expecting >')
                        name = source[pos+1:close]
                        if not Object.prototype.hasOwnProperty.call(group_map, name):
                            group_map[name] = v'[]'
                        group_map[name].push(v'++group_count')
                        pos = close + 1
                    elif q == '=':
                        # (?P=name) becomes a numeric back-reference
                        close = source.indexOf(')', pos)
                        if close == -1:
                            raise SyntaxError('Named group back-reference not closed, expecting a )')
                        name = source[pos+1:close]
                        if not isNaN(parseInt(name, 10)):
                            ans += '\\' + name
                        else:
                            if not Object.prototype.hasOwnProperty.call(group_map, name):
                                raise SyntaxError('Invalid back-reference. The named group: ' + name + ' has not yet been defined.')
                            ans += '\\' + group_map[name][-1]
                        pos = close + 1
                        continue
                    else:
                        raise SyntaxError('Expecting < or = after (?P')
            else:
                group_count += 1
        elif ch == '.' and (flags & DOTALL):
            ans += r'[\s\S]'  # JavaScript has no DOTALL
            continue
        ans += ch
    return ans, flags, group_map
class MatchObject:
    # The result of a successful match, wrapping the array returned by
    # RegExp.prototype.exec.
    #
    # Attributes:
    #   re      the RegexObject that produced the match
    #   string  the string that was searched (``match.input``)
    #   pos, endpos  the search bounds that were passed to the RegexObject

    def __init__(self, regex, match, pos, endpos):
        self.re = regex
        self.string = match.input
        self._start_pos = match.index
        self._groups = match
        self.pos, self.endpos = pos, endpos

    def _compute_extents(self):
        # Compute start/end for each group.
        #
        # JavaScript only reports the position of the whole match, so group
        # positions are recovered heuristically by locating each group's text
        # inside the matched text. Groups that did not participate in the
        # match are left undefined, which start()/end() report as -1.
        match = self._groups
        self._start = v'Array(match.length)'
        self._end = v'Array(match.length)'
        self._start[0] = self._start_pos
        self._end[0] = self._start_pos + match[0].length
        offset = self._start_pos
        extent = match[0]
        cursor = 0       # end of the right-most group located so far
        prev_start = 0   # start of the most recently located group
        for v'var i = 1; i < match.length; i++':
            g = match[i]  # noqa:undef
            if g is undefined:
                continue
            # A following sibling group starts at or after the end of the
            # previous group ...
            loc = extent.indexOf(g, cursor)
            if loc == -1:
                # ... while a nested group starts inside the previous one.
                loc = extent.indexOf(g, prev_start)
            if loc == -1:
                # Text not inside the match (e.g. captured in a look-ahead):
                # fall back to the extents of the preceding group.
                self._start[i] = self._start[i-1]  # noqa:undef
                self._end[i] = self._end[i-1]  # noqa:undef
                continue
            self._start[i] = offset + loc  # noqa:undef
            self._end[i] = offset + loc + g.length  # noqa:undef
            prev_start = loc
            cursor = Math.max(cursor, loc + g.length)

    def groups(self, defval=None):
        # Return all sub-groups (1..n) as a list; groups that did not
        # participate in the match are replaced by ``defval``.
        ans = v'[]'
        for v'var i = 1; i < self._groups.length; i++':
            val = self._groups[i]  # noqa:undef
            if val is undefined:
                val = defval
            ans.push(val)
        return ans

    def _group_number(self, g):
        # Resolve a group name to its (last) group number; numbers and
        # unknown names are returned unchanged.
        if type(g) == 'number':
            return g
        if Object.prototype.hasOwnProperty.call(self.re.group_name_map, g):
            return self.re.group_name_map[g][-1]
        return g

    def _group_val(self, q, defval):
        # Value of the group identified by number or name ``q``, or
        # ``defval`` when the group is unknown or did not match.
        val = undefined
        if type(q) == 'number' and -1 < q < self._groups.length:
            val = self._groups[q]
        else:
            if Object.prototype.hasOwnProperty.call(self.re.group_name_map, q):
                val = self._groups[self.re.group_name_map[q][-1]]
        if val is undefined:
            val = defval
        return val

    def group(self):
        # group() -> whole match; group(a) -> that group;
        # group(a, b, ...) -> list with one entry per argument.
        if arguments.length == 0:
            return self._groups[0]
        ans = v'[]'
        for v'var i = 0; i < arguments.length; i++':
            q = arguments[i]  # noqa:undef
            ans.push(self._group_val(q, None))
        return ans[0] if ans.length == 1 else ans

    def start(self, g):
        # Start index of group ``g`` (default: whole match), -1 if unknown.
        if self._start is undefined:
            self._compute_extents()
        val = self._start[self._group_number(g or 0)]
        if val is undefined:
            val = -1
        return val

    def end(self, g):
        # End index of group ``g`` (default: whole match), -1 if unknown.
        if self._end is undefined:
            self._compute_extents()
        val = self._end[self._group_number(g or 0)]
        if val is undefined:
            val = -1
        return val

    def span(self, g):
        # [start, end] of group ``g``.
        return [self.start(g), self.end(g)]

    def expand(self, repl):
        # Expand the replacement template ``repl`` using this match's groups.
        # _expand() takes the groups first and the template second.
        return _expand(self._groups, repl, self.re.group_name_map)

    def groupdict(self, defval=None):
        # Map every group name to the value of the last group carrying that
        # name; unmatched groups map to ``defval``.
        gnm = self.re.group_name_map
        names = Object.keys(gnm)
        ans = {}
        for v"var i = 0; i < names.length; i++":
            name = names[i]  # noqa:undef
            if Object.prototype.hasOwnProperty.call(gnm, name):
                val = self._groups[gnm[name][-1]]
                if val is undefined:
                    val = defval
                ans[name] = val
        return ans

    def captures(self, group_name):
        # All matched values of the groups named ``group_name``, in group
        # order. Unknown names yield an empty list.
        ans = []
        if not Object.prototype.hasOwnProperty.call(self.re.group_name_map, group_name):
            return ans
        groups = self.re.group_name_map[group_name]
        for v'var i = 0; i < groups.length; i++':
            val = self._groups[groups[i]]  # noqa:undef
            if val is not undefined:
                ans.push(val)
        return ans

    def capturesdict(self):
        # Map every group name to the result of captures(name).
        gnm = self.re.group_name_map
        names = Object.keys(gnm)
        ans = {}
        for v'var i = 0; i < names.length; i++':
            name = names[i]  # noqa:undef
            ans[name] = self.captures(name)
        return ans
class RegexObject:
    # A compiled regular expression.
    #
    # Attributes:
    #   pattern         the original (Python syntax) pattern text
    #   js_pattern      the pattern translated to JavaScript syntax
    #   flags           the effective flag bits, including inline flags
    #   group_name_map  group name -> list of group numbers

    def __init__(self, pattern, flags):
        self.pattern = pattern.source if isinstance(pattern, RegExp) else pattern
        self.js_pattern, self.flags, self.group_name_map = transform_regex(self.pattern, flags)
        modifiers = ''
        if self.flags & IGNORECASE: modifiers += 'i'
        if self.flags & MULTILINE: modifiers += 'm'
        if not (self.flags & ASCII) and supports_unicode:
            modifiers += 'u'
        # Always global, so that lastIndex can be used to position searches
        self._modifiers = modifiers + 'g'
        self._pattern = RegExp(self.js_pattern, self._modifiers)

    def _do_search(self, pat, string, pos, endpos):
        # Return a MatchObject for the first match of ``pat`` that starts at
        # or after ``pos`` in ``string[:endpos]``, or None.
        if endpos is not None:
            string = string[:endpos]
        # Start scanning at pos directly. Repeatedly calling exec() from 0
        # until a match at >= pos turns up never terminates when an empty
        # match is found first, because exec() does not advance lastIndex
        # past an empty match.
        pat.lastIndex = pos
        n = pat.exec(string)
        if n is None:
            return None
        return MatchObject(self, n, pos, endpos)

    def search(self, string, pos=0, endpos=None):
        # Find the first match anywhere in ``string``.
        return self._do_search(self._pattern, string, pos, endpos)

    def match(self, string, pos=0, endpos=None):
        # Match only at the beginning of ``string``.
        #
        # The pattern is wrapped in a non-capturing group so that the anchor
        # applies to every alternative ('^a|b' would anchor only 'a').
        # Group numbering is unaffected.
        anchored = RegExp('^(?:' + self.js_pattern + ')', self._modifiers)
        ans = self._do_search(anchored, string, pos, endpos)
        # With MULTILINE, ^ also matches after every newline; a match that
        # does not begin at the requested position is not a match().
        if ans is not None and ans._start_pos != Math.max(pos or 0, 0):
            return None
        return ans

    def split(self, string, maxsplit=0):
        # Split ``string`` by the occurrences of the pattern. The text of any
        # capturing groups is included in the result. When ``maxsplit`` is
        # positive at most that many splits are made and the remainder of the
        # string is returned as the final element.
        pat = self._pattern
        pat.lastIndex = 0
        if not maxsplit or maxsplit < 1:
            return string.split(pat)
        # The ``limit`` argument of String.prototype.split truncates the
        # result instead of keeping the unsplit remainder, so it cannot be
        # used to implement maxsplit.
        ans = v'[]'
        last = 0
        num = 0
        while num < maxsplit:
            m = pat.exec(string)
            if m is None:
                break
            if m[0].length == 0:
                # exec() does not move past an empty match by itself
                pat.lastIndex += 1
                # Like String.prototype.split, do not split on an empty
                # match at the previous split point or at the very end
                if m.index == last or m.index >= string.length:
                    continue
            ans.push(string[last:m.index])
            for v'var j = 1; j < m.length; j++':
                ans.push(m[j])  # noqa:undef
            last = m.index + m[0].length
            num += 1
        pat.lastIndex = 0
        ans.push(string[last:])
        return ans

    def findall(self, string):
        # Return the text of every non-overlapping match as a list.
        # NOTE(review): unlike Python, capturing groups are ignored and the
        # whole match is always returned -- confirm whether callers rely on it.
        self._pattern.lastIndex = 0
        return _$rapyd$_list_decorate(string.match(self._pattern) or v'[]')

    def finditer(self, string):
        # Return an iterator yielding a MatchObject for every match.
        pat = RegExp(this._pattern.source, this._modifiers)  # We have to do this since lastIndex is mutable
        return {
            '_string':string,
            '_r': pat,
            '_self': self,
            _$rapyd$_iterator_symbol: def (): return this;,
            'next': def ():
                m = this._r.exec(this._string)
                if m is None:
                    return {'done':True}
                if m[0].length == 0:
                    # Step over an empty match, otherwise exec() would
                    # return the same match forever
                    this._r.lastIndex += 1
                return {'done':False, 'value':MatchObject(this._self, m, 0, None)}
        }

    def subn(self, repl, string, count=0):
        # Replace the matches of the pattern in ``string`` with ``repl``,
        # which is either a template string or a function that receives a
        # MatchObject and returns the replacement. At most ``count``
        # replacements are made (all of them when count < 1).
        #
        # Returns [new_string, number_of_replacements].
        expand = _expand
        if type(repl) == 'function':
            expand = def(m, repl, gnm): return '' + repl(MatchObject(self, m, 0, None))
        pat = self._pattern
        pat.lastIndex = 0
        # Collect all matches before calling repl, so that a callback which
        # itself uses this pattern cannot disturb lastIndex mid-scan.
        matches = v'[]'
        while count < 1 or matches.length < count:
            m = pat.exec(string)
            if m is None:
                break
            matches.push(m)
            if m[0].length == 0:
                # Step over an empty match, otherwise this loop never ends
                pat.lastIndex += 1
        pat.lastIndex = 0
        # Build the result left to right so that a callable repl sees the
        # matches in the order they occur in the string.
        pieces = v'[]'
        last = 0
        for v'var i = 0; i < matches.length; i++':
            m = matches[i]  # noqa:undef
            pieces.push(string[last:m.index])
            pieces.push(expand(m, repl, self.group_name_map))
            last = m.index + m[0].length
        pieces.push(string[last:])
        return pieces.join(''), matches.length

    def sub(self, repl, string, count=0):
        # Like subn(), but return only the new string.
        return self.subn(repl, string, count)[0]
def _get_from_cache(pattern, flags):
    # Return the RegexObject for (pattern, flags), compiling and caching it
    # on first use. ``pattern`` may be a string or a JavaScript RegExp, in
    # which case its source text is used. The cache holds at most 100
    # entries; the oldest one is evicted to make room for a new one.
    source = pattern.source if isinstance(pattern, RegExp) else pattern
    cache_key = JSON.stringify(v'[source, flags]')
    if not Object.prototype.hasOwnProperty.call(_re_cache_map, cache_key):
        if _re_cache_items.length >= 100:
            v'delete _re_cache_map[_re_cache_items.shift()]'
        _re_cache_map[cache_key] = RegexObject(source, flags)
        _re_cache_items.push(cache_key)
    return _re_cache_map[cache_key]
def compile(pattern, flags=0):
    # Compile ``pattern`` with ``flags`` into a RegexObject; the result comes
    # from (and is stored in) the module's pattern cache.
    regex = _get_from_cache(pattern, flags)
    return regex
def search(pattern, string, flags=0):
    # Look for the first location in ``string`` where ``pattern`` matches.
    # Returns whatever RegexObject.search() returns for the cached pattern.
    regex = _get_from_cache(pattern, flags)
    return regex.search(string)
def match(pattern, string, flags=0):
    # Try to apply ``pattern`` at the start of ``string``. Returns whatever
    # RegexObject.match() returns for the cached pattern.
    regex = _get_from_cache(pattern, flags)
    return regex.match(string)
def split(pattern, string, maxsplit=0, flags=0):
    # Split ``string`` by the occurrences of ``pattern``.
    #
    # maxsplit: forwarded to RegexObject.split(); 0 means no limit
    # flags:    flag bits used to compile ``pattern``
    #
    # maxsplit used to be accepted but silently dropped; it is now passed on.
    return _get_from_cache(pattern, flags).split(string, maxsplit)
def findall(pattern, string, flags=0):
    # Return the matches of ``pattern`` in ``string`` as produced by
    # RegexObject.findall() on the cached pattern.
    regex = _get_from_cache(pattern, flags)
    return regex.findall(string)
def finditer(pattern, string, flags=0):
    # Return an iterator over the matches of ``pattern`` in ``string``, as
    # produced by RegexObject.finditer() on the cached pattern.
    regex = _get_from_cache(pattern, flags)
    return regex.finditer(string)
def sub(pattern, repl, string, count=0, flags=0):
    # Replace matches of ``pattern`` in ``string`` with ``repl`` and return
    # the new string. Delegates to RegexObject.sub() on the cached pattern.
    regex = _get_from_cache(pattern, flags)
    return regex.sub(repl, string, count)
def subn(pattern, repl, string, count=0, flags=0):
    # Like sub(), but return what RegexObject.subn() returns: the new string
    # together with the number of replacements made.
    regex = _get_from_cache(pattern, flags)
    return regex.subn(repl, string, count)
def escape(string):
    # Return ``string`` with every character matched by _RE_ESCAPE prefixed
    # by a backslash, so that it can be embedded literally in a pattern.
    # '$&' in the replacement stands for the matched character itself.
    escaped = string.replace(_RE_ESCAPE, '\\$&')
    return escaped
def purge():
    # Empty the compiled-pattern cache by rebinding both the key list and the
    # lookup map to fresh, empty containers.
    nonlocal _re_cache_map, _re_cache_items
    _re_cache_items = v'[]'
    _re_cache_map = {}