2016-04-08 09:54:27 +05:30

469 lines
23 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# vim:fileencoding=utf-8
# License: BSD
# Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
# Copyright: 2013, Alexander Tsepkov
# globals: ρσ_iterator_symbol, ρσ_list_decorate
# basic implementation of Python's 're' library
# Alias DB from http://www.unicode.org/Public/8.0.0/ucd/NameAliases.txt {{{
_ALIAS_MAP = {"null":0,"nul":0,"start of heading":1,"soh":1,"start of text":2,"stx":2,"end of text":3,"etx":3,"end of transmission":4,"eot":4,"enquiry":5,"enq":5,"acknowledge":6,"ack":6,"alert":7,"bel":7,"backspace":8,"bs":8,"character tabulation":9,"horizontal tabulation":9,"ht":9,"tab":9,"line feed":10,"new line":10,"end of line":10,"lf":10,"nl":10,"eol":10,"line tabulation":11,"vertical tabulation":11,"vt":11,"form feed":12,"ff":12,"carriage return":13,"cr":13,"shift out":14,"locking-shift one":14,"so":14,"shift in":15,"locking-shift zero":15,"si":15,"data link escape":16,"dle":16,"device control one":17,"dc1":17,"device control two":18,"dc2":18,"device control three":19,"dc3":19,"device control four":20,"dc4":20,"negative acknowledge":21,"nak":21,"synchronous idle":22,"syn":22,"end of transmission block":23,"etb":23,"cancel":24,"can":24,"end of medium":25,"eom":25,"substitute":26,"sub":26,"escape":27,"esc":27,"information separator four":28,"file separator":28,"fs":28,"information separator three":29,"group separator":29,"gs":29,"information separator two":30,"record separator":30,"rs":30,"information separator one":31,"unit separator":31,"us":31,"sp":32,"delete":127,"del":127,"padding character":128,"pad":128,"high octet preset":129,"hop":129,"break permitted here":130,"bph":130,"no break here":131,"nbh":131,"index":132,"ind":132,"next line":133,"nel":133,"start of selected area":134,"ssa":134,"end of selected area":135,"esa":135,"character tabulation set":136,"horizontal tabulation set":136,"hts":136,"character tabulation with justification":137,"horizontal tabulation with justification":137,"htj":137,"line tabulation set":138,"vertical tabulation set":138,"vts":138,"partial line forward":139,"partial line down":139,"pld":139,"partial line backward":140,"partial line up":140,"plu":140,"reverse line feed":141,"reverse index":141,"ri":141,"single shift two":142,"single-shift-2":142,"ss2":142,"single shift three":143,"single-shift-3":143,"ss3":143,"device 
control string":144,"dcs":144,"private use one":145,"private use-1":145,"pu1":145,"private use two":146,"private use-2":146,"pu2":146,"set transmit state":147,"sts":147,"cancel character":148,"cch":148,"message waiting":149,"mw":149,"start of guarded area":150,"start of protected area":150,"spa":150,"end of guarded area":151,"end of protected area":151,"epa":151,"start of string":152,"sos":152,"single graphic character introducer":153,"sgc":153,"single character introducer":154,"sci":154,"control sequence introducer":155,"csi":155,"string terminator":156,"st":156,"operating system command":157,"osc":157,"privacy message":158,"pm":158,"application program command":159,"apc":159,"nbsp":160,"shy":173,"latin capital letter gha":418,"latin small letter gha":419,"cgj":847,"alm":1564,"syriac sublinear colon skewed left":1801,"kannada letter llla":3294,"lao letter fo fon":3741,"lao letter fo fay":3743,"lao letter ro":3747,"lao letter lo":3749,"tibetan mark bka- shog gi mgo rgyan":4048,"fvs1":6155,"fvs2":6156,"fvs3":6157,"mvs":6158,"zwsp":8203,"zwnj":8204,"zwj":8205,"lrm":8206,"rlm":8207,"lre":8234,"rle":8235,"pdf":8236,"lro":8237,"rlo":8238,"nnbsp":8239,"mmsp":8287,"wj":8288,"lri":8294,"rli":8295,"fsi":8296,"pdi":8297,"weierstrass elliptic function":8472,"micr on us symbol":9288,"micr dash symbol":9289,"leftwards triangle-headed arrow with double vertical stroke":11130,"rightwards triangle-headed arrow with double vertical stroke":11132,"yi syllable iteration mark":40981,"presentation form for vertical right white lenticular bracket":65048,"vs1":65024,"vs2":65025,"vs3":65026,"vs4":65027,"vs5":65028,"vs6":65029,"vs7":65030,"vs8":65031,"vs9":65032,"vs10":65033,"vs11":65034,"vs12":65035,"vs13":65036,"vs14":65037,"vs15":65038,"vs16":65039,"byte order mark":65279,"bom":65279,"zwnbsp":65279,"cuneiform sign nu11 tenu":74452,"cuneiform sign nu11 over nu11 bur over bur":74453,"byzantine musical symbol fthora skliron chroma 
vasis":118981,"vs17":917760,"vs18":917761,"vs19":917762,"vs20":917763,"vs21":917764,"vs22":917765,"vs23":917766,"vs24":917767,"vs25":917768,"vs26":917769,"vs27":917770,"vs28":917771,"vs29":917772,"vs30":917773,"vs31":917774,"vs32":917775,"vs33":917776,"vs34":917777,"vs35":917778,"vs36":917779,"vs37":917780,"vs38":917781,"vs39":917782,"vs40":917783,"vs41":917784,"vs42":917785,"vs43":917786,"vs44":917787,"vs45":917788,"vs46":917789,"vs47":917790,"vs48":917791,"vs49":917792,"vs50":917793,"vs51":917794,"vs52":917795,"vs53":917796,"vs54":917797,"vs55":917798,"vs56":917799,"vs57":917800,"vs58":917801,"vs59":917802,"vs60":917803,"vs61":917804,"vs62":917805,"vs63":917806,"vs64":917807,"vs65":917808,"vs66":917809,"vs67":917810,"vs68":917811,"vs69":917812,"vs70":917813,"vs71":917814,"vs72":917815,"vs73":917816,"vs74":917817,"vs75":917818,"vs76":917819,"vs77":917820,"vs78":917821,"vs79":917822,"vs80":917823,"vs81":917824,"vs82":917825,"vs83":917826,"vs84":917827,"vs85":917828,"vs86":917829,"vs87":917830,"vs88":917831,"vs89":917832,"vs90":917833,"vs91":917834,"vs92":917835,"vs93":917836,"vs94":917837,"vs95":917838,"vs96":917839,"vs97":917840,"vs98":917841,"vs99":917842,"vs100":917843,"vs101":917844,"vs102":917845,"vs103":917846,"vs104":917847,"vs105":917848,"vs106":917849,"vs107":917850,"vs108":917851,"vs109":917852,"vs110":917853,"vs111":917854,"vs112":917855,"vs113":917856,"vs114":917857,"vs115":917858,"vs116":917859,"vs117":917860,"vs118":917861,"vs119":917862,"vs120":917863,"vs121":917864,"vs122":917865,"vs123":917866,"vs124":917867,"vs125":917868,"vs126":917869,"vs127":917870,"vs128":917871,"vs129":917872,"vs130":917873,"vs131":917874,"vs132":917875,"vs133":917876,"vs134":917877,"vs135":917878,"vs136":917879,"vs137":917880,"vs138":917881,"vs139":917882,"vs140":917883,"vs141":917884,"vs142":917885,"vs143":917886,"vs144":917887,"vs145":917888,"vs146":917889,"vs147":917890,"vs148":917891,"vs149":917892,"vs150":917893,"vs151":917894,"vs152":917895,"vs153":917896,"vs154":917897
,"vs155":917898,"vs156":917899,"vs157":917900,"vs158":917901,"vs159":917902,"vs160":917903,"vs161":917904,"vs162":917905,"vs163":917906,"vs164":917907,"vs165":917908,"vs166":917909,"vs167":917910,"vs168":917911,"vs169":917912,"vs170":917913,"vs171":917914,"vs172":917915,"vs173":917916,"vs174":917917,"vs175":917918,"vs176":917919,"vs177":917920,"vs178":917921,"vs179":917922,"vs180":917923,"vs181":917924,"vs182":917925,"vs183":917926,"vs184":917927,"vs185":917928,"vs186":917929,"vs187":917930,"vs188":917931,"vs189":917932,"vs190":917933,"vs191":917934,"vs192":917935,"vs193":917936,"vs194":917937,"vs195":917938,"vs196":917939,"vs197":917940,"vs198":917941,"vs199":917942,"vs200":917943,"vs201":917944,"vs202":917945,"vs203":917946,"vs204":917947,"vs205":917948,"vs206":917949,"vs207":917950,"vs208":917951,"vs209":917952,"vs210":917953,"vs211":917954,"vs212":917955,"vs213":917956,"vs214":917957,"vs215":917958,"vs216":917959,"vs217":917960,"vs218":917961,"vs219":917962,"vs220":917963,"vs221":917964,"vs222":917965,"vs223":917966,"vs224":917967,"vs225":917968,"vs226":917969,"vs227":917970,"vs228":917971,"vs229":917972,"vs230":917973,"vs231":917974,"vs232":917975,"vs233":917976,"vs234":917977,"vs235":917978,"vs236":917979,"vs237":917980,"vs238":917981,"vs239":917982,"vs240":917983,"vs241":917984,"vs242":917985,"vs243":917986,"vs244":917987,"vs245":917988,"vs246":917989,"vs247":917990,"vs248":917991,"vs249":917992,"vs250":917993,"vs251":917994,"vs252":917995,"vs253":917996,"vs254":917997,"vs255":917998,"vs256":917999}
# }}}
# Maps the character following a backslash in a replacement template to the
# character code of the control character it stands for (used by _expand)
_ASCII_CONTROL_CHARS = {'a':7, 'b':8, 'f': 12, 'n': 10, 'r': 13, 't': 9, 'v': 11}
# Single character tests used by _expand while reading \x, \u, \U escapes
# (_HEX_PAT) and numeric group references (_NUM_PAT)
_HEX_PAT = /^[a-fA-F0-9]/
_NUM_PAT = /^[0-9]/
# Extracts the name from the <name> part of a \g<name> reference
_GROUP_PAT = /<([^>]+)>/
# Single character test for the characters accepted inside \N{...}
_NAME_PAT = /^[a-zA-Z ]/
# Flag bits. Each flag has a one letter alias and a long name.
I = IGNORECASE = 2
L = LOCALE = 4
M = MULTILINE = 8
D = DOTALL = 16
U = UNICODE = 32
X = VERBOSE = 64
DEBUG = 128
A = ASCII = 256
# True when the JavaScript engine exposes RegExp.prototype.unicode; RegexObject
# only adds the 'u' modifier when this is set
supports_unicode = RegExp.prototype.unicode is not undefined
# Characters that escape() prefixes with a backslash
_RE_ESCAPE = /[-\/\\^$*+?.()|[\]{}]/g
# Cache of RegexObject instances, keyed by JSON.stringify([pattern, flags]).
# _re_cache_items holds the keys in insertion order so that the oldest entry
# can be evicted by _get_from_cache()
_re_cache_map = {}
_re_cache_items = v'[]'
error = SyntaxError # This is the error JS throws for invalid regexps
# has_prop(obj, name) is Object.prototype.hasOwnProperty.call(obj, name)
has_prop = Object.prototype.hasOwnProperty.call.bind(Object.prototype.hasOwnProperty)
def _expand(groups, repl, group_name_map):
    # Expand the backslash escapes in the replacement template `repl`.
    #
    # groups:         array-like of matched groups, groups[n] is the text of
    #                 group n (as returned by RegExp.prototype.exec)
    # repl:           the replacement template string
    # group_name_map: object mapping a group name to an array of group
    #                 numbers, the last number in the array is the one used
    #
    # Returns the expanded string. Escapes that cannot be interpreted are
    # copied to the output unchanged. References to groups that did not
    # participate in the match expand to the empty string.
    i = 0

    def next():
        # Consume and return the next character (undefined at the end)
        nonlocal i
        return v'repl[i++]'

    def peek():
        # Look at the next character without consuming it. At the end of the
        # template return '' instead of undefined: RegExp.test() converts its
        # argument to a string and "undefined" starts with a letter, which made
        # the \N{...} scanner below loop forever on an unterminated name.
        return repl[i] or ''

    def read_digits(count, pat, base, maxval, prefix):
        # Read up to count characters matching pat and parse them in the
        # specified base. Returns a number on success, otherwise returns the
        # characters read so far as a string. A count of Number.MAX_VALUE
        # means read as many matching characters as are available.
        ans = prefix or ''
        greedy = count is Number.MAX_VALUE
        while count > 0:
            count -= 1
            if not pat.test(peek()):
                if greedy:
                    break
                return ans
            ans += next()
        nval = parseInt(ans, base)
        if nval > maxval:
            return ans
        return nval

    def read_escape_sequence():
        # Called just after a backslash has been consumed
        nonlocal i
        q = next()
        if not q or q is '\\':
            return '\\'
        if '"\''.indexOf(q) is not -1:
            return q
        if _ASCII_CONTROL_CHARS[q]:
            return String.fromCharCode(_ASCII_CONTROL_CHARS[q])
        if '0' <= q <= '9':
            # \number: numeric group reference
            ans = read_digits(Number.MAX_VALUE, _NUM_PAT, 10, Number.MAX_VALUE, q)
            if type(ans) is 'number':
                return groups[ans] or ''
            return '\\' + ans
        if q is 'g':
            # \g<name> or \g<number>
            m = _GROUP_PAT.exec(repl[i:])
            if m is not None:
                i += m[0].length
                gn = m[1]
                if isNaN(parseInt(gn, 10)):
                    if not has_prop(group_name_map, gn):
                        return ''
                    gn = group_name_map[gn][-1]
                return groups[gn] or ''
        if q is 'x':
            code = read_digits(2, _HEX_PAT, 16, 0x10FFFF)
            if type(code) is 'number':
                return String.fromCharCode(code)
            return '\\x' + code
        if q is 'u':
            code = read_digits(4, _HEX_PAT, 16, 0x10FFFF)
            if type(code) is 'number':
                return String.fromCharCode(code)
            return '\\u' + code
        if q is 'U':
            code = read_digits(8, _HEX_PAT, 16, 0x10FFFF)
            if type(code) is 'number':
                if code <= 0xFFFF:
                    return String.fromCharCode(code)
                # Encode as a UTF-16 surrogate pair
                code -= 0x10000
                return String.fromCharCode(0xD800+(code>>10), 0xDC00+(code&0x3FF))
            return '\\U' + code
        if q is 'N' and peek() is '{':
            # \N{name}: character looked up by name in _ALIAS_MAP
            next()
            name = ''
            while _NAME_PAT.test(peek()):
                name += next()
            if peek() is not '}':
                return '\\N{' + name
            next()
            key = (name or '').toLowerCase()
            if not name or not has_prop(_ALIAS_MAP, key):
                return '\\N{' + name + '}'
            code = _ALIAS_MAP[key]
            if code <= 0xFFFF:
                return String.fromCharCode(code)
            code -= 0x10000
            return String.fromCharCode(0xD800+(code>>10), 0xDC00+(code&0x3FF))
        return '\\' + q

    ans = ch = ''
    while (ch = next()):
        if ch is '\\':
            ans += read_escape_sequence()
        else:
            ans += ch
    return ans
def transform_regex(source, flags):
    # Convert the Python style regular expression `source` into one that the
    # JavaScript RegExp constructor accepts.
    #
    # source: the pattern as a string
    # flags:  initial flag bits (falsy values are treated as 0)
    #
    # Returns [js_pattern, flags, group_map] where flags includes any inline
    # flags found in the pattern, such as (?i), and group_map maps each group
    # name to an array of the group numbers defined with that name.
    #
    # Raises SyntaxError for constructs that are malformed or unsupported in
    # JavaScript.
    pos = 0
    previous_backslash = in_class = False
    ans = ''
    group_map = {}
    flags = flags or 0
    group_count = 0
    while pos < source.length:
        ch = v'source[pos++]'
        if previous_backslash:
            ans += '\\' + ch
            previous_backslash = False
            continue
        # Escapes must be recognized before the character class check,
        # otherwise the ] in [\]] is taken as the end of the class and the
        # rest of the class is processed as if it were outside it
        if ch is '\\':
            previous_backslash = True
            continue
        if in_class:
            if ch is ']':
                in_class = False
            ans += ch
            continue
        if ch is '[':
            in_class = True
            if source[pos] is ']':  # in python the empty set is not allowed, instead []] is the same as [\]]
                pos += 1
                ch = r'[\]'
            elif source[pos] is '^' and source[pos + 1] is ']':  # similarly [^]] is the same as [^\]]
                pos += 2
                ch = r'[^\]'
        elif ch is '(':
            if source[pos] is '?':
                extension = source[pos + 1]
                if extension is '#':
                    # (?#...) is a comment, drop it
                    close = source.indexOf(')', pos + 1)
                    if close is -1:
                        # NOTE(review): this is the only ValueError in this function, every other failure raises SyntaxError
                        raise ValueError('Expecting a closing )')
                    pos = close + 1
                    continue
                if 'aiLmsux'.indexOf(extension) is not -1:
                    # Inline flags such as (?im), merged into flags and removed from the pattern
                    flag_map = {'a':ASCII, 'i':IGNORECASE, 'L':LOCALE, 'm':MULTILINE, 's':DOTALL, 'u':UNICODE, 'x':VERBOSE}
                    close = source.indexOf(')', pos + 1)
                    if close is -1:
                        raise SyntaxError('Expecting a closing )')
                    flgs = source[pos+1:close]
                    for v'var i = 0; i < flgs.length; i++':
                        q = flgs[i]  # noqa:undef
                        if not has_prop(flag_map, q):
                            raise SyntaxError('Invalid flag: ' + q)
                        flags |= flag_map[q]
                    pos = close + 1
                    continue
                if extension is '<':
                    raise SyntaxError('Look behind assertions are not supported in JavaScript')
                if extension is '(':
                    raise SyntaxError('Group existence assertions are not supported in JavaScript')
                if extension is 'P':
                    pos += 2
                    q = source[pos]
                    if q is '<':
                        # (?P<name>...) becomes a plain numbered group, the name is recorded in group_map
                        close = source.indexOf('>', pos)
                        if close is -1:
                            raise SyntaxError('Named group not closed, expecting >')
                        name = source[pos+1:close]
                        if not has_prop(group_map, name):
                            group_map[name] = v'[]'
                        group_map[name].push(v'++group_count')
                        pos = close + 1
                    elif q is '=':
                        # (?P=name) becomes a numeric back-reference
                        close = source.indexOf(')', pos)
                        if close is -1:
                            raise SyntaxError('Named group back-reference not closed, expecting a )')
                        name = source[pos+1:close]
                        if not isNaN(parseInt(name, 10)):
                            ans += '\\' + name
                        else:
                            if not has_prop(group_map, name):
                                raise SyntaxError('Invalid back-reference. The named group: ' + name + ' has not yet been defined.')
                            ans += '\\' + group_map[name][-1]
                        pos = close + 1
                        continue
                    else:
                        raise SyntaxError('Expecting < or = after (?P')
            else:
                group_count += 1
        elif ch is '.' and (flags & DOTALL):
            ans += r'[\s\S]'  # JavaScript has no DOTALL
            continue
        ans += ch
    return ans, flags, group_map
class MatchObject:
    # The result of a successful match.
    #
    # regex:  the RegexObject that produced the match
    # match:  the array returned by RegExp.prototype.exec()
    # pos, endpos: the values that were passed to the search function

    def __init__(self, regex, match, pos, endpos):
        self.re = regex
        self.string = match.input
        self._start_pos = match.index
        self._groups = match
        self.pos, self.endpos = pos, endpos

    def _compute_extents(self):
        # compute start/end for each group
        # exec() only reports the position of the whole match, so the position
        # of each group is found by searching for its text inside the text of
        # the whole match, starting after the end of the previously located group.
        match = self._groups
        self._start = v'Array(match.length)'
        self._end = v'Array(match.length)'
        self._start[0] = self._start_pos
        self._end[0] = self._start_pos + match[0].length
        offset = self._start_pos
        extent = match[0]
        loc = 0
        for v'var i = 1; i < match.length; i++':
            g = match[i]  # noqa:undef
            if g is undefined:
                # The group did not participate in the match. Searching for
                # undefined would look for the text "undefined" and reading
                # its length would fail, so report -1 for both ends.
                self._start[i] = self._end[i] = -1
                continue
            found = extent.indexOf(g, loc)
            if found is -1:
                # Not found after the previous group, reuse the previous
                # group's extents and restart the search from the beginning
                self._start[i] = self._start[i-1]
                self._end[i] = self._end[i-1]
                loc = 0
            else:
                self._start[i] = offset + found
                loc = found + g.length
                self._end[i] = offset + loc

    def groups(self, defval=None):
        # Return an array of all groups except group 0, with defval
        # substituted for groups that did not participate in the match
        ans = v'[]'
        for v'var i = 1; i < self._groups.length; i++':
            val = self._groups[i]  # noqa:undef
            if val is undefined:
                val = defval
            ans.push(val)
        return ans

    def _group_number(self, g):
        # Convert a group name into a group number. Numbers and unknown
        # names are returned unchanged.
        if type(g) is 'number':
            return g
        if has_prop(self.re.group_name_map, g):
            return self.re.group_name_map[g][-1]
        return g

    def _group_val(self, q, defval):
        # Return the text of the group identified by the number or name q,
        # or defval if there is no such group or it did not participate
        val = undefined
        if type(q) is 'number' and -1 < q < self._groups.length:
            val = self._groups[q]
        else:
            if has_prop(self.re.group_name_map, q):
                val = self._groups[self.re.group_name_map[q][-1]]
        if val is undefined:
            val = defval
        return val

    def group(self):
        # With no arguments return the whole match. With one argument return
        # that group, with several return an array of the groups.
        if arguments.length is 0:
            return self._groups[0]
        ans = v'[]'
        for v'var i = 0; i < arguments.length; i++':
            q = arguments[i]  # noqa:undef
            ans.push(self._group_val(q, None))
        return ans[0] if ans.length is 1 else ans

    def start(self, g):
        # Index of the start of group g (default: the whole match), -1 if unknown
        if self._start is undefined:
            self._compute_extents()
        val = self._start[self._group_number(g or 0)]
        if val is undefined:
            val = -1
        return val

    def end(self, g):
        # Index of the end of group g (default: the whole match), -1 if unknown
        if self._end is undefined:
            self._compute_extents()
        val = self._end[self._group_number(g or 0)]
        if val is undefined:
            val = -1
        return val

    def span(self, g):
        return [self.start(g), self.end(g)]

    def expand(self, repl):
        # Expand the template repl using the groups of this match.
        # _expand() takes the groups first and the template second.
        return _expand(self._groups, repl, self.re.group_name_map)

    def groupdict(self, defval=None):
        # Return an object mapping every group name to the text of that
        # group, with defval for groups that did not participate
        gnm = self.re.group_name_map
        names = Object.keys(gnm)
        ans = {}
        for v"var i = 0; i < names.length; i++":
            name = names[i]  # noqa:undef
            if has_prop(gnm, name):
                val = self._groups[gnm[name][-1]]
                if val is undefined:
                    val = defval
                ans[name] = val
        return ans

    def captures(self, group_name):
        # Return the text of every participating group defined with the name group_name
        ans = []
        if not has_prop(self.re.group_name_map, group_name):
            return ans
        groups = self.re.group_name_map[group_name]
        for v'var i = 0; i < groups.length; i++':
            val = self._groups[groups[i]]  # noqa:undef
            if val is not undefined:
                ans.push(val)
        return ans

    def capturesdict(self):
        # Return an object mapping every group name to the result of captures() for that name
        gnm = self.re.group_name_map
        names = Object.keys(gnm)
        ans = {}
        for v'var i = 0; i < names.length; i++':
            name = names[i]  # noqa:undef
            ans[name] = self.captures(name)
        return ans
class RegexObject:
    # A compiled regular expression.
    #
    # pattern: a Python style pattern string or a JavaScript RegExp (whose
    #          source is used)
    # flags:   flag bits, combined with any inline flags in the pattern

    def __init__(self, pattern, flags):
        self.pattern = pattern.source if isinstance(pattern, RegExp) else pattern
        self.js_pattern, self.flags, self.group_name_map = transform_regex(self.pattern, flags)
        modifiers = ''
        if self.flags & IGNORECASE: modifiers += 'i'
        if self.flags & MULTILINE: modifiers += 'm'
        if not (self.flags & ASCII) and supports_unicode:
            modifiers += 'u'
        # All patterns are global so that lastIndex can be used to control
        # where matching starts
        self._modifiers = modifiers + 'g'
        self._pattern = RegExp(self.js_pattern, self._modifiers)

    def _step_over_empty_match(self, pat, string, m):
        # exec() leaves lastIndex unchanged after a zero-length match, so
        # calling it again would return the same match forever. Move lastIndex
        # past the next character, treating a surrogate pair as one character.
        if m[0].length is not 0:
            return
        step = 1
        lead = string.charCodeAt(m.index)
        if 0xD800 <= lead <= 0xDBFF:
            trail = string.charCodeAt(m.index + 1)
            if 0xDC00 <= trail <= 0xDFFF:
                step = 2
        pat.lastIndex = m.index + step

    def _do_search(self, pat, string, pos, endpos):
        # Return a MatchObject for the first match of pat that starts at or
        # after pos, or None. Only the first endpos characters of string are
        # searched when endpos is not None.
        if endpos is not None:
            string = string[:endpos]
        # Start matching directly at pos. Repeatedly calling exec() from
        # index 0 never terminates if a zero-length match precedes pos.
        pat.lastIndex = pos or 0
        n = pat.exec(string)
        if n is None:
            return None
        return MatchObject(self, n, pos, endpos)

    def search(self, string, pos=0, endpos=None):
        return self._do_search(self._pattern, string, pos, endpos)

    def match(self, string, pos=0, endpos=None):
        # The pattern is wrapped in a non-capturing group so that the anchor
        # applies to every alternative: ^a|b would match b anywhere
        anchored = RegExp('^(?:' + self.js_pattern + ')', self._modifiers)
        ans = self._do_search(anchored, string, pos, endpos)
        # With MULTILINE ^ also matches after every newline, only a match
        # that begins exactly at the start position counts
        if ans is not None and ans._start_pos is not (pos or 0):
            return None
        return ans

    def split(self, string, maxsplit=0):
        # Split string at the matches of the pattern. The text of any groups
        # in the pattern is included in the result. If maxsplit is greater
        # than zero at most maxsplit splits are made and the remainder of the
        # string is returned as the last element.
        pat = self._pattern
        pat.lastIndex = 0
        if not maxsplit or maxsplit < 1:
            return string.split(pat)
        # The limit argument of String.prototype.split() truncates the result
        # instead of keeping the remainder, so the splits are done by hand
        parts = v'[]'
        last = num = 0
        while num < maxsplit:
            m = pat.exec(string)
            if m is None:
                break
            if m[0].length is 0:
                self._step_over_empty_match(pat, string, m)
                # Like String.prototype.split(), ignore empty matches at the
                # previous split point and at the end of the string
                if m.index is last or m.index >= string.length:
                    continue
            parts.push(string[last:m.index])
            for v'var i = 1; i < m.length; i++':
                parts.push(m[i])  # noqa:undef
            last = m.index + m[0].length
            num += 1
        pat.lastIndex = 0
        parts.push(string[last:])
        return parts

    def findall(self, string):
        # Returns the full text of every match
        # NOTE(review): groups in the pattern are ignored here, unlike Python's findall()
        self._pattern.lastIndex = 0
        return ρσ_list_decorate(string.match(self._pattern) or v'[]')

    def finditer(self, string):
        # Return an iterator that yields a MatchObject for every match
        # We have to copy pat since lastIndex is mutable
        pat = RegExp(self._pattern.source, self._modifiers)  # noqa: unused-local
        ans = v"{'_string':string, '_r':pat, '_self':self}"
        ans[ρσ_iterator_symbol] = def():
            return this
        ans['next'] = def():
            m = this._r.exec(this._string)
            if m is None:
                return v"{'done':true}"
            # Without this a zero-length match would be yielded forever
            this._self._step_over_empty_match(this._r, this._string, m)
            return v"{'done':false, 'value':new MatchObject(this._self, m, 0, null)}"
        return ans

    def subn(self, repl, string, count=0):
        # Replace matches of the pattern in string with repl, which is either
        # a template string or a function that is called with a MatchObject.
        # At most count replacements are made, all of them if count < 1.
        # Returns [new_string, number_of_replacements].
        expand = _expand
        if type(repl) is 'function':
            expand = def(m, repl, gnm): return '' + repl(MatchObject(self, m, 0, None))
        pat = self._pattern
        pat.lastIndex = 0
        num = 0
        matches = v'[]'
        while count < 1 or num < count:
            m = pat.exec(string)
            if m is None:
                break
            # Without this a zero-length match would be collected forever
            self._step_over_empty_match(pat, string, m)
            matches.push(m)
            num += 1
        # Replace from the last match backwards so that the indices of the
        # earlier matches remain valid
        for v'var i = matches.length - 1; i > -1; i--':
            m = matches[i]  # noqa:undef
            start = m.index
            end = start + m[0].length
            string = string[:start] + expand(m, repl, self.group_name_map) + string[end:]
        return string, matches.length

    def sub(self, repl, string, count=0):
        return self.subn(repl, string, count)[0]
def _get_from_cache(pattern, flags):
    # Return the RegexObject for pattern and flags, creating and caching it
    # if needed. When the cache already holds 100 entries the oldest one is
    # discarded to make room.
    source = pattern.source if isinstance(pattern, RegExp) else pattern
    cache_key = JSON.stringify(v'[source, flags]')
    if not has_prop(_re_cache_map, cache_key):
        if _re_cache_items.length >= 100:
            v'delete _re_cache_map[_re_cache_items.shift()]'
        _re_cache_map[cache_key] = RegexObject(source, flags)
        _re_cache_items.push(cache_key)
    return _re_cache_map[cache_key]
def compile(pattern, flags=0):
    # Return the (cached) RegexObject for pattern and flags
    regex = _get_from_cache(pattern, flags)
    return regex
def search(pattern, string, flags=0):
    # Call search() on the cached RegexObject for pattern and flags
    regex = _get_from_cache(pattern, flags)
    return regex.search(string)
def match(pattern, string, flags=0):
    # Call match() on the cached RegexObject for pattern and flags
    regex = _get_from_cache(pattern, flags)
    return regex.match(string)
def split(pattern, string, maxsplit=0, flags=0):
    # Split string at the matches of pattern, see RegexObject.split().
    # maxsplit has to be forwarded, previously it was accepted but ignored.
    return _get_from_cache(pattern, flags).split(string, maxsplit)
def findall(pattern, string, flags=0):
    # Call findall() on the cached RegexObject for pattern and flags
    regex = _get_from_cache(pattern, flags)
    return regex.findall(string)
def finditer(pattern, string, flags=0):
    # Call finditer() on the cached RegexObject for pattern and flags
    regex = _get_from_cache(pattern, flags)
    return regex.finditer(string)
def sub(pattern, repl, string, count=0, flags=0):
    # Call sub() on the cached RegexObject for pattern and flags
    regex = _get_from_cache(pattern, flags)
    return regex.sub(repl, string, count)
def subn(pattern, repl, string, count=0, flags=0):
    # Call subn() on the cached RegexObject for pattern and flags
    regex = _get_from_cache(pattern, flags)
    return regex.subn(repl, string, count)
def escape(string):
    # Return string with a backslash inserted before every character that
    # _RE_ESCAPE matches ($& in the replacement stands for the matched text)
    escaped = string.replace(_RE_ESCAPE, '\\$&')
    return escaped
def purge():
    # Empty the cache of compiled regular expressions by replacing both the
    # key list and the lookup object with fresh, empty ones
    nonlocal _re_cache_map, _re_cache_items
    _re_cache_items = v'[]'
    _re_cache_map = {}