# vim:fileencoding=utf-8
# License: BSD
# Copyright: 2015, Kovid Goyal
# Copyright: 2013, Alexander Tsepkov

# globals: ρσ_iterator_symbol, ρσ_list_decorate

# basic implementation of Python's 're' library
#
# Python regular expression sources are rewritten by transform_regex() into
# JavaScript RegExp sources, and the resulting RegExp is wrapped in
# RegexObject / MatchObject, which mimic the corresponding Python classes.

# Alias DB from http://www.unicode.org/Public/8.0.0/ucd/NameAliases.txt {{{
# Maps a lower-cased character name alias to its code point. Used to resolve
# \N{...} escapes in replacement templates (see _expand).
_ALIAS_MAP = {
    "null":0,"nul":0,"start of heading":1,"soh":1,"start of text":2,"stx":2,"end of text":3,"etx":3,
    "end of transmission":4,"eot":4,"enquiry":5,"enq":5,"acknowledge":6,"ack":6,"alert":7,"bel":7,
    "backspace":8,"bs":8,"character tabulation":9,"horizontal tabulation":9,"ht":9,"tab":9,
    "line feed":10,"new line":10,"end of line":10,"lf":10,"nl":10,"eol":10,
    "line tabulation":11,"vertical tabulation":11,"vt":11,"form feed":12,"ff":12,"carriage return":13,"cr":13,
    "shift out":14,"locking-shift one":14,"so":14,"shift in":15,"locking-shift zero":15,"si":15,
    "data link escape":16,"dle":16,"device control one":17,"dc1":17,"device control two":18,"dc2":18,
    "device control three":19,"dc3":19,"device control four":20,"dc4":20,"negative acknowledge":21,"nak":21,
    "synchronous idle":22,"syn":22,"end of transmission block":23,"etb":23,"cancel":24,"can":24,
    "end of medium":25,"eom":25,"substitute":26,"sub":26,"escape":27,"esc":27,
    "information separator four":28,"file separator":28,"fs":28,
    "information separator three":29,"group separator":29,"gs":29,
    "information separator two":30,"record separator":30,"rs":30,
    "information separator one":31,"unit separator":31,"us":31,"sp":32,"delete":127,"del":127,
    "padding character":128,"pad":128,"high octet preset":129,"hop":129,"break permitted here":130,"bph":130,
    "no break here":131,"nbh":131,"index":132,"ind":132,"next line":133,"nel":133,
    "start of selected area":134,"ssa":134,"end of selected area":135,"esa":135,
    "character tabulation set":136,"horizontal tabulation set":136,"hts":136,
    "character tabulation with justification":137,"horizontal tabulation with justification":137,"htj":137,
    "line tabulation set":138,"vertical tabulation set":138,"vts":138,
    "partial line forward":139,"partial line down":139,"pld":139,
    "partial line backward":140,"partial line up":140,"plu":140,
    "reverse line feed":141,"reverse index":141,"ri":141,
    "single shift two":142,"single-shift-2":142,"ss2":142,"single shift three":143,"single-shift-3":143,"ss3":143,
    "device control string":144,"dcs":144,"private use one":145,"private use-1":145,"pu1":145,
    "private use two":146,"private use-2":146,"pu2":146,"set transmit state":147,"sts":147,
    "cancel character":148,"cch":148,"message waiting":149,"mw":149,
    "start of guarded area":150,"start of protected area":150,"spa":150,
    "end of guarded area":151,"end of protected area":151,"epa":151,"start of string":152,"sos":152,
    "single graphic character introducer":153,"sgc":153,"single character introducer":154,"sci":154,
    "control sequence introducer":155,"csi":155,"string terminator":156,"st":156,
    "operating system command":157,"osc":157,"privacy message":158,"pm":158,
    "application program command":159,"apc":159,"nbsp":160,"shy":173,
    "latin capital letter gha":418,"latin small letter gha":419,"cgj":847,"alm":1564,
    "syriac sublinear colon skewed left":1801,"kannada letter llla":3294,
    "lao letter fo fon":3741,"lao letter fo fay":3743,"lao letter ro":3747,"lao letter lo":3749,
    "tibetan mark bka- shog gi mgo rgyan":4048,"fvs1":6155,"fvs2":6156,"fvs3":6157,"mvs":6158,
    "zwsp":8203,"zwnj":8204,"zwj":8205,"lrm":8206,"rlm":8207,"lre":8234,"rle":8235,"pdf":8236,
    "lro":8237,"rlo":8238,"nnbsp":8239,"mmsp":8287,"wj":8288,"lri":8294,"rli":8295,"fsi":8296,"pdi":8297,
    "weierstrass elliptic function":8472,"micr on us symbol":9288,"micr dash symbol":9289,
    "leftwards triangle-headed arrow with double vertical stroke":11130,
    "rightwards triangle-headed arrow with double vertical stroke":11132,
    "yi syllable iteration mark":40981,
    "presentation form for vertical right white lenticular bracket":65048,
    "vs1":65024,"vs2":65025,"vs3":65026,"vs4":65027,"vs5":65028,"vs6":65029,"vs7":65030,"vs8":65031,
    "vs9":65032,"vs10":65033,"vs11":65034,"vs12":65035,"vs13":65036,"vs14":65037,"vs15":65038,"vs16":65039,
    "byte order mark":65279,"bom":65279,"zwnbsp":65279,
    "cuneiform sign nu11 tenu":74452,"cuneiform sign nu11 over nu11 bur over bur":74453,
    "byzantine musical symbol fthora skliron chroma vasis":118981,
    "vs17":917760,"vs18":917761,"vs19":917762,"vs20":917763,"vs21":917764,"vs22":917765,"vs23":917766,"vs24":917767,"vs25":917768,"vs26":917769,
    "vs27":917770,"vs28":917771,"vs29":917772,"vs30":917773,"vs31":917774,"vs32":917775,"vs33":917776,"vs34":917777,"vs35":917778,"vs36":917779,
    "vs37":917780,"vs38":917781,"vs39":917782,"vs40":917783,"vs41":917784,"vs42":917785,"vs43":917786,"vs44":917787,"vs45":917788,"vs46":917789,
    "vs47":917790,"vs48":917791,"vs49":917792,"vs50":917793,"vs51":917794,"vs52":917795,"vs53":917796,"vs54":917797,"vs55":917798,"vs56":917799,
    "vs57":917800,"vs58":917801,"vs59":917802,"vs60":917803,"vs61":917804,"vs62":917805,"vs63":917806,"vs64":917807,"vs65":917808,"vs66":917809,
    "vs67":917810,"vs68":917811,"vs69":917812,"vs70":917813,"vs71":917814,"vs72":917815,"vs73":917816,"vs74":917817,"vs75":917818,"vs76":917819,
    "vs77":917820,"vs78":917821,"vs79":917822,"vs80":917823,"vs81":917824,"vs82":917825,"vs83":917826,"vs84":917827,"vs85":917828,"vs86":917829,
    "vs87":917830,"vs88":917831,"vs89":917832,"vs90":917833,"vs91":917834,"vs92":917835,"vs93":917836,"vs94":917837,"vs95":917838,"vs96":917839,
    "vs97":917840,"vs98":917841,"vs99":917842,"vs100":917843,"vs101":917844,"vs102":917845,"vs103":917846,"vs104":917847,"vs105":917848,"vs106":917849,
    "vs107":917850,"vs108":917851,"vs109":917852,"vs110":917853,"vs111":917854,"vs112":917855,"vs113":917856,"vs114":917857,"vs115":917858,"vs116":917859,
    "vs117":917860,"vs118":917861,"vs119":917862,"vs120":917863,"vs121":917864,"vs122":917865,"vs123":917866,"vs124":917867,"vs125":917868,"vs126":917869,
    "vs127":917870,"vs128":917871,"vs129":917872,"vs130":917873,"vs131":917874,"vs132":917875,"vs133":917876,"vs134":917877,"vs135":917878,"vs136":917879,
    "vs137":917880,"vs138":917881,"vs139":917882,"vs140":917883,"vs141":917884,"vs142":917885,"vs143":917886,"vs144":917887,"vs145":917888,"vs146":917889,
    "vs147":917890,"vs148":917891,"vs149":917892,"vs150":917893,"vs151":917894,"vs152":917895,"vs153":917896,"vs154":917897,"vs155":917898,"vs156":917899,
    "vs157":917900,"vs158":917901,"vs159":917902,"vs160":917903,"vs161":917904,"vs162":917905,"vs163":917906,"vs164":917907,"vs165":917908,"vs166":917909,
    "vs167":917910,"vs168":917911,"vs169":917912,"vs170":917913,"vs171":917914,"vs172":917915,"vs173":917916,"vs174":917917,"vs175":917918,"vs176":917919,
    "vs177":917920,"vs178":917921,"vs179":917922,"vs180":917923,"vs181":917924,"vs182":917925,"vs183":917926,"vs184":917927,"vs185":917928,"vs186":917929,
    "vs187":917930,"vs188":917931,"vs189":917932,"vs190":917933,"vs191":917934,"vs192":917935,"vs193":917936,"vs194":917937,"vs195":917938,"vs196":917939,
    "vs197":917940,"vs198":917941,"vs199":917942,"vs200":917943,"vs201":917944,"vs202":917945,"vs203":917946,"vs204":917947,"vs205":917948,"vs206":917949,
    "vs207":917950,"vs208":917951,"vs209":917952,"vs210":917953,"vs211":917954,"vs212":917955,"vs213":917956,"vs214":917957,"vs215":917958,"vs216":917959,
    "vs217":917960,"vs218":917961,"vs219":917962,"vs220":917963,"vs221":917964,"vs222":917965,"vs223":917966,"vs224":917967,"vs225":917968,"vs226":917969,
    "vs227":917970,"vs228":917971,"vs229":917972,"vs230":917973,"vs231":917974,"vs232":917975,"vs233":917976,"vs234":917977,"vs235":917978,"vs236":917979,
    "vs237":917980,"vs238":917981,"vs239":917982,"vs240":917983,"vs241":917984,"vs242":917985,"vs243":917986,"vs244":917987,"vs245":917988,"vs246":917989,
    "vs247":917990,"vs248":917991,"vs249":917992,"vs250":917993,"vs251":917994,"vs252":917995,"vs253":917996,"vs254":917997,"vs255":917998,"vs256":917999
}
# }}}

# Single-letter escapes (\a, \b, ...) mapped to the character code they produce
_ASCII_CONTROL_CHARS = {'a':7, 'b':8, 'f': 12, 'n': 10, 'r': 13, 't': 9, 'v': 11}
# Each of these tests ONE character (the next character of a template)
_HEX_PAT = /^[a-fA-F0-9]/
_NUM_PAT = /^[0-9]/
# Anchored, so that the <...> must immediately follow \g; an unanchored
# pattern would find a later <...> and consume the wrong number of characters
_GROUP_PAT = /^<([^>]+)>/
# Characters allowed inside \N{...}. Digits and hyphen are needed because keys
# of _ALIAS_MAP such as "dc1", "vs17" or "single-shift-2" contain them
_NAME_PAT = /^[a-zA-Z0-9 -]/

# Flag bits, named as in Python's re module. In this file only IGNORECASE,
# MULTILINE, DOTALL and ASCII are consulted; the others are accepted and
# carried along in RegexObject.flags.
I = IGNORECASE = 2
L = LOCALE = 4
M = MULTILINE = 8
D = DOTALL = 16
U = UNICODE = 32
X = VERBOSE = 64
DEBUG = 128
A = ASCII = 256

# True when the JavaScript engine knows the `u` RegExp flag
supports_unicode = RegExp.prototype.unicode is not undefined

# Characters that escape() prefixes with a backslash
_RE_ESCAPE = /[-\/\\^$*+?.()|[\]{}]/g

# Cache of compiled RegexObject instances: key -> object, plus the keys in
# insertion order so that the oldest entry can be evicted
_re_cache_map = {}
_re_cache_items = v'[]'

error = SyntaxError  # This is the error JS throws for invalid regexps


def _expand(groups, repl, group_name_map):
    # Expand the replacement template `repl` for a single match.
    #
    # groups         - the JavaScript match array (index 0 is the whole match)
    # repl           - the template string, which may contain backslash escapes
    # group_name_map - map of group name -> list of group numbers
    #
    # Returns the expanded string. Escapes that cannot be interpreted are
    # copied into the output literally.
    i = 0

    def next():
        # Consume and return the next template character (undefined at the end)
        nonlocal i
        return v'repl[i++]'

    def peek():
        # Return the next template character without consuming it
        return repl[i]

    def read_digits(count, pat, base, maxval, prefix):
        # Read up to `count` characters matching `pat`, appended to `prefix`.
        # Returns a number when the digits could be read and the value does
        # not exceed `maxval`, otherwise returns the text read so far (a
        # string), so that the caller can emit it literally.
        ans = prefix or ''
        greedy = count is Number.MAX_VALUE
        while count > 0:
            count -= 1
            if not pat.test(peek()):
                if greedy:
                    break
                return ans
            ans += next()
        nval = parseInt(ans, base)
        if nval > maxval:
            return ans
        return nval

    def read_escape_sequence():
        # Called just after a backslash has been consumed; returns the text
        # the escape sequence expands to.
        nonlocal i
        q = next()
        if not q or q is '\\':
            return '\\'
        if '"\''.indexOf(q) is not -1:
            return q
        if _ASCII_CONTROL_CHARS.hasOwnProperty(q):
            return String.fromCharCode(_ASCII_CONTROL_CHARS[q])
        if '0' <= q <= '9':
            # numeric group reference: \1, \12, ...
            ans = read_digits(Number.MAX_VALUE, _NUM_PAT, 10, Number.MAX_VALUE, q)
            if type(ans) is 'number':
                return groups[ans] or ''
            return '\\' + ans
        if q is 'g':
            # \g<name> or \g<number>
            m = _GROUP_PAT.exec(repl[i:])
            if m is not None:
                i += m[0].length
                gn = m[1]
                if isNaN(parseInt(gn, 10)):
                    if not Object.prototype.hasOwnProperty.call(group_name_map, gn):
                        return ''
                    gn = group_name_map[gn][-1]
                return groups[gn] or ''
        if q is 'x':
            code = read_digits(2, _HEX_PAT, 16, 0x10FFFF)
            if type(code) is 'number':
                return String.fromCharCode(code)
            return '\\x' + code
        if q is 'u':
            code = read_digits(4, _HEX_PAT, 16, 0x10FFFF)
            if type(code) is 'number':
                return String.fromCharCode(code)
            return '\\u' + code
        if q is 'U':
            code = read_digits(8, _HEX_PAT, 16, 0x10FFFF)
            if type(code) is 'number':
                if code <= 0xFFFF:
                    return String.fromCharCode(code)
                # encode as a UTF-16 surrogate pair
                code -= 0x10000
                return String.fromCharCode(0xD800+(code>>10), 0xDC00+(code&0x3FF))
            return '\\U' + code
        if q is 'N' and peek() is '{':
            next()
            name = ''
            # The explicit end-of-input check is required: RegExp.test()
            # converts undefined to the string "undefined", which matches
            # _NAME_PAT, so an unterminated \N{ would otherwise never stop.
            while peek() is not undefined and _NAME_PAT.test(peek()):
                name += next()
            if peek() is not '}':
                return '\\N{' + name
            next()
            key = (name or '').toLowerCase()
            if not name or not Object.prototype.hasOwnProperty.call(_ALIAS_MAP, key):
                return '\\N{' + name + '}'
            code = _ALIAS_MAP[key]
            if code <= 0xFFFF:
                return String.fromCharCode(code)
            code -= 0x10000
            return String.fromCharCode(0xD800+(code>>10), 0xDC00+(code&0x3FF))

        return '\\' + q

    ans = ch = ''
    while (ch = next()):
        if ch is '\\':
            ans += read_escape_sequence()
        else:
            ans += ch
    return ans


def transform_regex(source, flags):
    # Rewrite the Python regular expression `source` as a JavaScript one.
    #
    # Returns [js_source, flags, group_map] where `flags` additionally
    # contains any inline (?aiLmsux) flags found in the pattern and
    # `group_map` maps every (?P<name>...) to the list of group numbers
    # declared under that name.
    #
    # Raises SyntaxError for constructs that cannot be expressed in
    # JavaScript or are malformed.
    # NOTE(review): an unterminated (?#...) comment raises ValueError, unlike
    # the other branches; left unchanged in case callers depend on it.
    pos = 0
    previous_backslash = in_class = False
    ans = ''
    group_map = {}
    flags = flags or 0
    group_count = 0
    while pos < source.length:
        ch = v'source[pos++]'
        if previous_backslash:
            ans += '\\' + ch
            previous_backslash = False
            continue

        # Checked before the character class test, so that an escaped ]
        # inside [...] does not end the class
        if ch is '\\':
            previous_backslash = True
            continue

        if in_class:
            if ch is ']':
                in_class = False
            ans += ch
            continue

        if ch is '[':
            in_class = True
            if source[pos] is ']':  # in python the empty set is not allowed, instead []] is the same as [\]]
                pos += 1
                ch = r'[\]'
            elif source[pos] is '^' and source[pos + 1] is ']':  # likewise [^]] is the same as [^\]]
                pos += 2
                ch = r'[^\]'

        elif ch is '(':
            if source[pos] is '?':
                extension = source[pos + 1]
                if extension is '#':
                    # (?#comment): dropped from the output
                    close = source.indexOf(')', pos + 1)
                    if close is -1:
                        raise ValueError('Expecting a closing )')
                    pos = close + 1
                    continue
                if 'aiLmsux'.indexOf(extension) is not -1:
                    # (?flags): folded into `flags` and dropped from the output
                    flag_map = {'a':ASCII, 'i':IGNORECASE, 'L':LOCALE, 'm':MULTILINE, 's':DOTALL, 'u':UNICODE, 'x':VERBOSE}
                    close = source.indexOf(')', pos + 1)
                    if close is -1:
                        raise SyntaxError('Expecting a closing )')
                    flgs = source[pos+1:close]
                    for v'var i = 0; i < flgs.length; i++':
                        q = flgs[i]  # noqa:undef
                        if not flag_map.hasOwnProperty(q):
                            raise SyntaxError('Invalid flag: ' + q)
                        flags |= flag_map[q]
                    pos = close + 1
                    continue
                if extension is '<':
                    raise SyntaxError('Look behind assertions are not supported in JavaScript')
                if extension is '(':
                    raise SyntaxError('Group existence assertions are not supported in JavaScript')
                if extension is 'P':
                    pos += 2
                    q = source[pos]
                    if q is '<':
                        # (?P<name>...): becomes a plain numbered group
                        close = source.indexOf('>', pos)
                        if close is -1:
                            raise SyntaxError('Named group not closed, expecting >')
                        name = source[pos+1:close]
                        if not Object.prototype.hasOwnProperty.call(group_map, name):
                            group_map[name] = v'[]'
                        group_map[name].push(v'++group_count')
                        pos = close + 1
                    elif q is '=':
                        # (?P=name): becomes a numeric back-reference
                        close = source.indexOf(')', pos)
                        if close is -1:
                            raise SyntaxError('Named group back-reference not closed, expecting a )')
                        name = source[pos+1:close]
                        if not isNaN(parseInt(name, 10)):
                            ans += '\\' + name
                        else:
                            if not Object.prototype.hasOwnProperty.call(group_map, name):
                                raise SyntaxError('Invalid back-reference. The named group: ' + name + ' has not yet been defined.')
                            ans += '\\' + group_map[name][-1]
                        pos = close + 1
                        continue
                    else:
                        raise SyntaxError('Expecting < or = after (?P')
            else:
                group_count += 1
        elif ch is '.' and (flags & DOTALL):
            ans += r'[\s\S]'  # JavaScript has no DOTALL
            continue

        ans += ch

    if previous_backslash:
        # Keep a trailing backslash so that RegExp() reports the malformed
        # pattern instead of it being silently dropped
        ans += '\\'

    return ans, flags, group_map


class MatchObject:
    # Wraps the result of RegExp.exec() with the interface of a Python match
    # object.

    def __init__(self, regex, match, pos, endpos):
        # regex  - the RegexObject that produced the match
        # match  - the array returned by RegExp.exec()
        # pos, endpos - the values the search was invoked with
        self.re = regex
        self.string = match.input
        self._start_pos = match.index
        self._groups = match
        self.pos, self.endpos = pos, endpos

    def _compute_extents(self):
        # compute start/end for each group
        # JavaScript only reports the position of the whole match, so the
        # position of every group is found by searching for its text inside
        # the text of the whole match, left to right.
        match = self._groups
        self._start = v'Array(match.length)'
        self._end = v'Array(match.length)'
        self._start[0] = self._start_pos
        self._end[0] = self._start_pos + match[0].length
        offset = self._start_pos
        extent = match[0]
        loc = 0
        for v'var i = 1; i < match.length; i++':
            g = match[i]  # noqa:undef
            if g is undefined:
                # The group did not take part in the match. Searching for it
                # would look for the text "undefined", so report -1 directly.
                self._start[i] = self._end[i] = -1
                continue
            loc = extent.indexOf(g, loc)
            if loc is -1:
                self._start[i] = self._start[i-1]
                self._end[i] = self._end[i-1]
            else:
                self._start[i] = offset + loc
                loc += g.length
                self._end[i] = offset + loc  # noqa:undef

    def groups(self, defval=None):
        # Return the text of all groups (excluding group 0), with `defval`
        # in place of groups that did not participate in the match
        ans = v'[]'
        for v'var i = 1; i < self._groups.length; i++':
            val = self._groups[i]  # noqa:undef
            if val is undefined:
                val = defval
            ans.push(val)
        return ans

    def _group_number(self, g):
        # Convert a group name to its (last declared) group number; numbers
        # and unknown names are returned unchanged
        if type(g) is 'number':
            return g
        if Object.prototype.hasOwnProperty.call(self.re.group_name_map, g):
            return self.re.group_name_map[g][-1]
        return g

    def _group_val(self, q, defval):
        # Return the text of the group identified by number or name `q`, or
        # `defval` when there is no such group or it did not match
        val = undefined
        if type(q) is 'number' and -1 < q < self._groups.length:
            val = self._groups[q]
        else:
            if Object.prototype.hasOwnProperty.call(self.re.group_name_map, q):
                val = self._groups[self.re.group_name_map[q][-1]]
        if val is undefined:
            val = defval
        return val

    def group(self):
        # group()        -> the whole match
        # group(g)       -> the text of group g
        # group(g1, g2…) -> a list with the text of every requested group
        if arguments.length is 0:
            return self._groups[0]
        ans = v'[]'
        for v'var i = 0; i < arguments.length; i++':
            q = arguments[i]  # noqa:undef
            ans.push(self._group_val(q, None))
        return ans[0] if ans.length is 1 else ans

    def start(self, g):
        # Start index of group `g` (default: the whole match), -1 if unknown
        if self._start is undefined:
            self._compute_extents()
        val = self._start[self._group_number(g or 0)]
        if val is undefined:
            val = -1
        return val

    def end(self, g):
        # End index of group `g` (default: the whole match), -1 if unknown
        if self._end is undefined:
            self._compute_extents()
        val = self._end[self._group_number(g or 0)]
        if val is undefined:
            val = -1
        return val

    def span(self, g):
        # [start(g), end(g)]
        return [self.start(g), self.end(g)]

    def expand(self, repl):
        # Expand the template `repl` using the groups of this match.
        # _expand() takes the groups first and the template second.
        return _expand(this._groups, repl, this.re.group_name_map)

    def groupdict(self, defval=None):
        # Return a map of group name -> matched text for all named groups,
        # with `defval` for groups that did not participate in the match
        gnm = self.re.group_name_map
        names = Object.keys(gnm)
        ans = {}
        for v"var i = 0; i < names.length; i++":
            name = names[i]  # noqa:undef
            if Object.prototype.hasOwnProperty.call(gnm, name):
                val = self._groups[gnm[name][-1]]
                if val is undefined:
                    val = defval
                ans[name] = val
        return ans

    def captures(self, group_name):
        # Return the text of every group declared under `group_name` that
        # participated in the match (empty list for an unknown name)
        ans = []
        if not Object.prototype.hasOwnProperty.call(self.re.group_name_map, group_name):
            return ans
        groups = self.re.group_name_map[group_name]
        for v'var i = 0; i < groups.length; i++':
            val = self._groups[groups[i]]  # noqa:undef
            if val is not undefined:
                ans.push(val)
        return ans

    def capturesdict(self):
        # Return a map of group name -> captures(name) for all named groups
        gnm = self.re.group_name_map
        names = Object.keys(gnm)
        ans = {}
        for v'var i = 0; i < names.length; i++':
            name = names[i]  # noqa:undef
            ans[name] = self.captures(name)
        return ans


class RegexObject:
    # A compiled regular expression, with the interface of a Python pattern
    # object. The underlying RegExp is always created with the `g` flag, so
    # its lastIndex property is mutable state that every method must set
    # before use.

    def __init__(self, pattern, flags):
        # pattern - a Python style pattern string, or a RegExp whose source
        #           is used
        # flags   - a combination of the module level flag constants
        self.pattern = pattern.source if isinstance(pattern, RegExp) else pattern
        self.js_pattern, self.flags, self.group_name_map = transform_regex(self.pattern, flags)

        modifiers = ''
        if self.flags & IGNORECASE:
            modifiers += 'i'
        if self.flags & MULTILINE:
            modifiers += 'm'
        if not (self.flags & ASCII) and supports_unicode:
            modifiers += 'u'
        self._modifiers = modifiers + 'g'
        self._pattern = RegExp(self.js_pattern, self._modifiers)

    def _do_search(self, pat, string, pos, endpos):
        # Return a MatchObject for the first match of `pat` in `string` that
        # starts at or after `pos`, or None. `pat` must have the `g` flag.
        if endpos is not None:
            string = string[:endpos]
        # Starting the scan at pos (rather than at 0 and skipping earlier
        # matches) cannot loop on empty matches and also finds matches that
        # overlap an earlier one.
        pat.lastIndex = Math.max(0, pos or 0)
        n = pat.exec(string)
        if n is None:
            return None
        return MatchObject(self, n, pos, endpos)

    def search(self, string, pos=0, endpos=None):
        # Find the first match anywhere in string[:endpos] at or after pos
        return self._do_search(self._pattern, string, pos, endpos)

    def match(self, string, pos=0, endpos=None):
        # Like search(), but with the pattern anchored by a leading ^
        return self._do_search(RegExp('^' + self.js_pattern, self._modifiers), string, pos, endpos)

    def split(self, string, maxsplit=0):
        # Split `string` around the matches of the pattern.
        # NOTE(review): maxsplit is passed to String.prototype.split as its
        # `limit`, which truncates the result to that many items instead of
        # keeping the unsplit remainder as Python does - confirm whether
        # callers rely on this.
        self._pattern.lastIndex = 0
        return string.split(self._pattern, maxsplit or undefined)

    def findall(self, string):
        # Return the list of all matches in `string`.
        # NOTE(review): String.prototype.match with a global RegExp yields
        # whole matches only, not the groups.
        self._pattern.lastIndex = 0
        return ρσ_list_decorate(string.match(self._pattern) or v'[]')

    def finditer(self, string):
        # Return an iterator over MatchObjects for all matches in `string`
        pat = RegExp(this._pattern.source, this._modifiers)  # We have to do this since lastIndex is mutable
        return {
            '_string':string,
            '_r': pat,
            '_self': self,
            ρσ_iterator_symbol: def (): return this;,
            'next': def ():
                m = this._r.exec(this._string)
                if m is None:
                    return {'done':True}
                if m[0].length is 0:
                    # exec() does not advance past an empty match, step over
                    # it or the same match would be returned forever
                    this._r.lastIndex += 1
                return {'done':False, 'value':MatchObject(this._self, m, 0, None)}
        }

    def subn(self, repl, string, count=0):
        # Replace matches of the pattern in `string`.
        #
        # repl  - a template string (see _expand) or a function that is
        #         called with a MatchObject and returns the replacement
        # count - maximum number of replacements, values < 1 mean all
        #
        # Returns [new_string, number_of_replacements].
        expand = _expand
        if type(repl) is 'function':
            expand = def(m, repl, gnm): return '' + repl(MatchObject(self, m, 0, None))
        this._pattern.lastIndex = 0
        num = 0
        matches = v'[]'

        while count < 1 or num < count:
            m = this._pattern.exec(string)
            if m is None:
                break
            if m[0].length is 0:
                # exec() does not advance past an empty match, step over it
                # or this loop would never terminate
                this._pattern.lastIndex += 1
            matches.push(m)
            num += 1

        # Replace from the end, so that the indices of earlier matches stay valid
        for v'var i = matches.length - 1; i > -1; i--':
            m = matches[i]  # noqa:undef
            start = m.index
            end = start + m[0].length
            string = string[:start] + expand(m, repl, self.group_name_map) + string[end:]
        return string, matches.length

    def sub(self, repl, string, count=0):
        # Like subn(), but returns only the new string
        return self.subn(repl, string, count)[0]


def _get_from_cache(pattern, flags):
    # Return the RegexObject for (pattern, flags), compiling it when it is
    # not cached. Once 100 entries are cached, the oldest one is evicted
    # before a new one is added.
    if isinstance(pattern, RegExp):
        pattern = pattern.source
    key = JSON.stringify(v'[pattern, flags]')
    if Object.prototype.hasOwnProperty.call(_re_cache_map, key):
        return _re_cache_map[key]
    if _re_cache_items.length >= 100:
        v'delete _re_cache_map[_re_cache_items.shift()]'
    ans = RegexObject(pattern, flags)
    _re_cache_map[key] = ans
    _re_cache_items.push(key)
    return ans


# Module level convenience functions. Each one looks up (or compiles and
# caches) the RegexObject for the pattern and delegates to the method of the
# same name.

def compile(pattern, flags=0):
    return _get_from_cache(pattern, flags)


def search(pattern, string, flags=0):
    return _get_from_cache(pattern, flags).search(string)


def match(pattern, string, flags=0):
    return _get_from_cache(pattern, flags).match(string)


def split(pattern, string, maxsplit=0, flags=0):
    # maxsplit is forwarded, so that this behaves exactly like
    # compile(pattern, flags).split(string, maxsplit)
    return _get_from_cache(pattern, flags).split(string, maxsplit)


def findall(pattern, string, flags=0):
    return _get_from_cache(pattern, flags).findall(string)


def finditer(pattern, string, flags=0):
    return _get_from_cache(pattern, flags).finditer(string)


def sub(pattern, repl, string, count=0, flags=0):
    return _get_from_cache(pattern, flags).sub(repl, string, count)


def subn(pattern, repl, string, count=0, flags=0):
    return _get_from_cache(pattern, flags).subn(repl, string, count)


def escape(string):
    # Backslash-escape every character of `string` that is matched by
    # _RE_ESCAPE
    return string.replace(_RE_ESCAPE, '\\$&')


def purge():
    # Empty the cache of compiled patterns
    nonlocal _re_cache_map, _re_cache_items
    _re_cache_map = {}
    _re_cache_items = v'[]'