mirror of
				https://github.com/searxng/searxng.git
				synced 2025-11-04 03:27:06 -05:00 
			
		
		
		
	
		
			
				
	
	
		
			146 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			146 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
'''
 | 
						|
searx is free software: you can redistribute it and/or modify
 | 
						|
it under the terms of the GNU Affero General Public License as published by
 | 
						|
the Free Software Foundation, either version 3 of the License, or
 | 
						|
(at your option) any later version.
 | 
						|
 | 
						|
searx is distributed in the hope that it will be useful,
 | 
						|
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
GNU Affero General Public License for more details.
 | 
						|
 | 
						|
You should have received a copy of the GNU Affero General Public License
 | 
						|
along with searx. If not, see < http://www.gnu.org/licenses/ >.
 | 
						|
 | 
						|
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
 | 
						|
'''
 | 
						|
 | 
						|
import re
 | 
						|
from lxml import etree
 | 
						|
from os import listdir
 | 
						|
from os.path import isfile, isdir, join
 | 
						|
 | 
						|
 | 
						|
# https://gitweb.torproject.org/\
 | 
						|
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
 | 
						|
 | 
						|
# HTTPS rewrite rules
 | 
						|
# Module-level registry filled by load_https_rules(); each entry is the
# (target_hosts, rules, exclusions) tuple returned by
# load_single_https_ruleset() for one XML file.
https_rules = []
 | 
						|
 | 
						|
 | 
						|
# load a single ruleset from an XML file
 | 
						|
def load_single_https_ruleset(filepath):
    """Load one HTTPS-Everywhere style ruleset from an XML file.

    The file is expected to have a ``ruleset`` root element whose children
    are ``target`` (attribute ``host``), ``rule`` (attributes ``from`` and
    ``to``) and ``exclusion`` (attribute ``pattern``) elements.  Children
    with any other tag, or with a missing/empty required attribute, are
    ignored.

    Returns a tuple ``(target_hosts, rules, exclusions)``:

    * ``target_hosts`` -- compiled, case-insensitive regex matching any of
      the target hosts at the start of a string (``*`` in a host is a
      wildcard),
    * ``rules`` -- list of ``(rule_from, rule_to)`` string pairs in which
      JavaScript group references (dollar sign + digit) have been converted
      to Python group references (backslash + digit); the strings are NOT
      compiled here,
    * ``exclusions`` -- list of compiled exclusion regexes.

    An empty tuple is returned when the file cannot be read or parsed, the
    root element is not ``ruleset``, the ruleset is flagged ``default_off``
    or ``platform``, it declares no usable target host, or a host/exclusion
    pattern is not a valid regular expression.
    """
    # load and parse xml-file
    try:
        tree = etree.parse(filepath, etree.XMLParser())
    except Exception:
        # Best effort: an unreadable or malformed file is skipped.  Unlike a
        # bare ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        # TODO, error message
        return ()

    root = tree.getroot()

    # check if root is a node with the name ruleset
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # rulesets deactivated by default or limited to specific platforms
    # are not loaded
    if root.attrib.get('default_off') or root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse children of the ruleset
    for child in root:
        if child.tag == 'target':
            host = child.attrib.get('host')
            if not host:
                continue

            # convert host-rule to valid regex: dots are literal,
            # '*' is a wildcard
            hosts.append(host.replace('.', r'\.').replace('*', '.*'))

        elif child.tag == 'rule':
            rule_from = child.attrib.get('from')
            rule_to = child.attrib.get('to')
            if not rule_from or not rule_to:
                continue

            # Convert javascript group references ($1) into python ones (\1).
            # Only a '$' followed by a digit is a group reference; any other
            # '$' (typically an end-of-string anchor) must be kept, otherwise
            # the pattern would end in a dangling backslash and not compile.
            rule_from = re.sub(r'\$(\d)', r'\\\1', rule_from)
            rule_to = re.sub(r'\$(\d)', r'\\\1', rule_to)

            # TODO, rule_from is compiled by the consumer (webapp.py),
            # not here
            rules.append((rule_from, rule_to))

        elif child.tag == 'exclusion':
            pattern = child.attrib.get('pattern')
            if not pattern:
                continue

            try:
                exclusions.append(re.compile(pattern))
            except re.error:
                # Dropping only the broken exclusion would let the rules
                # rewrite URLs the author wanted to exclude, so the whole
                # ruleset is discarded instead.
                return ()

    # without any target the regex below would be '^()' and match every URL
    if not hosts:
        return ()

    # convert list of possible hosts to a simple regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except re.error:
        return ()

    return (target_hosts, rules, exclusions)
 | 
						|
 | 
						|
 | 
						|
# load all https rewrite rules
 | 
						|
def load_https_rules(rules_path):
    """Load every ``*.xml`` ruleset found directly inside *rules_path*.

    Each file is handed to ``load_single_https_ruleset``; every non-empty
    result is appended to the module-level ``https_rules`` list.  A summary
    line with the current size of ``https_rules`` is printed afterwards.

    If *rules_path* is not a directory an error line is printed and the
    function returns without loading anything.
    """
    # nothing to do when the rule directory is missing
    if not isdir(rules_path):
        print("[E] directory not found: '" + rules_path + "'")
        return

    # collect the regular files in the directory whose name ends in '.xml'
    candidates = []
    for entry in listdir(rules_path):
        full_path = join(rules_path, entry)
        if isfile(full_path) and entry.endswith('.xml'):
            candidates.append(full_path)

    # parse each candidate, keeping only the rulesets that loaded
    for xml_path in candidates:
        parsed = load_single_https_ruleset(xml_path)
        if parsed:
            https_rules.append(parsed)

    print(' * {n} https-rules loaded'.format(n=len(https_rules)))
 |