mirror of
				https://github.com/searxng/searxng.git
				synced 2025-11-04 03:27:06 -05:00 
			
		
		
		
	fix Wikipedia's paragraph extraction
This commit is contained in:
		
							parent
							
								
									6d18769ccf
								
							
						
					
					
						commit
						77b9faa8df
					
				@ -49,29 +49,6 @@ def request(query, params):
 | 
				
			|||||||
    return params
 | 
					    return params
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# get first meaningful paragraph
 | 
					 | 
				
			||||||
# this should filter out disambiguation pages and notes above first paragraph
 | 
					 | 
				
			||||||
# "magic numbers" were obtained by fine tuning
 | 
					 | 
				
			||||||
def extract_first_paragraph(content, title, image):
    """Return the first meaningful paragraph of *content*, or ``None``.

    A paragraph (one ``'\\n'``-separated line of *content*) is accepted when
    either

    * it is at least 200 characters long, or
    * it mentions *title* (case-insensitively) near its beginning and, in
      addition, the page has an *image* or the paragraph is at least 150
      characters long.

    Only the first four paragraphs are examined; if none of them qualifies
    the function gives up and returns ``None``.

    :param content: plain-text extract of the page; may be ``None`` or empty
    :param title: page title searched for at the start of each paragraph
    :param image: truthy when the page has an image
    :returns: the accepted paragraph, or ``None`` when nothing qualifies
    """
    # A missing or empty extract has no paragraph to offer; bail out instead
    # of raising AttributeError on ``None.split``.
    if not content:
        return None

    # Loop invariants: the lowered title and the end of the search window.
    # The title has to fit entirely inside the first ``len(title) + 35``
    # characters, i.e. it may start at most 35 characters into the paragraph.
    title_lower = title.lower()
    title_window = len(title) + 35

    # Giving up after four rejected paragraphs is the same as looking at the
    # first four paragraphs only.
    for paragraph in content.split('\n')[:4]:
        length = len(paragraph)

        # Long paragraphs are accepted unconditionally.
        if length >= 200:
            return paragraph

        mentions_title = paragraph.lower().find(title_lower, 0, title_window) >= 0
        if mentions_title and (image or length >= 150):
            return paragraph

    return None
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# get response from search-request
 | 
					# get response from search-request
 | 
				
			||||||
def response(resp):
 | 
					def response(resp):
 | 
				
			||||||
    results = []
 | 
					    results = []
 | 
				
			||||||
@ -97,10 +74,7 @@ def response(resp):
 | 
				
			|||||||
    if image:
 | 
					    if image:
 | 
				
			||||||
        image = image.get('source')
 | 
					        image = image.get('source')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    extract = page.get('extract')
 | 
					    summary = page.get('extract', '').split('\n')[0].replace('()', '')
 | 
				
			||||||
 | 
					 | 
				
			||||||
    summary = extract_first_paragraph(extract, title, image)
 | 
					 | 
				
			||||||
    summary = summary.replace('() ', '')
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # link to wikipedia article
 | 
					    # link to wikipedia article
 | 
				
			||||||
    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
 | 
					    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user