mirror of
https://github.com/benbusby/whoogle-search.git
synced 2026-05-30 19:35:21 -04:00
Enhance search function to extract title and content separately, maintaining backward compatibility in JSON response.
This commit is contained in:
+1
-1
@@ -100,4 +100,4 @@ EXPOSE $EXPOSE_PORT
|
||||
HEALTHCHECK --interval=30s --timeout=5s \
|
||||
CMD curl -f http://localhost:${EXPOSE_PORT}/healthz || exit 1
|
||||
|
||||
CMD misc/tor/start-tor.sh & ./run
|
||||
CMD ["/bin/sh", "-c", "misc/tor/start-tor.sh & ./run"]
|
||||
|
||||
+48
-2
@@ -432,9 +432,49 @@ def search():
|
||||
text = div.get_text(separator=' ', strip=True)
|
||||
if not text:
|
||||
continue
|
||||
|
||||
# Extract title and content separately
|
||||
# Title is typically in an h3 tag or the main link text
|
||||
title = ''
|
||||
h3_tag = div.find('h3')
|
||||
if h3_tag:
|
||||
title = h3_tag.get_text(strip=True)
|
||||
elif link:
|
||||
title = link.get_text(strip=True)
|
||||
|
||||
# Content is the result's snippet/description; if none is found, fall back to the full text with the title removed
|
||||
# Look for description/snippet elements
|
||||
content = ''
|
||||
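# Common classes for snippets/descriptions in Google results (NOTE(review): these dicts are passed positionally as `attrs`, where the key is matched as a literal attribute name — the key likely needs to be 'class' rather than 'class_'; confirm against the Beautiful Soup docs)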
|
||||
snippet_selectors = [
|
||||
{'class_': ['VwiC3b']}, # Standard snippet
|
||||
{'class_': ['s']}, # Alternative snippet class
|
||||
{'class_': ['st']}, # Another snippet class
|
||||
]
|
||||
|
||||
for selector in snippet_selectors:
|
||||
snippet_elem = div.find('div', selector) or div.find('span', selector)
|
||||
if snippet_elem:
|
||||
content = snippet_elem.get_text(separator=' ', strip=True)
|
||||
break
|
||||
|
||||
# If no specific content found, use text minus title as fallback
|
||||
if not content and title:
|
||||
# Try to extract content by removing title from full text
|
||||
if text.startswith(title):
|
||||
content = text[len(title):].strip()
|
||||
else:
|
||||
content = text
|
||||
elif not content:
|
||||
content = text
|
||||
|
||||
seen.add(href)
|
||||
results.append({'href': href, 'text': text})
|
||||
results.append({
|
||||
'href': href,
|
||||
'text': text,
|
||||
'title': title,
|
||||
'content': content
|
||||
})
|
||||
else:
|
||||
# Fallback: extract links directly if no result containers found
|
||||
for a in json_soup.find_all('a', href=True):
|
||||
@@ -447,7 +487,13 @@ def search():
|
||||
if not text:
|
||||
continue
|
||||
seen.add(href)
|
||||
results.append({'href': href, 'text': text})
|
||||
# In fallback mode, the link text serves as both title and text
|
||||
results.append({
|
||||
'href': href,
|
||||
'text': text,
|
||||
'title': text,
|
||||
'content': ''
|
||||
})
|
||||
|
||||
return jsonify({
|
||||
'query': urlparse.unquote(query),
|
||||
|
||||
@@ -41,6 +41,12 @@ def test_search_json_accept(client, stubbed_search_response):
|
||||
assert 'https://example.org/other' in hrefs
|
||||
# Relative href should be excluded
|
||||
assert not any(href.endswith('/relative') for href in hrefs)
|
||||
# Verify new fields are present while maintaining backward compatibility
|
||||
for result in data['results']:
|
||||
assert 'href' in result
|
||||
assert 'text' in result # Original field maintained
|
||||
assert 'title' in result # New field
|
||||
assert 'content' in result # New field
|
||||
|
||||
|
||||
def test_search_json_format_param(client, stubbed_search_response):
|
||||
|
||||
Reference in New Issue
Block a user