Enhance the search function to extract each result's title and content separately, adding `title` and `content` fields to the JSON response while keeping the original `text` field for backward compatibility.

This commit is contained in:
Don-Swanson
2025-10-01 19:18:22 -05:00
parent e0a4a5f2cb
commit bb3347f7ff
3 changed files with 55 additions and 3 deletions
+1 -1
View File
@@ -100,4 +100,4 @@ EXPOSE $EXPOSE_PORT
HEALTHCHECK --interval=30s --timeout=5s \
CMD curl -f http://localhost:${EXPOSE_PORT}/healthz || exit 1
CMD misc/tor/start-tor.sh & ./run
CMD ["/bin/sh", "-c", "misc/tor/start-tor.sh & ./run"]
+48 -2
View File
@@ -432,9 +432,49 @@ def search():
text = div.get_text(separator=' ', strip=True)
if not text:
continue
# Extract title and content separately
# Title is typically in an h3 tag or the main link text
title = ''
h3_tag = div.find('h3')
if h3_tag:
title = h3_tag.get_text(strip=True)
elif link:
title = link.get_text(strip=True)
# Content is the remaining text after removing the title
# Look for description/snippet elements
content = ''
# Common classes for snippets/descriptions in Google results
snippet_selectors = [
{'class_': ['VwiC3b']}, # Standard snippet
{'class_': ['s']}, # Alternative snippet class
{'class_': ['st']}, # Another snippet class
]
for selector in snippet_selectors:
snippet_elem = div.find('div', selector) or div.find('span', selector)
if snippet_elem:
content = snippet_elem.get_text(separator=' ', strip=True)
break
# If no specific content found, use text minus title as fallback
if not content and title:
# Try to extract content by removing title from full text
if text.startswith(title):
content = text[len(title):].strip()
else:
content = text
elif not content:
content = text
seen.add(href)
results.append({'href': href, 'text': text})
results.append({
'href': href,
'text': text,
'title': title,
'content': content
})
else:
# Fallback: extract links directly if no result containers found
for a in json_soup.find_all('a', href=True):
@@ -447,7 +487,13 @@ def search():
if not text:
continue
seen.add(href)
results.append({'href': href, 'text': text})
# In fallback mode, the link text serves as both title and text
results.append({
'href': href,
'text': text,
'title': text,
'content': ''
})
return jsonify({
'query': urlparse.unquote(query),
+6
View File
@@ -41,6 +41,12 @@ def test_search_json_accept(client, stubbed_search_response):
assert 'https://example.org/other' in hrefs
# Relative href should be excluded
assert not any(href.endswith('/relative') for href in hrefs)
# Verify new fields are present while maintaining backward compatibility
for result in data['results']:
assert 'href' in result
assert 'text' in result # Original field maintained
assert 'title' in result # New field
assert 'content' in result # New field
def test_search_json_format_param(client, stubbed_search_response):