From bb3347f7ffa52e11cd199649be50a52fcdd8456c Mon Sep 17 00:00:00 2001 From: Don-Swanson <32144818+Don-Swanson@users.noreply.github.com> Date: Wed, 1 Oct 2025 19:18:22 -0500 Subject: [PATCH] Enhance search function to extract title and content separately, maintaining backward compatibility in JSON response. --- Dockerfile | 2 +- app/routes.py | 50 +++++++++++++++++++++++++++++++++++++++++++++-- test/test_json.py | 6 ++++++ 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8184d83..5361a1a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -100,4 +100,4 @@ EXPOSE $EXPOSE_PORT HEALTHCHECK --interval=30s --timeout=5s \ CMD curl -f http://localhost:${EXPOSE_PORT}/healthz || exit 1 -CMD misc/tor/start-tor.sh & ./run +CMD ["/bin/sh", "-c", "misc/tor/start-tor.sh & ./run"] diff --git a/app/routes.py b/app/routes.py index b63806e..7118a25 100644 --- a/app/routes.py +++ b/app/routes.py @@ -432,9 +432,49 @@ def search(): text = div.get_text(separator=' ', strip=True) if not text: continue + + # Extract title and content separately (same separator as `text` so the prefix check below can match) + # Title is typically in an h3 tag or the main link text + title = '' + h3_tag = div.find('h3') + if h3_tag: + title = h3_tag.get_text(separator=' ', strip=True) + elif link: + title = link.get_text(separator=' ', strip=True) + + # Content is the remaining text after removing the title + # Look for description/snippet elements + content = '' + # Keyword filters (expanded into find() below) for common Google snippet classes + snippet_selectors = [ + {'class_': ['VwiC3b']}, # Standard snippet + {'class_': ['s']}, # Alternative snippet class + {'class_': ['st']}, # Another snippet class + ] + + for selector in snippet_selectors: + snippet_elem = div.find('div', **selector) or div.find('span', **selector) + if snippet_elem: + content = snippet_elem.get_text(separator=' ', strip=True) + break + + # If no specific content found, use text minus title as fallback + if not content and title: + # Try to extract content by removing title from full text + if 
text.startswith(title): + content = text[len(title):].strip() + else: + content = text + elif not content: + content = text seen.add(href) - results.append({'href': href, 'text': text}) + results.append({ + 'href': href, + 'text': text, + 'title': title, + 'content': content + }) else: # Fallback: extract links directly if no result containers found for a in json_soup.find_all('a', href=True): @@ -447,7 +487,13 @@ def search(): if not text: continue seen.add(href) - results.append({'href': href, 'text': text}) + # In fallback mode, the link text serves as both title and text + results.append({ + 'href': href, + 'text': text, + 'title': text, + 'content': '' + }) return jsonify({ 'query': urlparse.unquote(query), diff --git a/test/test_json.py b/test/test_json.py index 6ad84b4..df4cb16 100644 --- a/test/test_json.py +++ b/test/test_json.py @@ -41,6 +41,12 @@ def test_search_json_accept(client, stubbed_search_response): assert 'https://example.org/other' in hrefs # Relative href should be excluded assert not any(href.endswith('/relative') for href in hrefs) + # Verify new fields are present while maintaining backward compatibility + for result in data['results']: + assert 'href' in result + assert 'text' in result # Original field maintained + assert 'title' in result # New field + assert 'content' in result # New field def test_search_json_format_param(client, stubbed_search_response):