From bb3347f7ffa52e11cd199649be50a52fcdd8456c Mon Sep 17 00:00:00 2001
From: Don-Swanson <32144818+Don-Swanson@users.noreply.github.com>
Date: Wed, 1 Oct 2025 19:18:22 -0500
Subject: [PATCH] Enhance search function to extract title and content
 separately, maintaining backward compatibility in JSON response.

---
 Dockerfile        |  2 +-
 app/routes.py     | 50 +++++++++++++++++++++++++++++++++++++++++++++--
 test/test_json.py |  6 ++++++
 3 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8184d83..5361a1a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -100,4 +100,4 @@ EXPOSE $EXPOSE_PORT
 HEALTHCHECK --interval=30s --timeout=5s \
   CMD curl -f http://localhost:${EXPOSE_PORT}/healthz || exit 1
 
-CMD misc/tor/start-tor.sh & ./run
+CMD ["/bin/sh", "-c", "misc/tor/start-tor.sh & ./run"]
diff --git a/app/routes.py b/app/routes.py
index b63806e..7118a25 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -432,9 +432,49 @@ def search():
                 text = div.get_text(separator=' ', strip=True)
                 if not text:
                     continue
+                
+                # Extract title and content separately
+                # Title is typically in an h3 tag or the main link text
+                title = ''
+                h3_tag = div.find('h3')
+                if h3_tag:
+                    title = h3_tag.get_text(strip=True)
+                elif link:
+                    title = link.get_text(strip=True)
+                
+                # Content comes from a dedicated description/snippet element when
+                # one is found; otherwise it falls back to the text minus the title
+                content = ''
+                # Common snippet/description classes in Google results. NOTE(review): passed as attrs, the key is likely 'class', not 'class_' - confirm
+                snippet_selectors = [
+                    {'class_': ['VwiC3b']},  # Standard snippet
+                    {'class_': ['s']},        # Alternative snippet class
+                    {'class_': ['st']},       # Another snippet class
+                ]
+                
+                for selector in snippet_selectors:
+                    snippet_elem = div.find('div', selector) or div.find('span', selector)
+                    if snippet_elem:
+                        content = snippet_elem.get_text(separator=' ', strip=True)
+                        break
+                
+                # If no specific content found, use text minus title as fallback
+                if not content and title:
+                    # Try to extract content by removing title from full text
+                    if text.startswith(title):
+                        content = text[len(title):].strip()
+                    else:
+                        content = text
+                elif not content:
+                    content = text
                     
                 seen.add(href)
-                results.append({'href': href, 'text': text})
+                results.append({
+                    'href': href,
+                    'text': text,
+                    'title': title,
+                    'content': content
+                })
         else:
             # Fallback: extract links directly if no result containers found
             for a in json_soup.find_all('a', href=True):
@@ -447,7 +487,13 @@ def search():
                 if not text:
                     continue
                 seen.add(href)
-                results.append({'href': href, 'text': text})
+                # In fallback mode, the link text serves as both title and text
+                results.append({
+                    'href': href,
+                    'text': text,
+                    'title': text,
+                    'content': ''
+                })
 
         return jsonify({
             'query': urlparse.unquote(query),
diff --git a/test/test_json.py b/test/test_json.py
index 6ad84b4..df4cb16 100644
--- a/test/test_json.py
+++ b/test/test_json.py
@@ -41,6 +41,12 @@ def test_search_json_accept(client, stubbed_search_response):
     assert 'https://example.org/other' in hrefs
     # Relative href should be excluded
     assert not any(href.endswith('/relative') for href in hrefs)
+    # Verify new fields are present while maintaining backward compatibility
+    for result in data['results']:
+        assert 'href' in result
+        assert 'text' in result  # Original field maintained
+        assert 'title' in result  # New field
+        assert 'content' in result  # New field
 
 
 def test_search_json_format_param(client, stubbed_search_response):