From c46ec6f9374a25aa650d9fd3a3d65fabb9f7eccc Mon Sep 17 00:00:00 2001 From: Don-Swanson <32144818+Don-Swanson@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:14:24 -0500 Subject: [PATCH] Add clean_text_spacing function to normalize text spacing in search results This new function addresses spacing issues in text extracted from HTML, ensuring proper formatting for titles and content in the search function. It replaces direct text extraction calls with clean_text_spacing to enhance the overall text quality and consistency. --- app/routes.py | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/app/routes.py b/app/routes.py index 7ac40a7..d68a90c 100644 --- a/app/routes.py +++ b/app/routes.py @@ -290,6 +290,36 @@ def autocomplete(): g.user_request.autocomplete(q) if (not g.user_config.tor and autocomplete_enabled) else [] ]) +def clean_text_spacing(text: str) -> str: + """Clean up text spacing issues from HTML extraction. + + Args: + text: Text extracted from HTML that may have spacing issues + + Returns: + Cleaned text with proper spacing; empty or None input is returned unchanged + """ + if not text: + return text + + # Collapse any run of whitespace (spaces, tabs, newlines) to a single space + text = re.sub(r'\s+', ' ', text) + + # Fix domain names: remove whitespace before a period followed by 2+ letters (treated as a domain extension) + # Examples: "weather .com" -> "weather.com", "example .org" -> "example.org" + text = re.sub(r'\s+\.([a-zA-Z]{2,})\b', r'.\1', text) + + # Fix www/http/https patterns + # Examples: "www .example" -> "www.example" + text = re.sub(r'\b(www|http|https)\s+\.', r'\1.', text) + + # Remove whitespace before commas, semicolons and colons + text = re.sub(r'\s+([,;:])', r'\1', text) + + # Strip leading/trailing whitespace + return text.strip() + + @app.route(f'/{Endpoint.search}', methods=['GET', 'POST']) @session_required @auth_required @@ -429,7 +459,7 @@ def search(): continue # Get all text from the result container, not just the link - text = div.get_text(separator=' ', strip=True) + text = 
clean_text_spacing(div.get_text(separator=' ', strip=True)) if not text: continue @@ -439,15 +469,15 @@ def search(): # First try h3 tag h3_tag = div.find('h3') if h3_tag: - title = h3_tag.get_text(strip=True) + title = clean_text_spacing(h3_tag.get_text(separator=' ', strip=True)) else: # Try CVA68e class (common title class in Google results) title_span = div.find('span', class_='CVA68e') if title_span: - title = title_span.get_text(strip=True) + title = clean_text_spacing(title_span.get_text(separator=' ', strip=True)) elif link: # Fallback to link text, but exclude URL breadcrumb - title = link.get_text(strip=True) + title = clean_text_spacing(link.get_text(separator=' ', strip=True)) # Content is the description/snippet text # Look for description/snippet elements @@ -464,7 +494,7 @@ def search(): snippet_elem = div.find('span', selector) or div.find('div', selector) if snippet_elem: # Get text but exclude any nested links (like "Related searches") - content = snippet_elem.get_text(separator=' ', strip=True) + content = clean_text_spacing(snippet_elem.get_text(separator=' ', strip=True)) # Only use if it's substantial content (not just the URL breadcrumb) if content and not content.startswith('www.') and '›' not in content: break @@ -496,7 +526,7 @@ def search(): continue if href in seen: continue - text = a.get_text(strip=True) + text = clean_text_spacing(a.get_text(separator=' ', strip=True)) if not text: continue seen.add(href)