From d5c66c2653182bdc6a593b3fe12d3316abd2ea82 Mon Sep 17 00:00:00 2001 From: claybdavis Date: Thu, 14 May 2026 00:16:13 -0500 Subject: [PATCH] latimes: populate per-article description from og:description LAT's section index pages only attach a teaser to <10% of article tiles, so most TOC entries in the resulting EPUB show no description under the headline. This fix fetches each article page once during indexing and reads its og:description meta tag to populate art['description']. Results are cached per URL across the section walks, so duplicate links are fetched only once. Descriptions shorter than 20 chars are dropped (LAT occasionally publishes placeholder text). --- recipes/latimes.recipe | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/recipes/latimes.recipe b/recipes/latimes.recipe index 3dcc727d6e..08a570ab1d 100644 --- a/recipes/latimes.recipe +++ b/recipes/latimes.recipe @@ -116,5 +116,29 @@ class LATimes(BasicNewsRecipe): a.contents) == 1 and a.find(text=True, recursive=False)] articles = [ {'title': self.tag_to_string(a), 'url': absurl(a['href'])} for a in alinks] + # Populate `description` from each article page's og:description meta tag + cache = getattr(self, '_og_desc_cache', None) + if cache is None: + cache = self._og_desc_cache = {} + for art in articles: + url = art.get('url') + if not url: + continue + if url not in cache: + cache[url] = self._fetch_og_description(url) + desc = cache[url] + if desc and len(desc) >= 20: + art['description'] = desc self.log('Found: ', len(articles), ' articles.\n') return articles + + def _fetch_og_description(self, url): + try: + soup = self.index_to_soup(url) + except Exception as e: + self.log.warn('LAT og:description fetch failed for', url, ':', e) + return None + meta = soup.find('meta', attrs={'property': 'og:description'}) + if meta and meta.get('content'): + return meta['content'].strip() or None + return None