latimes: populate per-article description from og:description

LAT's section index pages attach a teaser to fewer than 10% of article tiles, so most TOC entries in the resulting EPUB show no description under the headline. This fix fetches each article page once during indexing and reads <meta property="og:description"> to populate art['description']. Results are cached per URL across the section walks, so an article that appears in several sections is fetched only once. Descriptions shorter than 20 characters are dropped, because LAT occasionally publishes placeholder text.
2026-05-31 02:55:19 -04:00 · 2026-05-14 00:16:13 -05:00
parent 4c04c56033
commit d5c66c2653
1 changed files with 24 additions and 0 deletions
@@ -116,5 +116,29 @@ class LATimes(BasicNewsRecipe):
            a.contents) == 1 and a.find(text=True, recursive=False)]
        articles = [
            {'title': self.tag_to_string(a), 'url': absurl(a['href'])} for a in alinks]
+        # Populate `description` from each article page's og:description meta tag.
+        cache = getattr(self, '_og_desc_cache', None)
+        if cache is None:
+            cache = self._og_desc_cache = {}
+        for art in articles:
+            url = art.get('url')
+            if not url:
+                continue
+            if url not in cache:
+                cache[url] = self._fetch_og_description(url)
+            desc = cache[url]
+            if desc and len(desc) >= 20:
+                art['description'] = desc
        self.log('Found: ', len(articles), ' articles.\n')
        return articles
+
+    def _fetch_og_description(self, url):
+        '''Return the stripped og:description of the page at url, or None.'''
+        try:
+            page = self.index_to_soup(url)
+        except Exception as err:
+            self.log.warn('LAT og:description fetch failed for', url, ':', err)
+            return None
+        tag = page.find('meta', attrs={'property': 'og:description'})
+        content = tag.get('content') if tag else None
+        return (content.strip() or None) if content else None