latimes: populate per-article description from og:description

LAT's section index pages only attach a teaser to <10% of article tiles, so most TOC entries in the resulting EPUB show no description under the headline.

This fix fetches each article page once during indexing and reads <meta property="og:description"> to populate art['description'].
Results are cached per URL across the section walks, so duplicate URLs are fetched only once. Descriptions shorter than 20 characters are dropped (LAT occasionally publishes placeholder text).
This commit is contained in:
claybdavis
2026-05-14 00:16:13 -05:00
parent 4c04c56033
commit d5c66c2653
+24
View File
@@ -116,5 +116,29 @@ class LATimes(BasicNewsRecipe):
a.contents) == 1 and a.find(text=True, recursive=False)]
articles = [
{'title': self.tag_to_string(a), 'url': absurl(a['href'])} for a in alinks]
# Populate `description` from each article page's og:description meta tag.
cache = getattr(self, '_og_desc_cache', None)
if cache is None:
cache = self._og_desc_cache = {}
for art in articles:
url = art.get('url')
if not url:
continue
if url not in cache:
cache[url] = self._fetch_og_description(url)
desc = cache[url]
if desc and len(desc) >= 20:
art['description'] = desc
self.log('Found: ', len(articles), ' articles.\n')
return articles
def _fetch_og_description(self, url):
    """Return the og:description text of the article page at ``url``.

    The page is loaded with ``self.index_to_soup``. Any exception raised by
    that call is reported through ``self.log.warn`` and ``None`` is
    returned, so a single failing page does not raise into the caller.

    Returns the whitespace-stripped ``content`` attribute of the
    ``<meta property="og:description">`` tag located by ``find``, or
    ``None`` when the tag is missing, has no ``content``, or the content
    is blank after stripping.
    """
    try:
        page = self.index_to_soup(url)
    except Exception as err:
        # Best-effort lookup: log the failure and carry on without a teaser.
        self.log.warn('LAT og:description fetch failed for', url, ':', err)
        return None

    tag = page.find('meta', attrs={'property': 'og:description'})
    if not tag:
        return None

    raw = tag.get('content')
    if not raw:
        return None

    # A whitespace-only value is treated the same as a missing one.
    text = raw.strip()
    return text if text else None