Extract article metadata from the article itself

This commit is contained in:
Henrik Holm 2025-05-16 18:33:56 +02:00
parent 84772e8b14
commit 8abd9b706e
No known key found for this signature in database

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
# vim:fileencoding=utf-8 # vim:fileencoding=utf-8
from datetime import datetime, timezone import time
from datetime import datetime, timedelta
from mechanize import Request from mechanize import Request
@ -25,15 +26,21 @@ class Fokus(BasicNewsRecipe):
compress_news_images = True compress_news_images = True
needs_subscription = 'optional' needs_subscription = 'optional'
oldest_article = 7 # days oldest_article = 7 # days
max_articles_per_feed = 15
use_embedded_content = False use_embedded_content = False
remove_empty_feeds = True remove_empty_feeds = True
scale_news_images_to_device = True scale_news_images_to_device = True
scale_news_images = (800, 600) scale_news_images = (800, 600)
delay = 3 # Avoid throttling by the server.
# Center and reduce the size of images and image captions. # 1. Center and reduce the size of images and image captions.
# 2. Make the lead text italic.
# 3. Make the article metadata text gray and small.
extra_css = ''' extra_css = '''
img { display: block; margin: auto; width: 50%; height: auto } img { display: block; margin: auto; width: 50%; height: auto; }
div.calibre-nuked-tag-figure { font-size: small; text-align: center; } div.calibre-nuked-tag-figure { font-size: small; text-align: center; }
p.Single__lead, p.Longread__lead { font-style: italic; color:#202020; }
p.article-metadata { color: gray; font-size:small; }
''' '''
keep_only_tags = [ keep_only_tags = [
@ -41,8 +48,11 @@ class Fokus(BasicNewsRecipe):
dict(name='h1', class_='Longread__title'), # Title of "Longread" type articles. dict(name='h1', class_='Longread__title'), # Title of "Longread" type articles.
dict(name='p', class_='Single__lead'), # Lead text of "Single" type articles. dict(name='p', class_='Single__lead'), # Lead text of "Single" type articles.
dict(name='p', class_='Longread__lead'), # Lead text of "Longread" type articles. dict(name='p', class_='Longread__lead'), # Lead text of "Longread" type articles.
dict(name='p', class_='article-metadata'), # Dynamically created by the recipe.
dict(name='figure', class_='Single__thumbnail'), # Image of "Single" type articles. dict(name='figure', class_='Single__thumbnail'), # Image of "Single" type articles.
dict(name='figure', class_='Longread__thumbnail'), # Image of "Longread" type articles. dict(name='figure', class_='Longread__thumbnail'), # Image of "Longread" type articles.
dict(name='p', class_='Meta__author'), # Author of the article.
dict(name='time', class_='Meta__updated'), # Last updated date of the article.
dict(name='div', class_='sesamy-protected-content'), # Article body. dict(name='div', class_='sesamy-protected-content'), # Article body.
] ]
@ -277,3 +287,48 @@ class Fokus(BasicNewsRecipe):
self.log(f'A total of {num_articles} articles belonging to {len(section_to_articles)} sections were kept.') self.log(f'A total of {num_articles} articles belonging to {len(section_to_articles)} sections were kept.')
return feeds return feeds
def populate_article_metadata(self, article, soup, _):
    """Fill in the article's summary, date and author line from the article page.

    The summary is read from the lead paragraph, the date from the first
    ``<time class="Meta__updated">`` tag and the author from the first
    ``<p class="Meta__author">`` tag. All ``Meta__updated`` and ``Meta__author``
    tags are then removed from the soup, and a single
    ``<p class="article-metadata">`` tag of the form ``"<author> | <date>"`` is
    inserted directly after the lead paragraph.

    Args:
        article: Object whose ``summary``, ``text_summary`` and ``date``
            attributes are updated in place.
        soup: Parsed article page; modified in place.
        _: Unused third positional argument.
    """
    # The article description/summary is found in the <p> tag of class 'Single__lead' or 'Longread__lead'.
    # Not every page is guaranteed to have one, so only use it when present.
    lead_tag = soup.find('p', {'class': ['Single__lead', 'Longread__lead']})
    if lead_tag is not None:
        article.summary = article.text_summary = lead_tag.get_text(strip=True)

    # Extract the article timestamp from the `datetime` attribute of the first <time> tag of class 'Meta__updated'.
    # The timestamp is on the ISO format. After the timestamp has been extracted, remove all such <time> tags from
    # the soup (the article can contain several).
    if time_tag := soup.find('time', {'class': 'Meta__updated'}):
        # NOTE(review): the parsed timestamp is shifted by `time.timezone` seconds; presumably a
        # timezone adjustment -- confirm the intended direction of the shift.
        dt = datetime.fromisoformat(time_tag['datetime']) + timedelta(seconds=time.timezone)
        article.date = dt.strftime('%Y-%m-%d %H:%M')
        for stale_time_tag in soup.find_all('time', {'class': 'Meta__updated'}):
            stale_time_tag.decompose()

    # Extract the author name from the first <p> tag of class 'Meta__author'. After the author name has been
    # extracted, remove all such <p> tags from the soup (the article can contain several).
    # Start from an empty name so that the fallback below also covers pages without any author tag.
    author_info = ''
    if author_tag := soup.find('p', {'class': 'Meta__author'}):
        if a_tag := author_tag.find('a'):
            # If the tag contains an <a> child tag, extract the author name from it.
            author_info = a_tag.get_text(strip=True)
            # To ensure a clean output, remove the <a> child tag.
            a_tag.decompose()
        else:
            # If the tag does not contain an <a> child tag, extract the author name from the text of the <p> tag.
            author_info = author_tag.get_text(strip=True)
            # Remove the 'Text: ' prefix from the author name (if any).
            prefix = 'Text: '
            if author_info.startswith(prefix):
                author_info = author_info[len(prefix):]
        for stale_author_tag in soup.find_all('p', {'class': 'Meta__author'}):
            stale_author_tag.decompose()

    # If the author name is missing or empty, set it to 'Fokus'.
    if not author_info:
        author_info = 'Fokus'

    # Concatenate the author name and the article date.
    article_metadata = f"{author_info} | {article.date}"

    # Finally, add a new <p> tag with the article metadata to the soup. Place it directly after the lead text.
    # Without a lead paragraph there is no anchor to insert after, so the tag is not added.
    if lead_tag is not None:
        new_tag = soup.new_tag('p')
        new_tag['class'] = 'article-metadata'
        new_tag.string = article_metadata
        lead_tag.insert_after(new_tag)