Extract article metadata from the article itself

2025-11-17 12:03:02 -05:00 · 2025-05-16 18:33:56 +02:00 · 2025-05-16 18:33:56 +02:00 · 8abd9b706e
commit 8abd9b706e
parent 84772e8b14
1 changed files with 58 additions and 3 deletions
--- a/recipes/fokus.recipe
+++ b/recipes/fokus.recipe
@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
-from datetime import datetime, timezone
+import time
+from datetime import datetime, timedelta

 from mechanize import Request

@ -25,15 +26,21 @@ class Fokus(BasicNewsRecipe):
    compress_news_images = True
    needs_subscription = 'optional'
    oldest_article = 7  # days
+    max_articles_per_feed = 15
    use_embedded_content = False
    remove_empty_feeds = True
    scale_news_images_to_device = True
    scale_news_images = (800, 600)
+    delay = 3  # Avoid throttling by the server.

-    # Center and reduce the size of images and image captions.
+    # 1. Center and reduce the size of images and image captions.
+    # 2. Make the lead text italic.
+    # 3. Make the article metadata text gray and small.
    extra_css = '''
-        img { display: block; margin: auto; width: 50%; height: auto }
+        img { display: block; margin: auto; width: 50%; height: auto; }
        div.calibre-nuked-tag-figure { font-size: small; text-align: center; }
+        p.Single__lead, p.Longread__lead { font-style: italic; color:#202020; }
+        p.article-metadata { color: gray; font-size:small; }
    '''

    keep_only_tags = [
@ -41,8 +48,11 @@ class Fokus(BasicNewsRecipe):
        dict(name='h1', class_='Longread__title'),              # Title of "Longread" type articles.
        dict(name='p', class_='Single__lead'),                  # Lead text of "Single" type articles.
        dict(name='p', class_='Longread__lead'),                # Lead text of "Longread" type articles.
+        dict(name='p', class_='article-metadata'),              # Dynamically created by the recipe.
        dict(name='figure', class_='Single__thumbnail'),        # Image of "Single" type articles.
        dict(name='figure', class_='Longread__thumbnail'),      # Image of "Longread" type articles.
+        dict(name='p', class_='Meta__author'),                  # Author of the article.
+        dict(name='time', class_='Meta__updated'),              # Last updated date of the article.
        dict(name='div', class_='sesamy-protected-content'),    # Article body.
    ]

@ -277,3 +287,48 @@ class Fokus(BasicNewsRecipe):
        self.log(f'A total of {num_articles} articles belonging to {len(section_to_articles)} sections were kept.')

        return feeds
+
+    def populate_article_metadata(self, article, soup, _):
+        """Set the article summary and date from the article page and add a byline paragraph.
+
+        The summary is read from the lead paragraph, the date from the first <time> tag of class
+        'Meta__updated' and the author from the first <p> tag of class 'Meta__author'. All such <time> and
+        <p> tags are then removed from the soup and replaced by a single <p> tag of class 'article-metadata'
+        containing 'author | date'. That tag is inserted directly after the lead text, or after the first
+        <h1> tag if the article has no lead text.
+
+        Args:
+            article: The article object; its `summary`, `text_summary` and `date` attributes are updated.
+            soup: The parsed article page. It is modified in place.
+            _: Unused.
+        """
+        # The article description/summary is found in the <p> tag of class 'Single__lead' or 'Longread__lead'.
+        # Not every page is guaranteed to have one, so only set the summary when the tag exists.
+        lead_tag = soup.find('p', {'class': ['Single__lead', 'Longread__lead']})
+        if lead_tag is not None:
+            article.summary = article.text_summary = lead_tag.get_text(strip=True)
+
+        # Extract the article timestamp from the `datetime` attribute of the first <time> tag of class
+        # 'Meta__updated'. The timestamp is in ISO format. Afterwards, remove all such <time> tags from the
+        # soup (the article can contain several).
+        time_tags = soup.find_all('time', {'class': 'Meta__updated'})
+        if time_tags:
+            try:
+                # NOTE(review): this shifts the parsed time by the local (non-DST) UTC offset given by
+                # `time.timezone` -- confirm that this is the intended conversion.
+                dt = datetime.fromisoformat(time_tags[0]['datetime']) + timedelta(seconds=time.timezone)
+            except (KeyError, TypeError, ValueError) as err:
+                # A missing or malformed timestamp must not abort the article; keep the existing date.
+                self.log(f'Could not extract the article timestamp: {err!r}')
+            else:
+                article.date = dt.strftime('%Y-%m-%d %H:%M')
+            for stale_tag in time_tags:
+                stale_tag.decompose()
+
+        # Extract the author name from the first <p> tag of class 'Meta__author'. Afterwards, remove all such
+        # <p> tags from the soup (the article can contain several).
+        author_info = ''
+        author_tags = soup.find_all('p', {'class': 'Meta__author'})
+        if author_tags:
+            first_author_tag = author_tags[0]
+            a_tag = first_author_tag.find('a')
+            if a_tag is not None:
+                # If the tag contains an <a> child tag, extract the author name from it.
+                author_info = a_tag.get_text(strip=True)
+            else:
+                # Otherwise, extract the author name from the text of the <p> tag itself and remove the
+                # 'Text: ' prefix (if any).
+                author_info = first_author_tag.get_text(strip=True)
+                if author_info.startswith('Text: '):
+                    author_info = author_info[len('Text: '):]
+            for stale_tag in author_tags:
+                stale_tag.decompose()
+
+        # Fall back to 'Fokus' both when there is no author tag and when the extracted name is empty.
+        if not author_info:
+            author_info = 'Fokus'
+
+        # Concatenate the author name and the article date.
+        article_metadata = f"{author_info} | {article.date}"
+
+        # Finally, add a new <p> tag with the article metadata to the soup. Place it directly after the lead
+        # text, or after the first <h1> tag when there is no lead text. Without either anchor, skip it.
+        anchor_tag = lead_tag if lead_tag is not None else soup.find('h1')
+        if anchor_tag is not None:
+            new_tag = soup.new_tag('p')
+            new_tag['class'] = 'article-metadata'
+            new_tag.string = article_metadata
+            anchor_tag.insert_after(new_tag)