Update strange_horizons.recipe

2026-01-05 19:50:21 -05:00 · 2025-02-14 09:43:00 +05:30 · 2025-02-14 09:43:00 +05:30 · 539e87ec28
commit 539e87ec28
parent 417c05ea41
1 changed files with 34 additions and 46 deletions
--- a/recipes/strange_horizons.recipe
+++ b/recipes/strange_horizons.recipe
@ -1,12 +1,15 @@
-import re
-from collections import defaultdict
+#!/usr/bin/env python

 from calibre.web.feeds.news import BasicNewsRecipe, classes


 class StrangeHorizons(BasicNewsRecipe):
    title = 'Strange Horizons'
-    description = 'A magazine of speculative fiction and related nonfiction. Best downloaded on weekends'
+    description = (
+        'Strange Horizons is a weekly magazine of and about speculative fiction. '
+        'We publish fiction, poetry, reviews, essays, interviews, roundtable '
+        'discussions, and art.'
+    )
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
@ -14,55 +17,40 @@ class StrangeHorizons(BasicNewsRecipe):
    language = 'en'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'http://strangehorizons.com/wordpress/wp-content/themes/strangehorizons/images/sh-logo.jpg'
+    ignore_duplicate_articles = {'url'}
+    resolve_internal_links = True
+    oldest_article = 7

-    extra_css = '''
-        .author-biographies, .content-warning-container-ltr, .category {font-size:small; font-style:italic; font-color:#404040;}
-        .byline {font-size:small; font-color:#202020;}
-        .title {font-size:large; text-align:center;}
-    '''
+    # NOTE: 'font-color' is not a CSS property; 'color' is what sets text colour.
+    extra_css = """
+        .author-biographies, .content-warning-container-ltr, .category {font-size:small; font-style:italic; color:#404040;}
+        .byline {font-size:small; color:#202020;}
+        img {display:block; margin:0 auto;}
+    """
+
+    # Declares one recipe option, keyed 'days', with short/long help text.
+    # Its default is the string form of the class-level oldest_article
+    # defined above; __init__ reads the 'days' entry back as a float.
+    recipe_specific_options = {
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 0.5, gives you articles from the past 12 hours',
+            'default': str(oldest_article),
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        '''
+        Initialise the base recipe, then apply the 'days' option.
+
+        All arguments are forwarded unchanged to BasicNewsRecipe.__init__.
+        Afterwards, if the 'days' entry of recipe_specific_options is a
+        non-empty string, it is converted with float() and stored as
+        oldest_article. float() raises ValueError for non-numeric text.
+        '''
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        d = self.recipe_specific_options.get('days')
+        # NOTE(review): the class-level 'days' entry is a dict, so the str
+        # check only passes when something has replaced it with a plain
+        # string value - presumably calibre does so with the user's
+        # setting; confirm against the BasicNewsRecipe docs.
+        if d and isinstance(d, str):
+            self.oldest_article = float(d)

    ignore_duplicate_articles = {'url'}

-    keep_only_tags = [
-        classes('post-container')
-    ]
+    keep_only_tags = remove_tags_after = [dict(name='div', attrs={'class': 'post'})]

-    remove_tags = [
-        dict(name='button'),
-        classes('font-size sharedaddy comments-form-row')
-    ]
+    remove_tags = [dict(name='button'), classes('font-size sharedaddy comments-form-row')]

-    def parse_index(self):
-        main = self.index_to_soup('http://strangehorizons.com/issue/')
-        issue = main.find(attrs={'class':lambda x: x and 'current-issue-widget' in x.split()})
-        current = issue.find('a', href=lambda x: x and x.startswith('http://strangehorizons.com/issue/'))
-        date = issue.find(**classes('date'))
-        self.timefmt = ' [' + self.tag_to_string(date) + ']'
-        self.log('Downloading Issue:', self.timefmt, current['href'])
-        soup = self.index_to_soup(current['href'])
+    def preprocess_html(self, soup):
+        '''
+        Turn the link inside the article's title element into a heading.
+
+        Looks up the first element whose class is 'title'. If it contains
+        an <a> tag, that tag is renamed to <h1> in place. Returns the same
+        soup object, modified or not.
+        '''
+        # Despite its name, h1 holds the 'title' container, not an <h1> tag.
+        h1 = soup.find(attrs={'class': 'title'})
+        if h1 and h1.find('a'):
+            h1.a.name = 'h1'
+        return soup

-        feeds_dict = defaultdict(list)
-
-        for art in soup.findAll('div', attrs={'class':'article'}):
-            for ti in art.findAll(**classes('title')):
-                if a := ti.find('a', href=True):
-                    url = a['href']
-                    title = self.tag_to_string(ti).strip()
-
-                sec = 'Articles'
-                if cat := art.find(**classes('category')):
-                    sec = self.tag_to_string(cat).strip()
-
-                desc = ''
-                if exp := ti.find_next_sibling(**classes('excerpt')):
-                    desc = self.tag_to_string(exp) + desc
-                    desc = re.sub(r'\d{5} ', '', desc)
-                if auth := ti.find_next_sibling(**classes('author')):
-                    desc = self.tag_to_string(auth) + ' | ' + desc
-
-                if not title or not url:
-                    continue
-
-            self.log(sec, '\n\t', title, '\n\t', desc, '\n\t\t', url)
-            feeds_dict[sec].append({'title': title, 'url': url, 'description': desc})
-        return list(feeds_dict.items())
+    feeds = [('Articles', 'http://strangehorizons.com/wordpress/feed/')]