Update Strange Horizons

2025-08-30 23:00:21 -04:00 · 2023-03-08 20:55:49 +05:30 · 2023-03-08 20:55:49 +05:30 · 6bf2d26905
commit 6bf2d26905
parent 6420d449ba
1 changed file with 53 additions and 147 deletions
--- a/recipes/strange_horizons.recipe
+++ b/recipes/strange_horizons.recipe
@@ -1,160 +1,66 @@
-#!/usr/bin/env python
-
-from collections import OrderedDict
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
+from collections import defaultdict
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+import re

 class StrangeHorizons(BasicNewsRecipe):
-    # Recipe metadata
-    title = "Strange Horizons"
-    description = "A magazine of speculative fiction and related nonfiction. Best downloaded on weekends"
-    publication_type = "magazine"
-    language = "en"
-    __author__ = "Peter Fidelman, based on work by Jim DeVona"
-    __version__ = "2.0"
+    title = 'Strange Horizons'
+    description = 'A magazine of speculative fiction and related nonfiction. Best downloaded on weekends'
+    __author__ = 'unkn0wn'
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'utf-8'
+    language = 'en'
+    remove_attributes = ['style', 'height', 'width']
+    masthead_url = 'http://strangehorizons.com/wordpress/wp-content/themes/strangehorizons/images/sh-logo.jpg'

-    # Cruft filters to apply to each article found by parse_index
-    keep_only_tags = [dict(name="div", attrs={"class": "post"})]
-    remove_tags_after = [dict(name="br", attrs={"class": "clear_both"})]
-    remove_tags = [
-        dict(name="div", attrs={"class": "single-title-header row"}),
-        dict(name="div", attrs={"class": "podcast-title"}),
+    extra_css = '''
+        .author-biographies, .content-warning-container-ltr, .category {font-size:small; font-style:italic; color:#404040;}
+        .byline {font-size:small; color:#202020;}
+        .title {font-size:large; text-align:center;}
+    '''  # text colour is set with the CSS 'color' property
+
+    ignore_duplicate_articles = {'url'}
+
+    keep_only_tags = [
+        classes('post-container')
    ]

-    # Styles to apply to each article
-    no_stylesheets = True
-    extra_css = """
-    div.image-left { margin: 0.5em auto 1em auto; }
-    div.image-right { margin: 0.5em auto 1em auto; }
-    div.illustration { margin: 0.5em auto 1em auto; text-align: center; }
-    p.image-caption { margin-top: 0.25em; margin-bottom: 1em; font-size: 75%; text-align: center; }
-    h1 { font-size: 160%; }
-    h2 { font-size: 110%; }
-    h3 { font-size: 85%; }
-    h4 { font-size: 80%; }
-    p { font-size: 90%; margin: 1em 1em 1em 15px; }
-    p.author-bio { font-size: 75%; font-style: italic; margin: 1em 1em 1em 15px; }
-    p.author-bio i, p.author-bio cite, p.author-bio .foreign { font-style: normal; }
-    p.author-copyright { font-size: 75%; text-align: center; margin: 3em 1em 1em 15px; }
-    p.content-date { font-weight: bold; }
-    p.dedication { font-style: italic; }
-    div.stanza { margin-bottom: 1em; }
-    div.stanza p { margin: 0px 1em 0px 15px; font-size: 90%; }
-    p.verse-line { margin-bottom: 0px; margin-top: 0px; }
-    p.verse-line-indent-1 { margin-bottom: 0px; margin-top: 0px; text-indent: 2em; }
-    p.verse-line-indent-2 { margin-bottom: 0px; margin-top: 0px; text-indent: 4em; }
-    p.verse-stanza-break { margin-bottom: 0px; margin-top: 0px; }
-    .foreign { font-style: italic; }
-    .thought { font-style: italic; }
-    .thought cite { font-style: normal; }
-    .thought em { font-style: normal; }
-    blockquote { font-size: 90%; font-style: italic; }
-    blockquote cite { font-style: normal; }
-    blockquote em { font-style: normal; }
-    blockquote .foreign { font-style: normal; }
-    blockquote .thought { font-style: normal; }
-    .speaker { font-weight: bold; }
-    pre { margin-left: 15px; }
-    div.screenplay { font-family: monospace; }
-    blockquote.screenplay-dialogue { font-style: normal; font-size: 100%; }
-    .screenplay p.dialogue-first { margin-top: 0; }
-    .screenplay p.speaker { margin-bottom: 0; text-align: center; font-weight: normal; }
-    blockquote.typed-letter { font-style: normal; font-size: 100%; font-family: monospace; }
-    .no-italics { font-style: normal; }
-    """
-
-    def get_date(self):
-        frontSoup = self.index_to_soup("http://strangehorizons.com")
-        dateDiv = frontSoup.find(
-            "div", attrs={"class": "current-issue-widget issue-medium issue"}
-        )
-        url = dateDiv.a["href"]
-        date = url.split('/')[-2]
-        return date
+    remove_tags = [
+        dict(name = 'button'),
+        classes('font-size sharedaddy comments-form-row')
+    ]

    def parse_index(self):
-        # Change this to control what issue to grab.  Must be of the format
-        # D-month-YYYY; for example, "4-july-2005".  Alternately, use
-        # self.get_date() to retrieve the latest issue.
+        main = self.index_to_soup('http://strangehorizons.com/issue/')  # locate the current issue, then index its articles
+        issue = main.find(attrs={'class':lambda x: x and 'current-issue-widget' in x.split()})
+        current = issue.find('a', href=lambda x: x and x.startswith('http://strangehorizons.com/issue/'))
+        date = issue.find(**classes('date'))
+        self.timefmt = ' [' + self.tag_to_string(date) + ']'
+        self.log('Downloading Issue:', self.timefmt, current['href'])
+        soup = self.index_to_soup(current['href'])

-        dateStr = self.get_date()
+        feeds_dict = defaultdict(list)  # section name -> list of article dicts

-        issueUrl = "http://strangehorizons.com/issue/%s/" % dateStr
-        soup = self.index_to_soup(issueUrl)
+        for art in soup.findAll('div', attrs={'class':'article'}):
+            for ti in art.findAll(**classes('title')):  # one article div may hold several titled entries
+                a = ti.find('a', href=True)  # no link -> blank url/title, skipped by the guard below
+                url = a['href'] if a else ''
+                title = self.tag_to_string(ti).strip() if a else ''

-        sections = OrderedDict()
+                sec = 'Articles'  # fallback section when the div carries no category
+                if cat := art.find(**classes('category')):
+                    sec = self.tag_to_string(cat).strip()

-        #
-        # Each div with class="article" is an article.
-        #
-        articles = soup.findAll(attrs={"class": "article"})
+                desc = ''
+                if exp := ti.find_next_sibling(**classes('excerpt')):
+                    desc = self.tag_to_string(exp)
+                    desc = re.sub(r"\d{5} ", "", desc)  # drop the 5-digit number left in the excerpt text
+                if auth := ti.find_next_sibling(**classes('author')):
+                    desc = self.tag_to_string(auth) + ' | ' + desc

-        for article in articles:
-            #
-            # What kind of article is this?
-            #
-            categoryDiv = article.find("div", {"class": "category"})
-            categoryStr = self.tag_to_string(categoryDiv.a)
+                if not title or not url:
+                    continue

-            #
-            # Ignore podcasts, as they cannot be converted to text.
-            #
-            if categoryStr == "Podcasts":
-                continue
-
-            #
-            # Reviews must be special-cased, as several reviews
-            # may be packed into the same div.
-            #
-            if categoryStr == "Reviews":
-                reviews = article.findAll(attrs={"class": "review"})
-                for review in reviews:
-                    titleDiv = review.find("div", {"class": "title"})
-                    url = titleDiv.a["href"]
-                    titleStr = self.tag_to_string(titleDiv.a).strip()
-
-                    authorDiv = review.find("div", {"class": "author"})
-                    authorStr = self.tag_to_string(authorDiv.a).strip()
-
-                    if categoryStr not in sections:
-                        sections[categoryStr] = []
-                    sections[categoryStr].append({
-                        "title": titleStr,
-                        "author": authorStr,
-                        "url": url,
-                        "description": "",
-                        "date": dateStr,
-                    })
-
-            #
-            # Assume anything else is an ordinary article.  Ought
-            # to work for "Fiction", "Poetry", "Articles", etc.
-            #
-            else:
-                titleDiv = article.find("div", {"class": "title"})
-                url = titleDiv.a["href"]
-                titleStr = self.tag_to_string(titleDiv.a).strip()
-
-                authorDiv = article.find("div", {"class": "author"})
-                authorStr = self.tag_to_string(authorDiv.a).strip()
-
-                # The excerpt consistently starts with a
-                # comment containing one number.  This comment
-                # is not removed by tag_to_string so we must
-                # remove it ourself.  We do this by removing
-                # the first word of the excerpt.
-                excerptDiv = article.find("div", {"class": "excerpt"})
-                excerptStr = self.tag_to_string(excerptDiv).strip()
-                excerptStr = " ".join(excerptStr.split(" ")[1:])
-
-                if categoryStr not in sections:
-                    sections[categoryStr] = []
-                sections[categoryStr].append({
-                    "title": titleStr,
-                    "author": authorStr,
-                    "url": url,
-                    "description": excerptStr,
-                    "date": dateStr,
-                })
-        return sections.items()
+                self.log(sec, '\n\t', title, '\n\t', desc, '\n\t\t', url)
+                feeds_dict[sec].append({"title": title, "url": url, "description": desc})
+        return list(feeds_dict.items())