calibre/recipes/strange_horizons.recipe
Kovid Goyal 29cd8d64ea
Change shebangs to python from python2
Also remove a few other miscellaneous references to python2
2020-08-22 18:47:51 +05:30

161 lines
6.4 KiB
Python

#!/usr/bin/env python
from collections import OrderedDict
from calibre.web.feeds.news import BasicNewsRecipe
class StrangeHorizons(BasicNewsRecipe):
    """Recipe that downloads one issue of Strange Horizons.

    The issue page is scraped for elements with class "article"; each one is
    turned into an article entry and grouped into a section named after its
    category. Podcasts are skipped because they cannot be converted to text.
    """

    # Recipe metadata
    title = "Strange Horizons"
    description = "A magazine of speculative fiction and related nonfiction. Best downloaded on weekends"
    publication_type = "magazine"
    language = "en"
    __author__ = "Peter Fidelman, based on work by Jim DeVona"
    __version__ = "2.0"

    # Cruft filters to apply to each article found by parse_index
    keep_only_tags = [dict(name="div", attrs={"class": "post"})]
    remove_tags_after = [dict(name="br", attrs={"class": "clear_both"})]
    remove_tags = [
        dict(name="div", attrs={"class": "single-title-header row"}),
        dict(name="div", attrs={"class": "podcast-title"}),
    ]

    # Styles to apply to each article. The CSS lines are kept flush-left so
    # that the contents of the string stay exactly as they were.
    no_stylesheets = True
    extra_css = """
div.image-left { margin: 0.5em auto 1em auto; }
div.image-right { margin: 0.5em auto 1em auto; }
div.illustration { margin: 0.5em auto 1em auto; text-align: center; }
p.image-caption { margin-top: 0.25em; margin-bottom: 1em; font-size: 75%; text-align: center; }
h1 { font-size: 160%; }
h2 { font-size: 110%; }
h3 { font-size: 85%; }
h4 { font-size: 80%; }
p { font-size: 90%; margin: 1em 1em 1em 15px; }
p.author-bio { font-size: 75%; font-style: italic; margin: 1em 1em 1em 15px; }
p.author-bio i, p.author-bio cite, p.author-bio .foreign { font-style: normal; }
p.author-copyright { font-size: 75%; text-align: center; margin: 3em 1em 1em 15px; }
p.content-date { font-weight: bold; }
p.dedication { font-style: italic; }
div.stanza { margin-bottom: 1em; }
div.stanza p { margin: 0px 1em 0px 15px; font-size: 90%; }
p.verse-line { margin-bottom: 0px; margin-top: 0px; }
p.verse-line-indent-1 { margin-bottom: 0px; margin-top: 0px; text-indent: 2em; }
p.verse-line-indent-2 { margin-bottom: 0px; margin-top: 0px; text-indent: 4em; }
p.verse-stanza-break { margin-bottom: 0px; margin-top: 0px; }
.foreign { font-style: italic; }
.thought { font-style: italic; }
.thought cite { font-style: normal; }
.thought em { font-style: normal; }
blockquote { font-size: 90%; font-style: italic; }
blockquote cite { font-style: normal; }
blockquote em { font-style: normal; }
blockquote .foreign { font-style: normal; }
blockquote .thought { font-style: normal; }
.speaker { font-weight: bold; }
pre { margin-left: 15px; }
div.screenplay { font-family: monospace; }
blockquote.screenplay-dialogue { font-style: normal; font-size: 100%; }
.screenplay p.dialogue-first { margin-top: 0; }
.screenplay p.speaker { margin-bottom: 0; text-align: center; font-weight: normal; }
blockquote.typed-letter { font-style: normal; font-size: 100%; font-family: monospace; }
.no-italics { font-style: normal; }
"""

    def get_date(self):
        """Return the date slug of the current issue, e.g. "4-july-2005".

        The slug is the last path component of the link found inside the
        current-issue widget on the front page.

        Raises:
            ValueError: if the front page has no current-issue link.
        """
        front_soup = self.index_to_soup("http://strangehorizons.com")
        widget = front_soup.find(
            "div", attrs={"class": "current-issue-widget issue-medium issue"}
        )
        link = widget.a if widget is not None else None
        if link is None or not link.get("href"):
            raise ValueError(
                "Could not find the current issue link on the Strange Horizons front page"
            )
        # Strip any trailing slash first so the slug is found whether or not
        # the issue URL ends in "/".
        return link["href"].rstrip("/").split("/")[-1]

    def _anchor_text(self, tag):
        """Return the stripped text of the first <a> inside *tag*, or ""."""
        link = tag.a if tag is not None else None
        if link is None:
            return ""
        return self.tag_to_string(link).strip()

    def _excerpt_text(self, article):
        """Return the excerpt of *article* without its leading number, or ""."""
        excerpt_div = article.find("div", {"class": "excerpt"})
        if excerpt_div is None:
            return ""
        # The excerpt consistently starts with a comment containing one
        # number. This comment is not removed by tag_to_string, so we remove
        # it ourselves by dropping the first word of the excerpt.
        text = self.tag_to_string(excerpt_div).strip()
        return text.partition(" ")[2]

    def _entry_from_node(self, node, date_str, description=""):
        """Build one article dict from *node*, or None if it has no title link."""
        title_div = node.find("div", {"class": "title"})
        title_link = title_div.a if title_div is not None else None
        if title_link is None or not title_link.get("href"):
            return None
        return {
            "title": self.tag_to_string(title_link).strip(),
            # A missing author is not fatal; fall back to an empty string.
            "author": self._anchor_text(node.find("div", {"class": "author"})),
            "url": title_link["href"],
            "description": description,
            "date": date_str,
        }

    def parse_index(self):
        """Scrape the issue page and return its articles grouped by category.

        Returns:
            A list of ``(category, articles)`` tuples in page order, where
            ``articles`` is a list of dicts with the keys "title", "author",
            "url", "description" and "date".
        """
        # Change this to control what issue to grab. Must be of the format
        # D-month-YYYY; for example, "4-july-2005". Alternately, use
        # self.get_date() to retrieve the latest issue.
        date_str = self.get_date()
        issue_url = "http://strangehorizons.com/issue/%s/" % date_str
        soup = self.index_to_soup(issue_url)

        sections = OrderedDict()

        # Each element with class="article" is an article.
        for article in soup.findAll(attrs={"class": "article"}):
            # What kind of article is this?
            category = self._anchor_text(article.find("div", {"class": "category"}))
            if not category:
                self.log.warn("Skipping an entry without a category link")
                continue

            # Ignore podcasts, as they cannot be converted to text.
            if category == "Podcasts":
                continue

            if category == "Reviews":
                # Reviews must be special-cased, as several reviews may be
                # packed into the same div. They carry no excerpt.
                nodes = article.findAll(attrs={"class": "review"})
                description = ""
            else:
                # Assume anything else is an ordinary article. Ought to work
                # for "Fiction", "Poetry", "Articles", etc.
                nodes = [article]
                description = self._excerpt_text(article)

            for node in nodes:
                entry = self._entry_from_node(node, date_str, description)
                if entry is None:
                    self.log.warn(
                        "Skipping an entry without a title link in section %s" % category
                    )
                    continue
                sections.setdefault(category, []).append(entry)

        # items() is only a view in Python 3; hand back a real list.
        return list(sections.items())