Added recipe for Strange Horizons

This commit is contained in:
Peter Fidelman 2017-02-05 23:45:31 -08:00
parent f030b414ea
commit d42a4cadad

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env python
import urlparse  # NOTE(review): no longer used by the rewritten recipe below; kept because other code may rely on it
from collections import OrderedDict

from calibre.web.feeds.news import BasicNewsRecipe
class StrangeHorizons(BasicNewsRecipe):
    """Download one issue of Strange Horizons (strangehorizons.com).

    By default the most recent issue is fetched; see parse_index for how
    to pin a specific issue date instead.
    """

    # Recipe metadata
    title = "Strange Horizons"
    description = "A magazine of speculative fiction and related nonfiction. Best downloaded on weekends"
    publication_type = "magazine"
    language = "en"
    __author__ = "Peter Fidelman, based on work by Jim DeVona"
    __version__ = "2.0"

    # Cruft filters to apply to each article found by parse_index
    keep_only_tags = [dict(name="div", attrs={"class": "post"})]
    remove_tags_after = [dict(name="br", attrs={"class": "clear_both"})]
    remove_tags = [
        dict(name="div", attrs={"class": "single-title-header row"}),
        dict(name="div", attrs={"class": "podcast-title"}),
    ]

    # Styles to apply to each article
    no_stylesheets = True
    extra_css = """div.image-left { margin: 0.5em auto 1em auto; } div.image-right { margin: 0.5em auto 1em auto; } div.illustration { margin: 0.5em auto 1em auto; text-align: center; } p.image-caption { margin-top: 0.25em; margin-bottom: 1em; font-size: 75%; text-align: center; } h1 { font-size: 160%; } h2 { font-size: 110%; } h3 { font-size: 85%; } h4 { font-size: 80%; } p { font-size: 90%; margin: 1em 1em 1em 15px; } p.author-bio { font-size: 75%; font-style: italic; margin: 1em 1em 1em 15px; } p.author-bio i, p.author-bio cite, p.author-bio .foreign { font-style: normal; } p.author-copyright { font-size: 75%; text-align: center; margin: 3em 1em 1em 15px; } p.content-date { font-weight: bold; } p.dedication { font-style: italic; } div.stanza { margin-bottom: 1em; } div.stanza p { margin: 0px 1em 0px 15px; font-size: 90%; } p.verse-line { margin-bottom: 0px; margin-top: 0px; } p.verse-line-indent-1 { margin-bottom: 0px; margin-top: 0px; text-indent: 2em; } p.verse-line-indent-2 { margin-bottom: 0px; margin-top: 0px; text-indent: 4em; } p.verse-stanza-break { margin-bottom: 0px; margin-top: 0px; } .foreign { font-style: italic; } .thought { font-style: italic; } .thought cite { font-style: normal; } .thought em { font-style: normal; } blockquote { font-size: 90%; font-style: italic; } blockquote cite { font-style: normal; } blockquote em { font-style: normal; } blockquote .foreign { font-style: normal; } blockquote .thought { font-style: normal; } .speaker { font-weight: bold; } pre { margin-left: 15px; } div.screenplay { font-family: monospace; } blockquote.screenplay-dialogue { font-style: normal; font-size: 100%; } .screenplay p.dialogue-first { margin-top: 0; } .screenplay p.speaker { margin-bottom: 0; text-align: center; font-weight: normal; } blockquote.typed-letter { font-style: normal; font-size: 100%; font-family: monospace; } .no-italics { font-style: normal; }"""  # noqa

    def get_date(self):
        """Return the date slug of the latest issue, e.g. '4-july-2005'.

        Scraped from the current-issue widget on the site front page.
        """
        frontSoup = self.index_to_soup("http://strangehorizons.com")
        dateDiv = frontSoup.find("div",
            attrs={"class": "current-issue-widget issue-medium issue"})
        url = dateDiv.a["href"]
        # The issue URL ends ".../<date-slug>/", so the slug is the
        # second-to-last path component.
        date = url.split('/')[-2]
        return date

    def _add_article(self, sections, category, title, author, url, description, date):
        # Group article dicts by category, preserving first-seen order of
        # categories (sections is an OrderedDict).
        if category not in sections:
            sections[category] = []
        sections[category].append({
            "title": title,
            "author": author,
            "url": url,
            "description": description,
            "date": date,
        })

    def parse_index(self):
        """Build the feed structure for one issue.

        Returns a list of (section-name, [article-dict, ...]) pairs, as
        expected by calibre's BasicNewsRecipe.
        """
        # Change this to control what issue to grab. Must be of the format
        # D-month-YYYY; for example, "4-july-2005". Alternately, use
        # self.get_date() to retrieve the latest issue.
        dateStr = self.get_date()
        # dateStr = "4-july-2005"

        issueUrl = "http://strangehorizons.com/issue/%s/" % dateStr
        soup = self.index_to_soup(issueUrl)

        sections = OrderedDict()

        # Each div with class="article" is an article.
        for article in soup.findAll(attrs={"class": "article"}):
            # What kind of article is this?
            categoryDiv = article.find("div", {"class": "category"})
            categoryStr = self.tag_to_string(categoryDiv.a)

            # Ignore podcasts, as they cannot be converted to text.
            if categoryStr == "Podcasts":
                continue

            if categoryStr == "Reviews":
                # Reviews must be special-cased, as several reviews may be
                # packed into the same div.
                for review in article.findAll(attrs={"class": "review"}):
                    titleDiv = review.find("div", {"class": "title"})
                    authorDiv = review.find("div", {"class": "author"})
                    self._add_article(
                        sections, categoryStr,
                        self.tag_to_string(titleDiv.a).strip(),
                        self.tag_to_string(authorDiv.a).strip(),
                        titleDiv.a["href"],
                        "",
                        dateStr)
            else:
                # Assume anything else is an ordinary article. Ought to
                # work for "Fiction", "Poetry", "Articles", etc.
                titleDiv = article.find("div", {"class": "title"})
                authorDiv = article.find("div", {"class": "author"})

                # The excerpt consistently starts with a comment containing
                # one number. This comment is not removed by tag_to_string,
                # so we remove it ourself by dropping the first word of the
                # excerpt.
                excerptDiv = article.find("div", {"class": "excerpt"})
                excerptStr = self.tag_to_string(excerptDiv).strip()
                excerptStr = " ".join(excerptStr.split(" ")[1:])

                self._add_article(
                    sections, categoryStr,
                    self.tag_to_string(titleDiv.a).strip(),
                    self.tag_to_string(authorDiv.a).strip(),
                    titleDiv.a["href"],
                    excerptStr,
                    dateStr)

        return sections.items()