import re
from collections import defaultdict

from calibre.web.feeds.news import BasicNewsRecipe, classes


class StrangeHorizons(BasicNewsRecipe):
    """Calibre recipe that downloads the current issue of Strange Horizons.

    The issue index page is scraped for the "current issue" widget, the
    issue page it links to is fetched, and every linked title found inside
    a ``div.article`` element is grouped by its category.
    """

    title = 'Strange Horizons'
    description = 'A magazine of speculative fiction and related nonfiction. Best downloaded on weekends'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'en'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'http://strangehorizons.com/wordpress/wp-content/themes/strangehorizons/images/sh-logo.jpg'

    # NOTE: the text colour property in CSS is `color`; `font-color` is not a
    # CSS property and would be ignored by the renderer.
    extra_css = '''
        .author-biographies, .content-warning-container-ltr, .category {font-size:small; font-style:italic; color:#404040;}
        .byline {font-size:small; color:#202020;}
        .title {font-size:large; text-align:center;}
    '''

    ignore_duplicate_articles = {'url'}

    keep_only_tags = [
        classes('post-container')
    ]

    remove_tags = [
        dict(name='button'),
        classes('font-size sharedaddy comments-form-row')
    ]

    def parse_index(self):
        """Build the list of feeds for the current issue.

        Returns:
            A list of ``(section_title, articles)`` tuples, where each
            article is a dict with ``title``, ``url`` and ``description``
            keys. Sections appear in the order they are first encountered.

        Raises:
            ValueError: if the current-issue widget, or the link to the
                issue inside it, cannot be found on the index page.
        """
        main = self.index_to_soup('http://strangehorizons.com/issue/')
        issue = main.find(attrs={'class': lambda x: x and 'current-issue-widget' in x.split()})
        if issue is None:
            raise ValueError('Could not find the current issue widget on the issue index page')
        current = issue.find('a', href=lambda x: x and x.startswith('http://strangehorizons.com/issue/'))
        if current is None:
            raise ValueError('Could not find a link to the current issue')

        # Only override the default time format when the page shows a date.
        date = issue.find(**classes('date'))
        if date is not None:
            self.timefmt = ' [' + self.tag_to_string(date) + ']'
        self.log('Downloading Issue:', self.timefmt, current['href'])
        soup = self.index_to_soup(current['href'])

        feeds_dict = defaultdict(list)

        for art in soup.findAll('div', attrs={'class': 'article'}):
            # The category belongs to the article container, so resolve it
            # once per container rather than once per title.
            sec = 'Articles'
            if cat := art.find(**classes('category')):
                sec = self.tag_to_string(cat).strip() or 'Articles'

            # A single container may hold several titled entries.
            for ti in art.findAll(**classes('title')):
                a = ti.find('a', href=True)
                title = self.tag_to_string(ti).strip()
                # Skip entries that cannot be turned into a usable article.
                if a is None or not a['href'] or not title:
                    continue
                url = a['href']

                # Description is "author | excerpt"; either part may be
                # missing, in which case no dangling separator is emitted.
                parts = []
                if auth := ti.find_next_sibling(**classes('author')):
                    parts.append(self.tag_to_string(auth))
                if exp := ti.find_next_sibling(**classes('excerpt')):
                    # Drop 5-digit numbers followed by a space; presumably
                    # these are numeric ids leaking from the excerpt markup
                    # -- confirm against the live page.
                    parts.append(re.sub(r"\d{5} ", "", self.tag_to_string(exp)))
                desc = ' | '.join(part for part in parts if part)

                self.log(sec, '\n\t', title, '\n\t', desc, '\n\t\t', url)
                feeds_dict[sec].append({"title": title, "url": url, "description": desc})

        return list(feeds_dict.items())