import re
from collections import defaultdict

from calibre.web.feeds.news import BasicNewsRecipe, classes


class StrangeHorizons(BasicNewsRecipe):
    """Calibre recipe that downloads the current issue of Strange Horizons."""

    title = 'Strange Horizons'
    description = 'A magazine of speculative fiction and related nonfiction. Best downloaded on weekends'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'en'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'http://strangehorizons.com/wordpress/wp-content/themes/strangehorizons/images/sh-logo.jpg'
    # NOTE: the CSS property for text colour is `color`; `font-color` does not
    # exist and is ignored by CSS parsers.
    extra_css = '''
        .author-biographies, .content-warning-container-ltr, .category {font-size:small; font-style:italic; color:#404040;}
        .byline {font-size:small; color:#202020;}
        .title {font-size:large; text-align:center;}
    '''
    ignore_duplicate_articles = {'url'}

    keep_only_tags = [
        classes('post-container'),
    ]

    remove_tags = [
        dict(name='button'),
        classes('font-size sharedaddy comments-form-row'),
    ]

    def parse_index(self):
        """Build the feed list for the current issue.

        Fetches the issue archive page, follows the link found inside the
        ``current-issue-widget`` element, and groups every linked article of
        that issue by the text of its ``category`` element (``'Articles'``
        when an article has none).

        As a side effect, ``self.timefmt`` is set from the widget's ``date``
        element.

        Returns:
            A list of ``(section, articles)`` tuples, where ``articles`` is a
            list of dicts with the keys ``title``, ``url`` and
            ``description``.

        Raises:
            ValueError: If the current-issue widget or its issue link cannot
                be found on the archive page.
        """
        main = self.index_to_soup('http://strangehorizons.com/issue/')
        issue = main.find(
            attrs={'class': lambda x: x and 'current-issue-widget' in x.split()}
        )
        if issue is None:
            raise ValueError(
                'Strange Horizons: could not find the current issue widget'
            )

        # Accept both schemes so the recipe keeps working if the site emits
        # https links instead of http ones.
        issue_prefixes = (
            'http://strangehorizons.com/issue/',
            'https://strangehorizons.com/issue/',
        )
        current = issue.find(
            'a', href=lambda x: x and x.startswith(issue_prefixes)
        )
        if current is None:
            raise ValueError(
                'Strange Horizons: could not find a link to the current issue'
            )

        date = issue.find(**classes('date'))
        self.timefmt = ' [' + self.tag_to_string(date) + ']'
        self.log('Downloading Issue:', self.timefmt, current['href'])
        soup = self.index_to_soup(current['href'])

        feeds_dict = defaultdict(list)
        for art in soup.findAll('div', attrs={'class': 'article'}):
            for ti in art.findAll(**classes('title')):
                a = ti.find('a', href=True)
                if not a:
                    continue
                url = a['href']
                title = self.tag_to_string(ti).strip()
                if not title or not url:
                    continue

                sec = 'Articles'
                if cat := art.find(**classes('category')):
                    sec = self.tag_to_string(cat).strip()

                # Collect only the non-empty pieces so that a missing excerpt
                # or author never leaves a dangling ' | ' separator.
                desc_parts = []
                if auth := ti.find_next_sibling(**classes('author')):
                    if author_text := self.tag_to_string(auth):
                        desc_parts.append(author_text)
                if exp := ti.find_next_sibling(**classes('excerpt')):
                    # Strip every five-digit run followed by a space from the
                    # excerpt (presumably a numeric prefix emitted by the
                    # site -- TODO confirm what it represents).
                    excerpt = re.sub(r"\d{5} ", "", self.tag_to_string(exp))
                    if excerpt:
                        desc_parts.append(excerpt)
                desc = ' | '.join(desc_parts)

                self.log(sec, '\n\t', title, '\n\t', desc, '\n\t\t', url)
                feeds_dict[sec].append(
                    {"title": title, "url": url, "description": desc}
                )
        return list(feeds_dict.items())