#!/usr/bin/env python
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015 Michael Marotta '
# Written April 2015
# Last edited 4/18/15
'''
technologyreview.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe


def absurl(x):
    """Return *x* as an absolute URL.

    Protocol-relative URLs (``//host/path``) get an ``http:`` scheme, URLs
    that already start with ``http`` are returned unchanged, and anything
    else is treated as a path on www.technologyreview.com.
    """
    if x.startswith('//'):
        return 'http:' + x
    if x.startswith('http'):
        return x
    # A relative path without a leading slash must not be glued directly
    # onto the host name ("...technologyreview.comfoo").
    if x and not x.startswith('/'):
        x = '/' + x
    return 'http://www.technologyreview.com' + x


def classes(classes):
    """Build a soup ``attrs`` filter matching any of the given class names.

    *classes* is a space separated string of class names. The returned dict
    can be used wherever a tag specification dict is accepted; its ``class``
    predicate is truthy when the tested class string shares at least one
    name with *classes*.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class MitTechnologyReview(BasicNewsRecipe):
    """Recipe that downloads the magazine issue listed on ``INDEX``."""

    title = 'MIT Technology Review Magazine'
    __author__ = 'Michael Marotta'
    description = ('Bi-monthly magazine version of MIT Technology Review.'
                   ' This is different than the recipe named simply "Technology Review"'
                   ' which downloads the rss feed with daily articles from the website.')
    INDEX = 'http://www.technologyreview.com/magazine/'
    language = 'en'
    encoding = 'utf-8'
    simultaneous_downloads = 20
    tags = 'news, technology, science'
    no_stylesheets = True

    # regex for class names
    articleHeaderRegex = '^.*contentHeader__wrapper.*$'
    editorLetterHeaderRegex = "^.*contentHeader--vertical__wrapper.*$"
    articleContentRegex = "^.*contentbody__wrapper.*$"
    imagePlaceHolderRegex = "^.*image__placeholder.*$"
    advertisementRegex = "^.*sliderAd__wrapper.*$"

    # Article header (regular or editor's letter variant) and article body.
    keep_only_tags = [
        dict(name='header', attrs={'class': re.compile(editorLetterHeaderRegex, re.IGNORECASE)}),
        dict(name='header', attrs={'class': re.compile(articleHeaderRegex, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(articleContentRegex, re.IGNORECASE)}),
    ]
    # Side content, inline graphics, pull quotes, image placeholders and ads.
    remove_tags = [
        dict(name="aside"),
        dict(name="svg"),
        dict(name="blockquote"),
        dict(name="img", attrs={'class': re.compile(imagePlaceHolderRegex, re.IGNORECASE)}),
        dict(name="div", attrs={'class': re.compile(advertisementRegex, re.IGNORECASE)}),
    ]

    def parse_index(self):
        """Scrape the magazine index page for the cover and the articles.

        Sets ``self.cover_url`` when a cover image is found and returns a
        single-section feed list: ``[('Articles', [{'title', 'url'}, ...])]``.
        """
        soup = self.index_to_soup(self.INDEX)

        # find cover
        def is_cover_class(name):
            return name.startswith("magazineHero__image") if name else False

        hero = soup.find("div", attrs={"class": is_cover_class})
        cover_img = hero.find("img", src=True) if hero is not None else None
        if cover_img is not None:
            self.cover_url = absurl(cover_img['src'])
        else:
            # A missing cover should not abort the whole download.
            self.log.warn('Could not find a cover image on', self.INDEX)

        # parse articles
        current_articles = []
        class_name_prefixes = (
            "magazineHero__letter--",
            "teaserItem__title",
            "teaserItem--aside__title",
        )

        def is_article_class(name):
            return name.startswith(class_name_prefixes) if name else False

        for div in soup.findAll(attrs={'class': is_article_class}):
            a = div.find('a', href=True)
            if a is None:
                # Title element without a link: nothing to download.
                continue
            title = self.tag_to_string(a).strip()
            href = absurl(a['href'])
            if href and title:
                current_articles.append({'title': title, 'url': href})
                self.log(title, '[%s]' % href)
        return [('Articles', current_articles)]

    def preprocess_html(self, soup):
        """Replace every ``srcset`` attribute with a plain ``src``.

        The first URL listed in ``srcset`` becomes the tag's ``src`` (made
        absolute), and ``srcset`` is removed. Returns the modified *soup*.
        """
        for img in soup.findAll(srcset=True):
            candidates = img['srcset'].split()
            if candidates:
                # Candidates are comma separated; a URL without a descriptor
                # ("a.jpg, b.jpg 2x") keeps the comma after splitting.
                img['src'] = absurl(candidates[0].rstrip(','))
            del img['srcset']
        return soup