diff --git a/recipes/spectator_magazine.recipe b/recipes/spectator_magazine.recipe index 042cd4fef9..798c8f15ed 100644 --- a/recipes/spectator_magazine.recipe +++ b/recipes/spectator_magazine.recipe @@ -1,70 +1,105 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 -# License: GPLv3 Copyright: 2015, Kovid Goyal - -from __future__ import absolute_import, division, print_function, unicode_literals - -import time -from calibre import random_user_agent from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes -def absolutize(url): +def absurl(url): if url.startswith('/'): - url = 'https://spectator.co.uk' + url + url = 'https://www.spectator.co.uk' + url return url -class Spectator(BasicNewsRecipe): - +class spectator(BasicNewsRecipe): title = 'Spectator Magazine' - __author__ = 'Kovid Goyal' - description = 'Magazine' + __author__ = 'unkn0wn' + description = 'The Spectator was established in 1828, and is the best-written and most influential weekly in the English language.' language = 'en' no_stylesheets = True + remove_attributes = ['height', 'width', 'style'] + ignore_duplicate_articles = {'url'} + masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c7/The_Spectator_logo.svg/320px-The_Spectator_logo.svg.png' + encoding = 'utf-8' + remove_empty_feeds = True + resolve_internal_links = True + + extra_css = ''' + [class^="ContentPageFooterAuthor_author"] {font-size:small;} + #fig-c {text-align:center; font-size:small;} + blockquote, em {color:#404040;} + ''' keep_only_tags = [ - prefixed_classes('ContentPageHeader_main__ ContentPageHeader_metadata__ ContentPageHero_container__ ContentPageBody_body__container__'), - dict(name='noscript'), + prefixed_classes( + 'ContentPageHeader_main ContentPageHero_container ContentPageBody_body__container__' + ' ContentPageFooterAuthor_author__'), + ] + + remove_tags = [ + dict(name=('aside', 'iframe')), + prefixed_classes('ContentPageBody_measure__ ContentPageAuthor_author__pic') ] - remove_attributes = ['style'] + + def preprocess_html(self, soup): + h2 = soup.find('h2') + if h2: + h2.name = 'h4' + for fc in soup.findAll('figcaption'): + fc['id'] = 'fig-c' + for fig in soup.findAll('figure'): + for nos in fig.findAll('noscript'): + nos.name = 'span' + return soup def parse_index(self): - soup = self.index_to_soup('https://www.spectator.co.uk/magazine/latest') - raw = str(soup) - # open('/t/raw.html', 'w').write(raw) - section, articles = 'Featured', [] + soup = self.index_to_soup('https://www.spectator.co.uk/magazine') + self.cover_url = soup.find(**prefixed_classes( + 'MagazinePage_spectator-magazine__image-and-subsections__')).img['src'] + issue = self.tag_to_string(soup.find(**prefixed_classes( + 'MagazinePage_spectator-magazine-issue__title__'))).strip() + self.timefmt = ' (' + issue + ') [' + self.tag_to_string(soup.find(**prefixed_classes( + 'MagazinePage_spectator-magazine-issue__date__'))).strip() + ']' + self.log('Downloading Issue: ', self.timefmt) + nav_div = soup.find('ul', **prefixed_classes('Tabs_spectator-table-of-contents__')) + section_list = [] + + for x in nav_div.findAll(['a']): + section_list.append(( + self.tag_to_string(x).strip(), absurl(x['href']))) feeds = [] - for art in soup.findAll(**prefixed_classes( - 'MagazineContent_spectator-magazine__section-title__ MagazineContent_spectator-magazine-content__article-card__')): - cls = art['class'] - if not isinstance(cls, str): - cls = ' '.join(cls) - if 'section-title' in cls: - if articles: - feeds.append((section, articles)) - section = self.tag_to_string(art).strip() - articles = [] - self.log(section) - continue - a = art.find('a', href=True) - url = absolutize(a['href']) - title = self.tag_to_string(a).strip() - hd = art.find(**prefixed_classes('ArticleCard_spectator-article-card__headline__')) - if hd: - title = self.tag_to_string(hd).strip() - desc = '' - dd = art.find(**prefixed_classes('ArticleCard_spectator-article-card__media-teaser__')) - if dd: - desc = self.tag_to_string(dd).strip() - self.log('\t', title, url) - if desc: - self.log('\t\t', desc) - articles.append({'title': title, 'url': url, 'description': desc}) - if not feeds and '