Fixed broken recipe wired.recipe

This commit is contained in:
Allan Simonsen 2016-10-09 13:28:24 +02:00
parent 7914ada946
commit afeeb2d26f

View File

@ -12,12 +12,14 @@ import urllib2
class WiredDailyNews(BasicNewsRecipe):
    # Recipe for the monthly Wired magazine edition.
    # NOTE(review): reconstructed from the new side of a side-by-side diff
    # rendering in which each line held old+new text concatenated.
    title = 'Wired Magazine, Monthly Edition'
    __author__ = 'Darko Miletic, update by Zach Lapidus, Michael Marotta'
    description = ('Wired is a full-color monthly American magazine, '
                   'published in both print and online editions, that '
                   'reports on how emerging technologies affect culture, '
                   'the economy and politics. '
                   'Monthly edition, best run at the start of every month.')
    publisher = 'Conde Nast'
    category = 'news, IT, computers, technology'
    # Widened from 2 to 45 days so a full month's issue is retained.
    oldest_article = 45
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf-8'
@ -37,42 +39,57 @@ class WiredDailyNews(BasicNewsRecipe):
# Tags stripped from every article page: boilerplate chrome, slideshow
# figures, social/sharing widgets and related-article sidebars.
# NOTE(review): reconstructed from the new side of a garbled side-by-side
# diff rendering.
remove_tags = [
    dict(name=['meta', 'link']),
    dict(name='div', attrs={'class': 'podcast_storyboard'}),
    dict(name='figure', attrs={'data-js': 'slide'}),
    dict(name='div', attrs={'class': 'no-underline fix-height'}),
    dict(name='div',
         attrs={'class': 'no-underline marg-t-med big-marg-b-med fix-height'}),
    dict(id=['sharing', 'social', 'article-tags', 'sidebar', 'related']),
]
# Only the article body and header containers are kept.
keep_only_tags = [
    dict(attrs={'data-js': ['post', 'postHeader']}),
    dict(attrs={'class': 'exchange fsb-content relative'}),
]
def get_magazine_year_month(self, seperator):
    '''
    Build the '<volume><sep><month>' token used in Wired magazine URLs.

    Wired's volume numbering appears to start in 1993, hence
    ``year - 1992`` — TODO confirm against the live site.
    The parameter name keeps its historical misspelling ('seperator')
    so existing keyword callers are unaffected.

    :param seperator: string placed between volume and month (e.g. '-' or '.')
    :return: e.g. '24-10' for October 2016
    '''
    # '{:02d}'.format already returns a str, so no extra str() wrapper needed.
    monthurl = '{:02d}'.format(date.today().month)  # zero-padded month
    yearurl = str(date.today().year - 1992)
    return yearurl + seperator + monthurl
def get_date_url(self):
    '''
    Get month and year, add year modifier, append to wired magazine url.

    Delegates the volume/month token to get_magazine_year_month so the
    same logic is shared with parse_index's section naming.

    :return: archive URL prefix, e.g.
             'https://www.wired.com/tag/magazine-24-10/page/'
    '''
    baseurl = 'https://www.wired.com/tag/magazine-'
    magazine_year_month = self.get_magazine_year_month('-')
    # Dead locals (monthurl/yearurl) left over from the pre-refactor code
    # were removed; the helper now owns that computation.
    return baseurl + magazine_year_month + '/page/'
def parse_wired_index_page(self, currenturl, seen):
    '''
    Yield one article dict per unseen article link on an archive page.

    :param currenturl: URL of one archive listing page
    :param seen: mutable set of already-yielded article URLs (updated in place)
    :yield: {'title', 'date', 'url', 'description'} dicts for the feed
    '''
    soup = self.index_to_soup(currenturl)
    for a in soup.find('main').findAll('a', href=True):
        url = a['href']
        # Only absolute wired.com URLs ending in '/' are real article pages.
        if url.startswith('https://www.wired.com/') and url.endswith('/'):
            title = self.tag_to_string(a.parent.find('h2'))
            dateloc = a.parent.find('time')
            # Renamed from 'date' to avoid shadowing the datetime.date
            # import used elsewhere in this recipe; the dict key is unchanged.
            pub_date = self.tag_to_string(dateloc)
            # Skip 'read more' teaser links, empty titles and duplicates.
            if title.lower() != 'read more' and title and url not in seen:
                seen.add(url)
                self.log('Found article:', title)
                yield {
                    'title': title,
                    'date': pub_date,
                    'url': url,
                    'description': ''
                }
def parse_index(self): def parse_index(self):
''' '''
get the current month's url, index first page to soup, find number of pages, get the current month's url, index first page to soup,
just keep adding to page num until soup is not none instead of scraping page for find number of pages, just keep adding to page num until
:return: soup is not none instead of scraping page for :return:
''' '''
baseurl = self.get_date_url() baseurl = self.get_date_url()
pagenum = 1 pagenum = 1
@ -87,4 +104,6 @@ class WiredDailyNews(BasicNewsRecipe):
pagenum += 1 pagenum += 1
except urllib2.HTTPError: except urllib2.HTTPError:
morepages = False morepages = False
return [('Articles', articles)]
magazine_year_month = self.get_magazine_year_month('.')
return [('Magazine-' + magazine_year_month, articles)]