This commit is contained in:
Kovid Goyal 2016-10-09 18:20:02 +05:30
commit d22a378f78

View File

@ -12,12 +12,14 @@ import urllib2
class WiredDailyNews(BasicNewsRecipe): class WiredDailyNews(BasicNewsRecipe):
title = 'Wired Magazine, Monthly Edition' title = 'Wired Magazine, Monthly Edition'
__author__ = 'Darko Miletic, update by Zach Lapidus, Michael Marotta' __author__ = 'Darko Miletic, update by Zach Lapidus, Michael Marotta'
description = ('Wired is a full-color monthly American magazine, published in both print ' description = ('Wired is a full-color monthly American magazine, '
'and online editions, that reports on how emerging technologies affect culture,' 'published in both print and online editions, that '
'the economy and politics. Monthly edition, best run at the start of every month.') 'reports on how emerging technologies affect culture, '
'the economy and politics. '
'Monthly edition, best run at the start of every month.')
publisher = 'Conde Nast' publisher = 'Conde Nast'
category = 'news, IT, computers, technology' category = 'news, IT, computers, technology'
oldest_article = 2 oldest_article = 45
max_articles_per_feed = 200 max_articles_per_feed = 200
no_stylesheets = True no_stylesheets = True
encoding = 'utf-8' encoding = 'utf-8'
@ -26,7 +28,7 @@ class WiredDailyNews(BasicNewsRecipe):
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
remove_empty_feeds = True remove_empty_feeds = True
publication_type = 'newsportal' publication_type = 'newsportal'
extra_css = """ extra_css = """
.entry-header{ .entry-header{
text-transform: uppercase; text-transform: uppercase;
vertical-align: baseline; vertical-align: baseline;
@ -37,42 +39,57 @@ class WiredDailyNews(BasicNewsRecipe):
remove_tags = [ remove_tags = [
dict(name=['meta', 'link']), dict(name=['meta', 'link']),
dict(name='div', attrs={'class': 'podcast_storyboard'}), dict(name='div', attrs={'class': 'podcast_storyboard'}),
dict(id=['sharing', 'social', 'article-tags', 'sidebar']), dict(name='figure', attrs={'data-js': 'slide'}),
dict(name='div', attrs={'class': 'no-underline fix-height'}),
dict(name='div',
attrs={'class': 'no-underline marg-t-med big-marg-b-med fix-height'}),
dict(id=['sharing', 'social', 'article-tags', 'sidebar', 'related']),
] ]
keep_only_tags = [ keep_only_tags = [
dict(attrs={'data-js': ['post', 'postHeader']}), dict(attrs={'data-js': ['post', 'postHeader']}),
dict(attrs={'class': 'exchange fsb-content relative'}), dict(attrs={'class': 'exchange fsb-content relative'}),
] ]
def get_magazine_year_month(self, seperator):
    '''
    Build the magazine issue identifier. Wired numbers its volumes from
    1993 (current year - 1992) and its issues by calendar month.

    :param seperator: string placed between volume and month
        (parameter name kept as-is for backward compatibility)
    :return: e.g. '24-10' (seperator='-') or '24.10' (seperator='.')
    '''
    # '{:02d}'.format(...) already returns a str; the old str(...)
    # wrapper around it was redundant and has been dropped.
    monthurl = '{:02d}'.format(date.today().month)
    yearurl = str(date.today().year - 1992)
    return yearurl + seperator + monthurl
def get_date_url(self):
    '''
    Build the listing URL for the current month's magazine issue.

    :return: url of the form
        https://www.wired.com/tag/magazine-<volume>-<month>/page/
    '''
    baseurl = 'https://www.wired.com/tag/magazine-'
    # NOTE: the old monthurl/yearurl locals were dead code left behind
    # when this logic moved into get_magazine_year_month(); removed.
    magazine_year_month = self.get_magazine_year_month('-')
    return baseurl + magazine_year_month + '/page/'
def parse_wired_index_page(self, currenturl, seen):
    '''
    Scrape one page of the monthly magazine tag listing.

    :param currenturl: url of the listing page to parse
    :param seen: set of article urls already yielded; mutated in place
        so duplicates across listing pages are skipped
    :return: generator of article dicts (title/date/url/description)
        suitable for parse_index
    '''
    soup = self.index_to_soup(currenturl)
    for a in soup.find('main').findAll('a', href=True):
        url = a['href']
        # only article permalinks: absolute wired.com urls ending in '/'
        if url.startswith('https://www.wired.com/') and url.endswith('/'):
            title = self.tag_to_string(a.parent.find('h2'))
            dateloc = a.parent.find('time')
            # renamed from 'date' to avoid shadowing the module-level
            # datetime.date import used by the other methods
            article_date = self.tag_to_string(dateloc)
            if title.lower() != 'read more' and title and url not in seen:
                seen.add(url)
                self.log('Found article:', title)
                yield {
                    'title': title,
                    'date': article_date,
                    'url': url,
                    'description': ''
                }
def parse_index(self): def parse_index(self):
''' '''
get the current month's url, index first page to soup, find number of pages, get the current month's url, index first page to soup,
just keep adding to page num until soup is not none instead of scraping page for find number of pages, just keep adding to page num until
:return: soup is not none instead of scraping page for :return:
''' '''
baseurl = self.get_date_url() baseurl = self.get_date_url()
pagenum = 1 pagenum = 1
@ -87,4 +104,6 @@ class WiredDailyNews(BasicNewsRecipe):
pagenum += 1 pagenum += 1
except urllib2.HTTPError: except urllib2.HTTPError:
morepages = False morepages = False
return [('Articles', articles)]
magazine_year_month = self.get_magazine_year_month('.')
return [('Magazine-' + magazine_year_month, articles)]