Update Wired Magazine

Merge branch 'master' of https://github.com/truth1ness/calibre
Kovid Goyal 2015-05-02 09:31:18 +05:30
commit 5e04c1f9f8


@@ -5,13 +5,15 @@ www.wired.com
 '''
 
 from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import date
+import urllib2
 
 class WiredDailyNews(BasicNewsRecipe):
     title = 'Wired Magazine, Monthly Edition'
-    __author__ = 'Darko Miletic, update by Zach Lapidus'
+    __author__ = 'Darko Miletic, update by Zach Lapidus, Michael Marotta'
     description = ('Wired is a full-color monthly American magazine, published in both print '
                    'and online editions, that reports on how emerging technologies affect culture,'
-                   'the economy and politics.')
+                   'the economy and politics. Monthly edition, best run at the start of every month.')
     publisher = 'Conde Nast'
     category = 'news, IT, computers, technology'
     oldest_article = 2
@@ -40,8 +42,19 @@ class WiredDailyNews(BasicNewsRecipe):
         dict(attrs={'data-js':['post', 'postHeader']}),
     ]
 
-    def parse_wired_index_page(self, num, seen):
-        soup = self.index_to_soup('http://www.wired.com/category/magazine/page/%d' % num)
+    def get_date_url(self):
+        '''
+        get the month and year, convert the year to Wired's volume number, and append both to the magazine tag url
+        :return: url
+        '''
+        baseurl = 'http://www.wired.com/tag/magazine-'
+        monthurl = str('{:02d}'.format(date.today().month))
+        yearurl = str(date.today().year - 1992)
+        dateurl = baseurl + yearurl + '-' + monthurl + '/page/'
+        return dateurl
+
+    def parse_wired_index_page(self, currenturl, seen):
+        soup = self.index_to_soup(currenturl)
         for a in soup.find('main').findAll('a', href=True):
             url = a['href']
             if url.startswith('http://www.wired.com/') and url.endswith('/'):
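
Wired numbers its volumes from 1993, so date.today().year - 1992 gives the current volume (23 for 2015) and the month becomes the issue number. A minimal standalone sketch of the same construction, assuming the magazine-<volume>-<month> tag scheme from the committed code; build_wired_tag_url is an illustrative name, not part of the recipe:

    from datetime import date

    def build_wired_tag_url(today=None):
        # Volume 1 was 1993, so subtracting 1992 yields the volume number.
        today = today or date.today()
        return 'http://www.wired.com/tag/magazine-%d-%02d/page/' % (
            today.year - 1992, today.month)

    # For the commit date (2015-05-02) this yields:
    # http://www.wired.com/tag/magazine-23-05/page/
    print(build_wired_tag_url(date(2015, 5, 2)))
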
@@ -50,12 +63,26 @@ class WiredDailyNews(BasicNewsRecipe):
                 date = self.tag_to_string(dateloc)
                 if title.lower() != 'read more' and title and url not in seen:
                     seen.add(url)
-                    self.log('Found article:', title, 'in page:', num)
+                    self.log('Found article:', title)
                     yield {'title':title, 'date':date, 'url':url, 'description':''}
 
     def parse_index(self):
+        '''
+        get the current month's url, then keep incrementing the page number and fetching
+        until the server returns an error, rather than scraping the first page for a page count
+        :return: list of (section title, list of articles) tuples
+        '''
+        baseurl = self.get_date_url()
+        pagenum = 1
         articles = []
         seen = set()
-        for num in (1, 2):
-            articles.extend(self.parse_wired_index_page(num, seen))
+        morepages = True
+        while morepages:
+            currenturl = baseurl + str(pagenum)
+            try:
+                urllib2.urlopen(currenturl)
+                articles.extend(self.parse_wired_index_page(currenturl, seen))
+                pagenum += 1
+            except urllib2.HTTPError:
+                morepages = False
         return [('Articles', articles)]
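
The rewritten parse_index drops the hard-coded two-page loop and instead probes page 1, 2, 3, ... until the server answers with an HTTP error. A minimal sketch of that probe-until-error pattern, using urllib2 as the recipe does (Python 2, which calibre ran on at the time); probe_pages is an illustrative name:

    import urllib2

    def probe_pages(baseurl):
        # Yield successive page urls until the server returns an HTTP error
        # (e.g. 404 past the last page).
        pagenum = 1
        while True:
            url = baseurl + str(pagenum)
            try:
                urllib2.urlopen(url)
            except urllib2.HTTPError:
                return
            yield url
            pagenum += 1

    for url in probe_pages('http://www.wired.com/tag/magazine-23-05/page/'):
        print(url)

Note that each probe downloads a page only to discard it before index_to_soup fetches it again, and that a network failure raises urllib2.URLError, which the recipe does not catch; both are possible follow-up cleanups.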