Added logic to find the current month's magazine URL and crawl through its pages until the end of the month's articles is reached, instead of taking the first two pages of the main magazine feed

truth1ness 2015-05-01 17:34:02 -04:00
parent baa5bf6a5b
commit c833ccc79e

@@ -5,13 +5,15 @@ www.wired.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import date
+import urllib2
 
 class WiredDailyNews(BasicNewsRecipe):
     title = 'Wired Magazine, Monthly Edition'
-    __author__ = 'Darko Miletic, update by Zach Lapidus'
+    __author__ = 'Darko Miletic, update by Zach Lapidus, Michael Marotta'
     description = ('Wired is a full-color monthly American magazine, published in both print '
                    'and online editions, that reports on how emerging technologies affect culture,'
-                   'the economy and politics.')
+                   'the economy and politics. Monthly version, run at start of month')
     publisher = 'Conde Nast'
     category = 'news, IT, computers, technology'
     oldest_article = 2
@@ -40,8 +42,20 @@ class WiredDailyNews(BasicNewsRecipe):
         dict(attrs={'data-js':['post', 'postHeader']}),
     ]
 
-    def parse_wired_index_page(self, num, seen):
-        soup = self.index_to_soup('http://www.wired.com/category/magazine/page/%d' % num)
+    def get_date_url(self):
+        '''
+        get month and year, add year modifier, append to wired magazine url,
+        :return: url
+        '''
+        baseurl = 'http://www.wired.com/tag/magazine-'
+        monthurl = str('{:02d}'.format(date.today().month))
+        yearurl = str(date.today().year - 1992)
+        dateurl = baseurl + yearurl + '-' + monthurl + '/page/'
+        return dateurl
+
+    def parse_wired_index_page(self, currenturl, seen):
+        soup = self.index_to_soup(currenturl)
         for a in soup.find('main').findAll('a', href=True):
             url = a['href']
             if url.startswith('http://www.wired.com/') and url.endswith('/'):
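
The new get_date_url() derives the month's tag-feed URL from today's date; the "- 1992" offset maps the calendar year to Wired's volume number (Wired's first volume appeared in 1993, so 2015 becomes 23). A minimal standalone sketch of the same arithmetic, using the hypothetical helper name month_tag_url:

from datetime import date

def month_tag_url(today=None):
    # Wired numbers volumes from 1993, so year - 1992 gives the volume:
    # May 2015 -> volume 23, month 05.
    today = today or date.today()
    volume = today.year - 1992
    return 'http://www.wired.com/tag/magazine-%d-%02d/page/' % (volume, today.month)

print(month_tag_url(date(2015, 5, 1)))
# http://www.wired.com/tag/magazine-23-05/page/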
@@ -50,12 +64,26 @@ class WiredDailyNews(BasicNewsRecipe):
             date = self.tag_to_string(dateloc)
             if title.lower() != 'read more' and title and url not in seen:
                 seen.add(url)
-                self.log('Found article:', title, 'in page:', num)
+                self.log('Found article:', title)
                 yield {'title':title, 'date':date, 'url':url, 'description':''}
 
     def parse_index(self):
+        '''
+        get the current month's url, then keep incrementing the page number
+        until the server returns an error, instead of scraping the first page
+        for the number of pages
+        :return:
+        '''
+        baseurl = self.get_date_url()
+        pagenum = 1
         articles = []
         seen = set()
-        for num in (1, 2):
-            articles.extend(self.parse_wired_index_page(num, seen))
+        morepages = True
+        while morepages:
+            try:
+                url = urllib2.urlopen(baseurl + str(pagenum))
+                currenturl = baseurl + str(pagenum)
+                articles.extend(self.parse_wired_index_page(currenturl, seen))
+                pagenum += 1
+            except urllib2.HTTPError, e:
+                morepages = False
         return [('Articles', articles)]
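
The rewritten parse_index() no longer hard-codes two pages: it probes baseurl + pagenum with urllib2.urlopen and treats the HTTPError raised for a missing page (a 404 past the month's last page) as the stop signal. A minimal sketch of that termination strategy, assuming the tag feed 404s past its last page; crawl_month and parse_page are hypothetical stand-ins for the recipe's methods:

import urllib2

def crawl_month(baseurl, parse_page):
    articles, seen, pagenum = [], set(), 1
    while True:
        currenturl = baseurl + str(pagenum)
        try:
            # probe the page; urllib2 raises HTTPError on a 404
            urllib2.urlopen(currenturl).close()
        except urllib2.HTTPError:
            break  # past the last page of the month's feed
        articles.extend(parse_page(currenturl, seen))
        pagenum += 1
    return articles

Note that the committed loop fetches each page twice, once as the urlopen probe and once inside parse_wired_index_page via index_to_soup; reusing the probe's response would halve the requests.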