diff --git a/recipes/wired.recipe b/recipes/wired.recipe
index 6f34e49b5d..31f3b21f6b 100644
--- a/recipes/wired.recipe
+++ b/recipes/wired.recipe
@@ -5,13 +5,15 @@ www.wired.com
 '''
 
 from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import date
+import urllib2
 
 class WiredDailyNews(BasicNewsRecipe):
     title = 'Wired Magazine, Monthly Edition'
-    __author__ = 'Darko Miletic, update by Zach Lapidus'
+    __author__ = 'Darko Miletic, update by Zach Lapidus, Michael Marotta'
     description = ('Wired is a full-color monthly American magazine, published in both print '
                    'and online editions, that reports on how emerging technologies affect culture,'
-                   'the economy and politics.')
+                   ' the economy and politics. Monthly edition; run at the start of the month.')
     publisher = 'Conde Nast'
     category = 'news, IT, computers, technology'
     oldest_article = 2
@@ -40,8 +42,20 @@ class WiredDailyNews(BasicNewsRecipe):
         dict(attrs={'data-js':['post', 'postHeader']}),
     ]
 
-    def parse_wired_index_page(self, num, seen):
-        soup = self.index_to_soup('http://www.wired.com/category/magazine/page/%d' % num)
+    def get_date_url(self):
+        '''
+        Build the index URL for the current issue. Wired tags each monthly
+        issue as 'magazine-<volume>-<month>', where the volume number is the
+        current year minus 1992.
+        :return: the tag URL, ending in '/page/' so a page number can be appended
+        '''
+        baseurl = 'http://www.wired.com/tag/magazine-'
+        monthurl = '{:02d}'.format(date.today().month)
+        yearurl = str(date.today().year - 1992)
+        return baseurl + yearurl + '-' + monthurl + '/page/'
+
+    def parse_wired_index_page(self, currenturl, seen):
+        soup = self.index_to_soup(currenturl)
         for a in soup.find('main').findAll('a', href=True):
             url = a['href']
             if url.startswith('http://www.wired.com/') and url.endswith('/'):
@@ -50,12 +64,28 @@ class WiredDailyNews(BasicNewsRecipe):
                 date = self.tag_to_string(dateloc)
                 if title.lower() != 'read more' and title and url not in seen:
                     seen.add(url)
-                    self.log('Found article:', title, 'in page:', num)
+                    self.log('Found article:', title)
                     yield {'title':title, 'date':date, 'url':url, 'description':''}
 
     def parse_index(self):
+        '''
+        Get the current month's URL and index successive pages of articles,
+        incrementing the page number until the server returns an HTTP error,
+        instead of scraping the index for a page count.
+        :return: a single 'Articles' section containing every article found
+        '''
+        baseurl = self.get_date_url()
+        pagenum = 1
         articles = []
         seen = set()
-        for num in (1, 2):
-            articles.extend(self.parse_wired_index_page(num, seen))
-        return [('Articles', articles)]
+        morepages = True
+        while morepages:
+            currenturl = baseurl + str(pagenum)
+            try:
+                # probe the page first; a 404 means there are no more pages
+                urllib2.urlopen(currenturl).close()
+                articles.extend(self.parse_wired_index_page(currenturl, seen))
+                pagenum += 1
+            except urllib2.HTTPError:
+                morepages = False
+        return [('Articles', articles)]
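
Note: get_date_url above assumes Wired's 'magazine-<volume>-<month>' tag
convention, where volume 1 corresponds to 1993. A minimal standalone sketch
of that arithmetic, with issue_index_url as a hypothetical helper name:

    from datetime import date

    def issue_index_url(today=None):
        # volume 1 was 1993, so e.g. 2016 -> 24; the month is zero-padded
        today = today or date.today()
        volume = today.year - 1992
        return 'http://www.wired.com/tag/magazine-%d-%02d/page/' % (volume, today.month)

    # issue_index_url(date(2016, 3, 1))
    #   -> 'http://www.wired.com/tag/magazine-24-03/page/'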
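
Note: parse_index uses a paginate-until-404 pattern: keep requesting page
N+1 until the server errors out. A minimal sketch of the pattern in
isolation (Python 2, matching the recipe; fetch_page is a hypothetical
stand-in for self.parse_wired_index_page):

    import urllib2

    def walk_pages(baseurl, fetch_page):
        # yield results page by page until the next page 404s
        pagenum = 1
        while True:
            url = baseurl + str(pagenum)
            try:
                urllib2.urlopen(url).close()  # probe only; discard the body
            except urllib2.HTTPError:
                break
            for item in fetch_page(url):
                yield item
            pagenum += 1

One caveat: urllib2.URLError (e.g. a transient network failure) is not
caught, so a mid-run error will propagate rather than silently truncate
the issue.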