From 8e85a16e4cfbd5850583ca4f9958d2d2edf788c9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 18 Apr 2015 08:12:57 +0530
Subject: [PATCH] Update Wired

---
 recipes/wired.recipe | 132 ++++++++++++++----------------------------
 1 file changed, 43 insertions(+), 89 deletions(-)

diff --git a/recipes/wired.recipe b/recipes/wired.recipe
index aacbd903d9..6f34e49b5d 100644
--- a/recipes/wired.recipe
+++ b/recipes/wired.recipe
@@ -1,107 +1,61 @@
-
 __license__ = 'GPL v3'
-__copyright__ = '2010-2013, Darko Miletic '
+__copyright__ = '2014, Darko Miletic '
 '''
 www.wired.com
 '''
-import re
-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-class Wired(BasicNewsRecipe):
-    title = 'Wired Magazine'
-    __author__ = 'Darko Miletic'
-    description = 'Gaming news'
-    publisher = 'Conde Nast Digital'
-    category = 'news, games, IT, gadgets'
-    oldest_article = 32
-    max_articles_per_feed = 100
+class WiredDailyNews(BasicNewsRecipe):
+    title = 'Wired Magazine, Monthly Edition'
+    __author__ = 'Darko Miletic, update by Zach Lapidus'
+    description = ('Wired is a full-color monthly American magazine, published in both print '
+                   'and online editions, that reports on how emerging technologies affect culture,'
+                   'the economy and politics.')
+    publisher = 'Conde Nast'
+    category = 'news, IT, computers, technology'
+    oldest_article = 2
+    max_articles_per_feed = 200
     no_stylesheets = True
     encoding = 'utf-8'
     use_embedded_content = False
-    masthead_url = 'http://www.wired.com/images/home/wired_logo.gif'
     language = 'en'
-    publication_type = 'magazine'
-    extra_css = """
-                h1, .entry-header{font-family: brandon-grotesque,anchor-web,Helvetica,Arial,sans-serif}
-                .entry-header{display: block;}
-                .entry-header ul{ list-style-type:disc;}
-                .author, .entryDate, .entryTime, .entryEdit, .entryCategories{display: inline}
-                .entry-header li{text-transform: uppercase;}
-                div#container{font-family: 'Exchange SSm 4r', Georgia, serif}
+    ignore_duplicate_articles = {'url'}
+    remove_empty_feeds = True
+    publication_type = 'newsportal'
+    extra_css = """
+                .entry-header{
+                              text-transform: uppercase;
+                              vertical-align: baseline;
+                              display: inline;
+                             }
                 """
-    index = 'http://www.wired.com/magazine/'
-    preprocess_regexps = [(re.compile(r'', re.DOTALL|re.IGNORECASE),lambda match: '')]
-    conversion_options = {
-                          'comment'   : description
-                        , 'tags'      : category
-                        , 'publisher' : publisher
-                        , 'language'  : language
-                        }
-
-    keep_only_tags = [dict(name='div', attrs={'class':'post'})]
-    remove_tags_after = dict(name='div', attrs={'id':'container'})
     remove_tags = [
-                    dict(name=['object','embed','iframe','link','meta','base'])
-                   ,dict(name='div', attrs={'class':['social-top','podcast_storyboard','tweetmeme_button']})
-                   ,dict(attrs={'id':'ff_bottom_nav'})
-                   ,dict(name='a',attrs={'href':'http://www.wired.com/app'})
-                   ,dict(name='div', attrs={'id':'mag-bug'})
+        dict(name=['meta','link']),
+        dict(name='div', attrs={'class':'podcast_storyboard'}),
+        dict(id=['sharing', 'social', 'article-tags', 'sidebar']),
     ]
-    remove_attributes = ['height','width','lang','border','clear']
+    keep_only_tags=[
+        dict(attrs={'data-js':['post', 'postHeader']}),
+    ]
+
+    def parse_wired_index_page(self, num, seen):
+        soup = self.index_to_soup('http://www.wired.com/category/magazine/page/%d' % num)
+        for a in soup.find('main').findAll('a', href=True):
+            url = a['href']
+            if url.startswith('http://www.wired.com/') and url.endswith('/'):
+                title = self.tag_to_string(a.find('h2'))
+                dateloc = a.find('time')
+                date = self.tag_to_string(dateloc)
+                if title.lower() != 'read more' and title and url not in seen:
+                    seen.add(url)
+                    self.log('Found article:', title, 'in page:', num)
+                    yield {'title':title, 'date':date, 'url':url, 'description':''}
 
     def parse_index(self):
-        totalfeeds = []
-        soup = self.index_to_soup(self.index)
-        majorf = soup.find('div',attrs={'class':'entry'})
-        if majorf:
-            articles = []
-            checker = []
-            for a in majorf.findAll('a', href=True):
-                if a['href'].startswith('http://www.wired.com/') and a['href'].endswith('/'):
-                    title = self.tag_to_string(a)
-                    url = a['href']
-                    if title.lower() != 'read more' and url not in checker:
-                        checker.append(url)
-                        articles.append({
-                             'title' :title
-                            ,'date' :strftime(self.timefmt)
-                            ,'url' :a['href']
-                            ,'description':''
-                        })
-            totalfeeds.append(('Articles', articles))
-        return totalfeeds
-
-    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup(self.index)
-        cover_item = soup.find('div',attrs={'class':'spread-image'})
-        if cover_item:
-            cover_url = 'http://www.wired.com' + cover_item.a.img['src']
-        return cover_url
-
-    def print_version(self, url):
-        return url.rstrip('/') + '/all/1'
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll('a'):
-            if item.string is not None:
-                tstr = item.string
-                item.replaceWith(tstr)
-            else:
-                item.name='span'
-                for atrs in ['href','target','alt','title','name','id']:
-                    if item.has_key(atrs):
-                        del item[atrs]
-        for item in soup.findAll('img'):
-            if not item.has_key('alt'):
-                item['alt'] = 'image'
-            if item.has_key('data-lazy-src'):
-                item['src'] = item['data-lazy-src']
-                del item['data-lazy-src']
-        return soup
+        articles = []
+        seen = set()
+        for num in (1, 2):
+            articles.extend(self.parse_wired_index_page(num, seen))
+        return [('Articles', articles)]
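
For reference, a minimal standalone sketch of the index-parsing approach this patch introduces: it walks the paginated http://www.wired.com/category/magazine/page/N listings, pulls the title from the h2 and the date from the time tag inside each article link, skips "read more" links, and de-duplicates by URL. The sketch is not part of the patch; requests and BeautifulSoup are assumed here as stand-ins for calibre's index_to_soup()/tag_to_string(), and the selectors are taken from the recipe as it stood at the time of the commit, so they may no longer match the live site.

    # Sketch only: re-implements the recipe's index parsing outside calibre.
    # Assumes: pip install requests beautifulsoup4
    import requests
    from bs4 import BeautifulSoup

    def parse_wired_index_page(num, seen):
        # Same paginated category URL the recipe walks
        url = 'http://www.wired.com/category/magazine/page/%d' % num
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        main = soup.find('main')
        if main is None:          # page layout changed; nothing to parse
            return
        for a in main.find_all('a', href=True):
            href = a['href']
            if href.startswith('http://www.wired.com/') and href.endswith('/'):
                h2 = a.find('h2')
                title = h2.get_text(strip=True) if h2 else ''
                time_tag = a.find('time')
                date = time_tag.get_text(strip=True) if time_tag else ''
                # Skip "read more" links and URLs already seen on an earlier page
                if title and title.lower() != 'read more' and href not in seen:
                    seen.add(href)
                    yield {'title': title, 'date': date, 'url': href, 'description': ''}

    seen = set()
    articles = []
    for num in (1, 2):            # the recipe only walks the first two index pages
        articles.extend(parse_wired_index_page(num, seen))
    print(len(articles), 'articles found')

The shared seen set is what lets the new parse_index() merge both index pages into a single 'Articles' feed without repeats; the ignore_duplicate_articles = {'url'} class attribute adds a second layer of URL de-duplication inside calibre itself.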