__license__ = 'GPL v3'
__copyright__ = '2012, mkydgr'
'''
www.wired.com
based on the (broken) built-in recipe by Darko Miletic
'''

import re

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class Wired(BasicNewsRecipe):
    """Recipe for Wired Magazine that builds one feed per department.

    The magazine index page (``index``) is scraped for the department
    headings named in ``departments``; the article links listed under each
    heading become that department's feed.
    """

    title = 'Wired Magazine'
    __author__ = 'mkydgr'
    description = 'Technology News'
    publisher = 'Conde Nast Digital'
    category = ''
    oldest_article = 500
    delay = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    masthead_url = 'http://www.wired.com/images/home/wired_logo.gif'
    language = 'en'
    publication_type = 'magazine'
    extra_css = ' body{font-family: Arial,Verdana,sans-serif} .entryDescription li {display: inline; list-style-type: none} '
    # Page that lists the current issue; scraped for both feeds and cover.
    index = 'http://www.wired.com/magazine'
    # Suffixes of the ``department-<name>`` heading ids looked up on the index.
    departments = ['features', 'start', 'test', 'play', 'found', 'reviews']

    # NOTE(review): both the pattern and the replacement are empty strings,
    # so this entry leaves the HTML unchanged. It looks like the original
    # expressions were lost at some point -- confirm against upstream.
    preprocess_regexps = [
        (re.compile(r'', re.DOTALL | re.IGNORECASE), lambda match: ''),
    ]
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [dict(name='div', attrs={'class': 'post'})]
    remove_tags_after = dict(name='div', attrs={'class': 'tweetmeme_button'})
    remove_tags = [
        dict(name=['object', 'embed', 'iframe', 'link']),
        dict(name='div', attrs={'class': ['podcast_storyboard', 'tweetmeme_button']}),
        dict(attrs={'id': 'ff_bottom_nav'}),
        dict(name='a', attrs={'href': 'http://www.wired.com/app'}),
    ]
    remove_attributes = ['height', 'width']

    def parse_index(self):
        """Build the feed list from the magazine index page.

        Returns a list of ``(department title, articles)`` tuples, one for
        every entry of ``departments`` whose ``h3`` heading is present on the
        page. Each article is a dict with the keys ``title``, ``date``,
        ``url`` and ``description``. An empty list is returned when the
        ``department-posts`` container is missing.
        """
        totalfeeds = []
        soup = self.index_to_soup(self.index)

        depts = soup.find('div', attrs={'id': 'department-posts'})
        if depts is None:
            return totalfeeds

        # Every article gets the same (current) date stamp, so format it once.
        date = strftime(self.timefmt)
        for ditem in self.departments:
            heading = depts.find('h3', attrs={'id': 'department-' + ditem})
            if heading is None:
                continue
            articles = self._department_articles(heading, date)
            totalfeeds.append((ditem.capitalize(), articles))
        return totalfeeds

    def _department_articles(self, heading, date):
        """Collect the articles of the first ``ul`` that follows *heading*.

        The document is walked element by element starting after *heading*.
        The walk ends at the first ``ul`` (whose ``li`` links are collected),
        at the next ``h3`` (the following department), or at the end of the
        document, whichever comes first.
        """
        articles = []
        node = heading.next
        while node is not None:
            # Text nodes carry no usable tag name; treat them as nameless so
            # they are simply stepped over.
            tag_name = getattr(node, 'name', None)
            if tag_name == 'h3':
                break
            if tag_name == 'ul':
                for item in node.findAll('li'):
                    link = item.find('a')
                    href = link.get('href') if link is not None else None
                    if href is None:
                        continue
                    articles.append({
                        'title': self.tag_to_string(link),
                        'date': date,
                        'url': self.makeurl(href),
                        'description': '',
                    })
                break
            # Always advance, whatever kind of node this was; otherwise the
            # walk would spin forever on the first node that is neither the
            # list nor the next heading.
            node = node.next
        return articles

    def get_cover_url(self):
        """Return the absolute URL of the cover image, or None if not found.

        The cover is the first ``img`` inside the first ``a`` of the
        ``spread-image`` div on the index page.
        """
        soup = self.index_to_soup(self.index)
        cover_item = soup.find('div', attrs={'class': 'spread-image'})
        anchor = cover_item.a if cover_item is not None else None
        image = anchor.img if anchor is not None else None
        src = image.get('src') if image is not None else None
        if src is None:
            return None
        return self.makeurl(src)

    def print_version(self, url):
        """Return *url* with trailing slashes removed and ``/all/1`` appended."""
        return url.rstrip('/') + '/all/1'

    def preprocess_html(self, soup):
        """Remove every inline ``style`` attribute from *soup* and return it."""
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    def makeurl(self, addr):
        """Turn a link found on the site into an absolute URL.

        Protocol-relative links (``//host/path``) only get a scheme; any
        other link that does not start with ``http`` is prefixed with the
        site root. A run of trailing slashes is reduced to a single slash.
        """
        if addr.startswith('//') and addr.strip('/'):
            # The link already names its host, so prefixing the site root
            # would produce a bogus path on www.wired.com.
            addr = 'http:' + addr
        elif not addr.startswith('http'):
            addr = 'http://www.wired.com' + addr
        while addr.endswith('//'):
            addr = addr[:-1]
        return addr