__license__ = 'GPL v3'
__copyright__ = '2010, Franco Venturi '
'''
spectrum.ieee.org
'''

from calibre.web.feeds.news import BasicNewsRecipe
from string import capwords

try:
    from urllib.parse import urljoin
except ImportError:  # Python 2: urljoin lives in the urlparse module
    from urlparse import urljoin


class IEEESpectrum(BasicNewsRecipe):
    """Recipe that builds an issue of IEEE Spectrum from its magazine index.

    The index page at ``index`` is scraped by ``parse_index`` for section
    headings, article links and article blurbs; ``preprocess_html`` then
    rewrites relative links inside each downloaded article.
    """

    title = 'IEEE Spectrum'
    __author__ = 'Franco Venturi'
    description = 'Electronics News from IEEE'
    publisher = 'IEEE'
    category = 'news, electronics, IT, computer science'
    oldest_article = 32
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    # Page listing the articles of the issue; also the base for relative URLs.
    index = 'http://spectrum.ieee.org/magazine/'
    masthead_url = 'http://spectrum.ieee.org/images/logo_hdr.png'

    remove_javascript = True
    remove_tags = [dict(name={'script':True, 'object':True})]
    remove_attributes = ['height','width','alt']
    keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}),
                      dict(attrs={'id':'artBody'})]

    def parse_index(self):
        """Scrape the magazine index page into sections of articles.

        Side effects: sets ``self.cover_url`` when a cover image is found and
        sets ``self.timefmt`` from the issue title.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a list of dicts with the keys ``title``, ``url``, ``date``,
            ``description`` and ``content``.
        """
        soup = self.index_to_soup(self.index)

        img = soup.find('img', image='cover.gif', src=True)
        if img is not None:
            self.cover_url = 'http://spectrum.ieee.org'+img['src']

        content = soup.find(id='gnrlContent')
        title = content.find(attrs={'class':'style4'}).string.strip()
        # First two words of the issue title -- presumably month and year;
        # verify against the live page.
        date = ' '.join(title.split()[0:2])
        self.timefmt = ' [' + date + ']'

        contents = []
        # Article that a following 'lstngBody' blurb belongs to; None when the
        # current section has no usable article yet.
        current_article = None
        wanted = {'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}
        for tag in content.findAll(attrs=wanted):
            kind = tag['class']
            if kind == 'style2':
                # Section heading: start a new, empty section.
                contents.append((capwords(tag.renderContents().strip()), []))
                current_article = None
            elif kind == 'lstngTitle':
                link = tag.findPrevious('a')
                href = link.get('href') if link is not None else None
                if not href:
                    # No link to follow: skip the entry, and make sure its
                    # blurb is not attached to the previous article.
                    current_article = None
                    continue
                if not contents:
                    # Article listed before any section heading: file it under
                    # a section named after the recipe instead of crashing.
                    contents.append((self.title, []))
                # NOTE(review): the '/0' suffix presumably selects a specific
                # page view of the article -- confirm against the site.
                url = urljoin(self.index, href) + '/0'
                current_article = {'title': tag.renderContents().strip(),
                                   'url': url,
                                   'date': date,
                                   'description': '',
                                   'content': ''
                                   }
                contents[-1][1].append(current_article)
            elif kind == 'lstngBody':
                if current_article is not None:
                    current_article['description'] = tag.renderContents().strip()

        return contents

    def preprocess_html(self, soup):
        """Make every non-``http`` link in the article absolute.

        Links are resolved against ``self.index``. Anchors without an ``href``
        attribute (e.g. named anchors) are left untouched.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for a in soup.findAll('a', href=True):
            if not a['href'].lower().startswith('http'):
                a['href'] = urljoin(self.index, a['href'])
        return soup