__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic '
'''
Changelog:
2011-09-24
Changed cover (drMerry)
2011-10-13
Updated Cover (drMerry)
2014-03-28
Update by Armin Geller
news.cnet.com
further updated by Bonni Salles
My updates use the current rss feeds for CNet and eliminates extraneous areas and sets the oldest_article to 1
as the feed generally has the present day and one day before. You may want to set it to download daily for this reason.
'''

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a tag spec that matches elements carrying any of the given classes.

    ``classes`` is a space-separated string of CSS class names. The result is
    a dict with a single ``attrs`` entry whose ``class`` value is a predicate:
    called with a class attribute string, it returns a truthy value when that
    string shares at least one whitespace-separated name with ``classes``,
    and a falsy value otherwise (including for an empty or missing attribute).
    """
    wanted = frozenset(classes.split(' '))

    def shares_wanted_class(value):
        # Short-circuit on a missing/empty attribute before splitting it.
        return value and frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': shares_wanted_class}}


class CnetNews(BasicNewsRecipe):
    """Recipe that downloads articles from the CNET RSS feeds listed in ``feeds``."""

    title = 'CNET News'
    __author__ = 'Kovid Goyal'
    description = 'Tech news and business reports by CNET News. Focused on information technology, core topics include computers, hardware, software, networking, and Internet media.'  # noqa
    publisher = 'CNET'
    category = 'news, IT, USA'
    encoding = 'utf-8'
    language = 'en'

    # Download behaviour.
    oldest_article = 7
    max_articles_per_feed = 100
    compress_news_images = True
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    use_embedded_content = False
    no_stylesheets = True
    # One level of link following; which links qualify is decided by
    # is_link_wanted() below.
    recursions = 1

    # Tag specs for elements stripped from each downloaded page.
    remove_tags = [
        dict(name='div', attrs={'id': 'livefyreContainer'}),
        dict(section=['tag', 'shortcodeRelatedLinks']),
        dict(attrs={'class': [
            'ad-inline-product-carousel-top',
            'author-social',
            'row controls',
            'launchGallery',
            'topWrap',
        ]}),
        dict(name='a', attrs={'class': 'subHead', 'data-component': 'imageGalleryModal'}),
        dict(attrs={'data-component': 'sharebar'}),
        dict(name=['link', 'meta']),
        classes('playerControls video share-button'),
    ]

    # Tag specs for the only elements retained from each downloaded page.
    keep_only_tags = [
        dict(name='h1'),
        dict(section='author'),
        dict(id=["article-body", 'cnetReview']),
        dict(attrs={'class': 'deal-content'}),
    ]

    feeds = [
        # (u'All of CNET', u'http://www.cnet.com/rss/all/'),
        (u'CNET News', u'http://www.cnet.com/rss/news/'),
        (u'CNET Reviews', u'http://www.cnet.com/rss/reviews/'),
        (u'CNET How To', u'http://www.cnet.com/rss/how-to/'),
        (u'CNET iPhone Update', u'http://www.cnet.com/rss/iphone-update/'),
        (u'CNET Crave', u'http://www.cnet.com/rss/crave/'),
        (u'CNET Car Tech', u'http://www.cnet.com/rss/car-tech/'),
        (u'CNET Android Update', u'http://www.cnet.com/rss/android-update/'),
        (u'CNET Gaming', u'http://www.cnet.com/rss/gaming/'),
        (u'CNET Cheapskate', u'http://www.cnet.com/rss/cheapskate/'),
    ]

    def get_article_url(self, article):
        """Return the article URL, or None for video and picture pages.

        The URL is obtained from ``BasicNewsRecipe.get_article_url``. A falsy
        result is passed through unchanged; a URL containing
        ``cnet.com/videos/`` or ``cnet.com/pictures/`` is replaced by None.
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        if not url:
            return url
        skipped_sections = ('cnet.com/videos/', 'cnet.com/pictures/')
        if any(marker in url for marker in skipped_sections):
            return None
        return url

    def is_link_wanted(self, url, tag):
        """Return True only when ``tag`` has an ancestor with class ``pageNav``."""
        page_nav = tag.findParent(attrs={'class': 'pageNav'})
        return page_nav is not None

    def preprocess_html(self, soup):
        """Copy lazy-load image attributes into ``src`` and return ``soup``.

        For every ``img`` carrying ``data-original`` or ``data-src``, the
        stripped attribute value is written to ``src``. ``data-src`` is
        processed last, so it wins when an image has both.
        """
        for lazy_attr in ('data-original', 'data-src'):
            for image in soup.findAll('img', attrs={lazy_attr: True}):
                image['src'] = image[lazy_attr].strip()
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Drop pagination, relocate the headline, blank image sizes; return ``soup``.

        Elements with class ``pagination`` or ``data-update-area="pagination"``
        are removed. Every element with ``itemprop="headline"`` is detached
        and, when ``first_fetch`` is true, re-inserted at index 1 of ``body``;
        otherwise it stays removed. Finally every ``img`` gets empty
        ``height`` and ``width`` attributes.
        """
        pagination_specs = (
            {'class': 'pagination'},
            {'data-update-area': 'pagination'},
        )
        for spec in pagination_specs:
            for node in soup.findAll(attrs=spec):
                node.extract()

        for headline in soup.findAll(itemprop='headline'):
            headline.extract()
            if first_fetch:
                soup.find('body').insert(1, headline)

        for image in soup.findAll('img'):
            # Same assignment order as the chained form: height, then width.
            image['height'] = ''
            image['width'] = ''
        return soup