diff --git a/recipes/cnetnews.recipe b/recipes/cnetnews.recipe
index d4439b0369..a3b68f973c 100644
--- a/recipes/cnetnews.recipe
+++ b/recipes/cnetnews.recipe
@@ -19,52 +19,71 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class CnetNews(BasicNewsRecipe):
     title = 'CNET News'
-    __author__ = 'Darko Miletic updated by DrMerry and further updated by Bonni Salles'
+    __author__ = 'Kovid Goyal'
     description = 'Tech news and business reports by CNET News. Focused on information technology, core topics include computers, hardware, software, networking, and Internet media.' # noqa
     publisher = 'CNET'
     category = 'news, IT, USA'
-    encoding = 'utf-8' # AGe 2014-03-28
-    language = 'en' # AGe 2014-03-28
+    encoding = 'utf-8'
+    language = 'en'
     oldest_article = 7
     max_articles_per_feed = 100
     compress_news_images = True
     ignore_duplicate_articles = {'title', 'url'}
     remove_empty_feeds = True
     use_embedded_content = False
+    no_stylesheets = True
+    recursions = 1
 
-    cover_url = 'http://reviews.cnet.com/i/ff/wp/logo_cnet.gif'
+    remove_tags = [
+        dict(name='div', attrs={'id':'livefyreContainer'}),
+        dict(section=['tag', 'shortcodeRelatedLinks']),
+        dict(attrs={'class':['ad-inline-product-carousel-top', 'author-social', 'row controls', 'launchGallery', 'topWrap']}),
+        dict(name='a', attrs={'class':'subHead', 'data-component':'imageGalleryModal'}),
+        dict(attrs={'data-component':'sharebar'}),
+        dict(name=['link', 'meta']),
+    ]
 
-    conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-    }
-
-    remove_tags = [ # AGe 2014-03-28, new
-        dict(name='nav', attrs={'id':'primaryNav'}),
-        dict(name='nav', attrs={'id':'footerMap'}),
-        dict(name='section', attrs={'class':'dontMissRight'}),
-        dict(name='ul', attrs={'class':'sharebar inline-view'}),
-        dict(name='div', attrs={'id':'livefyreContainer'}),
-        dict(name='div', attrs={'class':'productList ReviewListing'}),
-        dict(name='div', attrs={'class':'col-4'}),
-        dict(name='div',
-             attrs={'id':'legal'}),
-    ]
-
-    remove_tags_before = dict(name='article', id="article-body")
+    keep_only_tags = [
+        dict(itemprop='headline'),
+        dict(id=["article-body", 'cnetReview']),
+        dict(attrs={'class':'deal-content'}),
+    ]
 
     feeds = [
-        # (u'All of CNET', u'http://www.cnet.com/rss/all/'),
-        (u'CNET News', u'http://www.cnet.com/rss/news/'),
-        (u'CNET Reviews', u'http://www.cnet.com/rss/reviews/'),
-        (u'CNET Video', u'http://www.cnet.com/rss/video/'),
-        (u'CNET How To', u'http://www.cnet.com/rss/how-to/'),
-        (u'CNET Deals', u'http://www.cnet.com/rss/deals/'),
-        (u'CNET iPhone Update', u'http://www.cnet.com/rss/iphone-update/'),
-        (u'CNET Crave', u'http://www.cnet.com/rss/crave/'),
-        (u'CNET Car Tech', u'http://www.cnet.com/rss/car-tech/'),
-        (u'CNET Android Update', u'http://www.cnet.com/rss/android-update/'),
-        (u'CNET Gaming', u'http://www.cnet.com/rss/gaming/'),
-        (u'CNET Cheapskate', u'http://www.cnet.com/rss/cheapskate/'),
-    ]
+        # (u'All of CNET', u'http://www.cnet.com/rss/all/'),
+        (u'CNET News', u'http://www.cnet.com/rss/news/'),
+        (u'CNET Reviews', u'http://www.cnet.com/rss/reviews/'),
+        (u'CNET How To', u'http://www.cnet.com/rss/how-to/'),
+        (u'CNET iPhone Update', u'http://www.cnet.com/rss/iphone-update/'),
+        (u'CNET Crave', u'http://www.cnet.com/rss/crave/'),
+        (u'CNET Car Tech', u'http://www.cnet.com/rss/car-tech/'),
+        (u'CNET Android Update', u'http://www.cnet.com/rss/android-update/'),
+        (u'CNET Gaming', u'http://www.cnet.com/rss/gaming/'),
+        (u'CNET Cheapskate', u'http://www.cnet.com/rss/cheapskate/'),
+    ]
+
+    def get_article_url(self, article):
+        ans = BasicNewsRecipe.get_article_url(self, article)
+        if ans and ('cnet.com/videos/' in ans or 'cnet.com/pictures/' in ans):
+            ans = None
+        return ans
+
+    def is_link_wanted(self, url, tag):
+        return tag.findParent(attrs={'class':'pageNav'}) is not None
+
+    def preprocess_html(self, soup):
+        for attr in 'data-original data-src'.split():
+            for img in soup.findAll('img', attrs={attr:True}):
+                img['src'] = \
+                    img[attr].strip()
+        return soup
+
+    def postprocess_html(self, soup, first_fetch):
+        for div in soup.findAll(attrs={'class':'pagination'}):
+            div.extract()
+        for div in soup.findAll(attrs={'data-update-area':'pagination'}):
+            div.extract()
+        for h1 in soup.findAll(itemprop='headline'):
+            h1.extract()
+        if first_fetch:
+            soup.find('body').insert(1, h1)
+        return soup