#!/usr/bin/env python
'''
https://www.pcworld.com/
'''
from calibre.web.feeds.news import BasicNewsRecipe, classes


class pcWorld(BasicNewsRecipe):
    __author__ = 'unkn0wn'
    description = 'PCWorld helps you navigate the PC ecosystem to find the products you want and the advice you need to get the job done.'
    title = 'PCWorld'
    publisher = 'IDG Communications'
    language = 'en'
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url'}
    remove_javascript = True
    resolve_internal_links = True
    remove_empty_feeds = True
    remove_attributes = ['height', 'width']
    extra_css = '''
        .entry-meta, .imageCredit {font-size:small;}
        .entry-eyebrow, .article_author_box_bio {font-size:small; color:#404040;}
        .subheadline {font-style:italic; color:#202020;}
    '''

    keep_only_tags = [
        classes('entry-header post-thumbnail'),
        dict(name='div', attrs={'id': 'link_wrapped_content'}),
        classes('article_author_box_bio'),
    ]

    def parse_index(self):
        # (section title, URL slug) pairs for the sections to include
        section_list = [
            ('PC & Components', 'pc-components'),
            ('Laptops', 'laptops'),
            ('Mobile', 'mobile'),
            ('How-To', 'howto'),
            ('Gaming', 'gaming'),
            ('Windows', 'windows'),
            ('Best-Picks', 'best-picks'),
            ('Reviews', 'reviews'),
            ('Security', 'security'),
            ('Smart Tech', 'smart-tech'),
            ('Software', 'software'),
            ('WiFi & Networks', 'wifi-networks'),
            ('Deals', 'deals'),
            ('Business', 'business'),
            ('Entertainment', 'entertainment'),
        ]
        feeds = []

        # For each section, fetch its index page and collect article URLs
        for section_title, slug in section_list:
            section_url = 'https://www.pcworld.com/' + slug
            self.log(section_title, section_url)
            soup = self.index_to_soup(section_url)
            articles = self.articles_from_soup(soup)
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def articles_from_soup(self, soup):
        ans = []
        feed = soup.find(
            'div', attrs={'class': lambda x: x and 'articleFeed-inner' in x.split()}
        )
        if feed is None:  # section layout changed or page failed to load
            return ans
        for item in feed.findAll('div', attrs={'class': 'item-text-inner'}):
            a = item.find('h3').find('a', href=True)
            title = self.tag_to_string(a)
            url = a['href']
            desc = ''
            if span := item.find(attrs={'class': 'item-excerpt'}):
                desc = self.tag_to_string(span)
            if byline := item.find(attrs={'class': 'item-byline'}):
                desc = self.tag_to_string(byline) + ' | ' + desc
            if eye := item.find(
                attrs={'class': lambda x: x and 'item-eyebrow' in x.split()}
            ):
                desc = self.tag_to_string(eye) + ' | ' + desc
            if itdate := item.find(attrs={'class': 'item-date'}):
                date = self.tag_to_string(itdate)
                # 'min' matches 'mins ago', 'hour' also matches 'hours',
                # 'day' also matches 'days'
                check = ('min', 'hour', 'day')
                # skip articles older than a week
                if not any(x in date for x in check):
                    continue
            if not url or not title:
                continue
            self.log('\t', title, '\n\t', desc, '\n\t\t', url)
            ans.append({'title': title, 'url': url, 'description': desc})
        return ans

    def get_cover_url(self):
        soup = self.index_to_soup(
            'https://www.magzter.com/US/IDG-Consumer-and-SMB,-Inc./PCWorld/Computer-&-Mobile/'
        )
        for citem in soup.findAll(
            'meta', content=lambda s: s and s.endswith('view/3.jpg')
        ):
            return citem['content']
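

# A quick way to exercise this recipe locally, assuming the calibre
# command-line tools are installed: save this file as PCWorld.recipe
# (the filename is arbitrary) and run calibre's ebook-convert on it.
# The --test flag limits the run to a couple of feeds and a couple of
# articles per feed, which keeps iteration fast while debugging the
# CSS selectors above; -vv prints verbose logs, including the
# per-article lines emitted by self.log().
#
#   ebook-convert PCWorld.recipe output.epub --test -vv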