"""Calibre news recipe that downloads selected Chicago Tribune sections."""
from __future__ import with_statement

__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build ``find``/``findAll`` keyword arguments matching CSS classes.

    :param classes: Space-separated class names.
    :return: A dict of the form ``{'attrs': {'class': predicate}}`` whose
        predicate is truthy when the tested class string shares at least
        one whitespace-separated name with ``classes``.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def absolutize(x):
    """Turn a site-relative path or href into an absolute URL.

    :param x: A path such as ``'news/breaking'`` or ``'/sports/x.html'``,
        or a URL that is already absolute.
    :return: ``x`` unchanged (apart from stripped leading slashes) when it
        already starts with ``http:`` or ``https:``; otherwise ``x``
        resolved against ``https://www.chicagotribune.com/``.
    """
    # Protocol-relative reference ("//host/path"): only the scheme is
    # missing, so it must not be treated as a path on this site.
    if x.startswith('//'):
        return 'https:' + x
    x = x.lstrip('/')
    # Already absolute; prefixing the site root would corrupt the URL.
    if x.startswith(('http:', 'https:')):
        return x
    return 'https://www.chicagotribune.com/' + x


class ChicagoTribune(BasicNewsRecipe):
    """Recipe that builds its index by scraping section front pages."""

    # Recipe metadata and fetch options; their exact semantics are defined
    # by BasicNewsRecipe, not by this file.
    title = 'Chicago Tribune'
    __author__ = 'Kovid Goyal'
    description = 'Politics, local and business news from Chicago'
    language = 'en'

    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    compress_news_images = True
    compress_news_images_auto_size = 5

    # Tag specifications selecting the article markup of interest.
    keep_only_tags = [
        dict(name='h1'),
        classes('byline-container pb-f-utilities-lead-art pb-f-article-gallery'),
        dict(attrs={'data-type': 'text'}),
    ]
    remove_tags = [
        classes('trb_ar_cont trb_ar_main_ad trb_em_r_cc'),
    ]

    def ct_articles(self, slug):
        """Yield one article dict per story found on a section page.

        :param slug: Section path (or URL) passed through ``absolutize``.
        :return: A generator of dicts with the keys ``'title'``,
            ``'description'`` and ``'url'``.
        """
        section_url = absolutize(slug)
        soup = self.index_to_soup(section_url)
        story_spec = classes('pb-f-homepage-story pb-f-homepage-story-feed')
        for div in soup.findAll(**story_spec):
            h = div.find(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
            if h is None:
                # Story container without a heading: nothing to link to.
                continue
            a = h.find('a', href=True)
            if a is None:
                # Heading without a link: skip instead of crashing the
                # whole section on a single malformed teaser.
                continue
            title = self.tag_to_string(a)
            url = absolutize(a['href'])
            self.log('\t', title, url)
            desc = ''
            p = div.find(**classes('preview-text'))
            if p:
                desc = self.tag_to_string(p)
                self.log('\t\t', desc)
            yield {'title': title, 'description': desc, 'url': url}

    def parse_index(self):
        """Collect the articles of every configured section.

        :return: A list of ``(section_title, articles)`` tuples; sections
            for which no article was found are left out.
        """
        feed = []
        for slug, title in (
            ('news/breaking', 'Breaking News'),
            ('sports', 'Sports'),
            ('business', 'Business'),
            ('entertainment', 'Entertainment'),
            ('dining', 'Chicago Dining'),
            ('columns', 'Tribune Voices'),
        ):
            self.log('Found section:', title)
            # Materialize the generator so emptiness can be tested.
            articles = list(self.ct_articles(slug))
            if articles:
                feed.append((title, articles))
        return feed

    def preprocess_html(self, soup):
        """Copy ``data-baseurl`` into ``src`` for every image that has it.

        :param soup: Parsed article document; modified in place.
        :return: The same ``soup`` object.
        """
        for img in soup.findAll('img', attrs={'data-baseurl': True}):
            img['src'] = img['data-baseurl']
        return soup

    def skip_ad_pages(self, soup):
        """Follow the "continue to article" link of an interstitial page.

        :param soup: Parsed document of the fetched page.
        :return: The raw content fetched from the ``href`` of the element
            containing the text ``'click here to continue to article'``,
            or ``None`` when that text or its ``href`` is absent.
        """
        text = soup.find(text='click here to continue to article')
        if text:
            a = text.parent
            url = a.get('href')
            if url:
                return self.index_to_soup(url, raw=True)
        return None