#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
https://apnews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe, classes


class AssociatedPress(BasicNewsRecipe):
    title = 'Associated Press'
    description = (
        'Read the latest headlines, breaking news, and videos at APNews.com, the definitive '
        'source for independent journalism from every corner of the globe. Articles from Front Page.'
    )
    __author__ = 'unkn0wn'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    ignore_duplicate_articles = {'url'}
    remove_empty_feeds = False
    remove_attributes = ['style', 'height', 'width']
    simultaneous_downloads = 1
    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Associated_Press_logo_2012.svg/662px-Associated_Press_logo_2012.svg.png'

    keep_only_tags = [
        classes('StoryPage-lede-content Page-lead Page-byline-info RichTextStoryBody'),
    ]

    remove_tags = [
        classes('displayNone Advertisement HTMLModuleEnhancement AudioEnhancement'),
        dict(
            name=[
                'source', 'button', 'svg', 'template', 'video', 'astro-island',
                'iframe', 'document',
            ]
        ),
        dict(attrs={'data-parsely-title': 'Related Stories'}),
    ]

    extra_css = '''
        .Page-byline-info, .Page-breadcrumbs, .CarouselSlide-info, .Figure-caption { font-size:small; }
        img { display:block; margin:0 auto; }
        em { color: #202020; }
    '''

    def parse_index(self):
        # Collect article links directly from the AP front page
        feeds = []
        soup = self.index_to_soup('https://apnews.com')
        for a in soup.findAll(
            'a',
            attrs={'href': lambda x: x and x.startswith('https://apnews.com/article/')},
        ):
            url = a['href']
            title = self.tag_to_string(a)
            self.log(title, '\n\t', url)
            feeds.append({'title': title, 'url': url})
        return [('Articles', feeds)]

    def preprocess_html(self, soup):
        # Replace embedded JW players with their static poster image
        for v in soup.findAll('bsp-jw-player', attrs={'poster': True}):
            v.name = 'img'
            v.attrs = {'src': v.get('poster', '')}
        # Render carousel and figure captions as inline text
        for st in soup.findAll(**classes('CarouselSlide-infoDescription Figure-caption')):
            if p := st.find('p'):
                p.name = 'span'
        # Demote subheadings so the article title stands out
        for h in soup.findAll(['h2', 'h3']):
            h.name = 'h4'
        # Use the first candidate URL from srcset / lazy-load attributes
        for img in soup.findAll('img', attrs={'srcset': True}):
            img['src'] = img['srcset'].split()[0]
        for img_ in soup.findAll(
            'img', attrs={'data-flickity-lazyload-srcset': True, 'srcset': False}
        ):
            img_['src'] = img_['data-flickity-lazyload-srcset'].split()[0]
        return soup