#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
https://apnews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe, classes


class AssociatedPress(BasicNewsRecipe):

    title = 'Associated Press'
    description = (
        'Read the latest headlines, breaking news, and videos at APNews.com, the definitive '
        'source for independent journalism from every corner of the globe. Articles from Front Page.'
    )
    __author__ = 'unkn0wn'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    ignore_duplicate_articles = {'url'}
    remove_empty_feeds = False
    remove_attributes = ['style', 'height', 'width']
    simultaneous_downloads = 1
    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Associated_Press_logo_2012.svg/662px-Associated_Press_logo_2012.svg.png'

    keep_only_tags = [
        classes('StoryPage-lede-content Page-lead Page-byline-info RichTextStoryBody'),
    ]
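
    # Keep only the story lede, byline block, and rich-text body; everything
    # matched by remove_tags below (ads, media players, embeds and the
    # "Related Stories" module) is stripped from the download.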
    remove_tags = [
        classes('displayNone Advertisement HTMLModuleEnhancement AudioEnhancement'),
        dict(
            name=[
                'source',
                'button',
                'svg',
                'template',
                'video',
                'astro-island',
                'iframe',
                'document',
            ]
        ),
        dict(attrs={'data-parsely-title': 'Related Stories'}),
    ]

    extra_css = '''
        .Page-byline-info, .Page-breadcrumbs, .CarouselSlide-info, .Figure-caption { font-size:small; }
        img {display:block; margin:0 auto;}
        em { color: #202020; }
    '''
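
    # parse_index builds the article list by scraping the AP front page for
    # links of the form https://apnews.com/article/... rather than using RSS;
    # every match becomes one entry in a single 'Articles' feed, with
    # duplicate URLs filtered out by ignore_duplicate_articles.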
    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('https://apnews.com')
        for a in soup.findAll(
            'a',
            attrs={'href': lambda x: x and x.startswith('https://apnews.com/article/')},
        ):
            url = a['href']
            title = self.tag_to_string(a)
            self.log(title, '\n\t', url)
            feeds.append({'title': title, 'url': url})
        return [('Articles', feeds)]
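
    # preprocess_html cleans each article up before conversion: JW Player
    # video embeds (<bsp-jw-player>) are replaced by their poster image,
    # caption paragraphs are flattened to spans, h2/h3 headings are demoted
    # to h4, and lazy-loaded srcset attributes are resolved to a plain src so
    # the images actually download.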
    def preprocess_html(self, soup):
        for v in soup.findAll('bsp-jw-player', attrs={'poster': True}):
            v.name = 'img'
            v.attrs = {'src': v.get('poster', '')}
        for st in soup.findAll(**classes('CarouselSlide-infoDescription Figure-caption')):
            if p := st.find('p'):
                p.name = 'span'
        for h in soup.findAll(['h2', 'h3']):
            h.name = 'h4'
        for img in soup.findAll('img', attrs={'srcset': True}):
            img['src'] = img['srcset'].split()[0]
        for img_ in soup.findAll(
            'img', attrs={'data-flickity-lazyload-srcset': True, 'srcset': False}
        ):
            img_['src'] = img_['data-flickity-lazyload-srcset'].split()[0]
        return soup
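
# A quick way to test the recipe locally is calibre's command line converter;
# assuming the file is saved as ap.recipe, something like the following should
# work (--test limits the download to a couple of articles per feed):
#
#   ebook-convert ap.recipe .epub --test -vv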