diff --git a/recipes/ap.recipe b/recipes/ap.recipe index 311b7b1494..b3f809fe33 100644 --- a/recipes/ap.recipe +++ b/recipes/ap.recipe @@ -1,82 +1,78 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 -# License: GPLv3 Copyright: 2017, Kovid Goyal +""" +https://apnews.com +""" -import json - -from calibre.utils.date import parse_date, utcnow from calibre.web.feeds.news import BasicNewsRecipe, classes class AssociatedPress(BasicNewsRecipe): - - title = u'Associated Press' - description = 'Global news' - __author__ = 'Kovid Goyal' - use_embedded_content = False + title = 'Associated Press' + description = ( + 'Read the latest headlines, breaking news, and videos at APNews.com, the definitive ' + 'source for independent journalism from every corner of the globe. Articles from Front Page.' + ) + __author__ = 'unkn0wn' language = 'en' encoding = 'utf-8' no_stylesheets = True - ignore_duplicate_articles = {'title', 'url'} + remove_javascript = True + ignore_duplicate_articles = {'url'} remove_empty_feeds = False - oldest_article = 1.5 + remove_attributes = ['style', 'height', 'width'] + simultaneous_downloads = 1 + cover_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Associated_Press_logo_2012.svg/662px-Associated_Press_logo_2012.svg.png' keep_only_tags = [ - classes('Page-headline Page-lead Page-storyBody Page-authorinfo'), + classes('StoryPage-lede-content Page-lead Page-byline-info RichTextStoryBody'), ] remove_tags = [ - classes('Page-actions Enhancement'), - dict(name='source'), + classes('displayNone Advertisement HTMLModuleEnhancement videoSlide'), + dict( + name=[ + 'source', + 'button', + 'svg', + 'template', + 'bsp-jw-player', + 'astro-island', + 'iframe', + 'document', + ] + ), + dict(attrs={'data-parsely-title': 'Related Stories'}), ] - remove_attributes = ['srcset'] + extra_css = ''' - .Figure-caption { - font-style: italic; - font-size: smaller; - margin-left: 1rem; margin-right: 1rem; - } + .Page-byline-info, .Page-breadcrumbs, .CarouselSlide-info, .Figure-caption { font-size:small; } + img {display:block; margin:0 auto;} + em { color: #202020; } ''' def parse_index(self): feeds = [] - limit = self.test[0] if self.test else 100 - for front in ( - 'topnews sports politics entertainment usnews oddities' - ' Travel technology lifestyle business Health science intlnews'.split() + soup = self.index_to_soup('https://apnews.com') + for a in soup.findAll( + 'a', + attrs={'href': lambda x: x and x.startswith('https://apnews.com/article/')}, ): - name = { - 'topnews': 'Top News', - 'intlnews': 'International', - 'usnews': 'U.S. News' - }.get(front, front).capitalize() - feeds.append([name, self.parse_section(front)]) - if len(feeds) >= limit: - break - return feeds + url = a['href'] + title = self.tag_to_string(a) + self.log(title, '\n\t', url) + feeds.append({'title': title, 'url': url}) + return [('Articles', feeds)] - def parse_section(self, front): - url = 'https://afs-prod.appspot.com/api/v2/feed/tag?tags=apf-' + front - self.log('Processing section:', front, 'at', url) - data = self.index_to_soup(url, raw=True) - data = json.loads(data) - cards = data.get('cards', ()) - articles = [] - - for card in cards: - for article in card['contents']: - url = article['localLinkUrl'] - title = article.get('headline', article.get('flattenedFirstWords')) - if not title: - continue - title = title.split('\u2014')[-1] - updated = article.get('updated') - if updated: - updated = parse_date(updated, assume_utc=True) - delta = utcnow() - updated - if (delta.days*24*3600 + delta.seconds) > 24*3600*self.oldest_article: - self.log('Skipping', title, 'as it is too old') - continue - self.log('\tFound article:', title, 'at', url) - articles.append({'title': title, 'url': url}) - self.log('') - return articles + def preprocess_html(self, soup): + for st in soup.findAll(**classes('CarouselSlide-infoDescription')): + if p := st.find('p'): + p.name = 'span' + for h in soup.findAll(['h2', 'h3']): + h.name = 'h4' + for img in soup.findAll('img', attrs={'srcset': True}): + img['src'] = img['srcset'].split()[0] + for img_ in soup.findAll( + 'img', attrs={'data-flickity-lazyload-srcset': True, 'srcset': False} + ): + img_['src'] = img_['data-flickity-lazyload-srcset'].split()[0] + return soup diff --git a/recipes/horizons.recipe b/recipes/horizons.recipe index 6103be063d..23e4ccd391 100644 --- a/recipes/horizons.recipe +++ b/recipes/horizons.recipe @@ -23,7 +23,7 @@ class horizons(BasicNewsRecipe): remove_attributes = ['style', 'height', 'width'] masthead_url = 'https://www.cirsd.org/bundles/olpublic/images/horizons-logo.jpg' ignore_duplicate_articles = {'url'} - extra_css = 'em{color:#404040;}' + extra_css = 'em{color:#202020;}' simultaneous_downloads = 1 keep_only_tags = [dict(name='div', attrs={'class': 'article'})] @@ -40,7 +40,7 @@ class horizons(BasicNewsRecipe): } def preprocess_raw_html(self, raw, *a): - return raw.replace('

 

', '') + return raw.replace('

 

', '').replace('

 

', '') def get_browser(self): return BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)