From 824a228e99bad18f2cddad3574e960e60c9035ee Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 13 Jul 2023 20:32:40 +0530 Subject: [PATCH] Update Associated Press --- recipes/ap.recipe | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/recipes/ap.recipe b/recipes/ap.recipe index 0eb30dc832..2bfa2d0015 100644 --- a/recipes/ap.recipe +++ b/recipes/ap.recipe @@ -2,35 +2,12 @@ # vim:fileencoding=utf-8 # License: GPLv3 Copyright: 2017, Kovid Goyal -from __future__ import absolute_import, division, print_function, unicode_literals import json -import re -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre.utils.date import utcnow, parse_date -def extract_article(raw): - ms = re.search(r"window\['titanium-state'\]", raw) - me = re.search(r"window\['titanium-cacheConfig'\]", raw) - raw = raw[ms.start():me.start()] - raw = raw[raw.find('{'):] - data = json.loads(raw) - data = tuple(data['content']['data'].values())[0] - story_html = '

' + data['headline'] + '

\n' - story_html += '

' + data['bylines'] + '

\n' - story_html += '

' + data['published'] + '

\n' - for m in data.get('media', ()): - sizes = m['imageRenderedSizes'] - if sizes: - sz = 800 if 800 in sizes else sizes[0] - url = m['gcsBaseUrl'] + '{}{}'.format(sz, m['imageFileExtension']) - story_html += '\n
\n' - story_html += '
' + m['caption'] + '
\n' - story_html += '\n
' + data['storyHTML'] + '
' - return '' + story_html + '' - - class AssociatedPress(BasicNewsRecipe): title = u'Associated Press' @@ -44,6 +21,22 @@ class AssociatedPress(BasicNewsRecipe): remove_empty_feeds = False oldest_article = 1.5 + keep_only_tags = [ + classes('Page-headline Page-lead Page-storyBody Page-authorinfo'), + ] + remove_tags = [ + classes('Page-actions Enhancement'), + dict(name='source'), + ] + remove_attributes = ['srcset'] + extra_css = ''' + .Figure-caption { + font-style: italic; + font-size: smaller; + margin-left: 1rem; margin-right: 1rem; + } + ''' + def parse_index(self): feeds = [] limit = self.test[0] if self.test else 100 @@ -87,6 +80,3 @@ class AssociatedPress(BasicNewsRecipe): articles.append({'title': title, 'url': url}) self.log('') return articles - - def preprocess_raw_html(self, raw_html, url): - return extract_article(raw_html)