From 824a228e99bad18f2cddad3574e960e60c9035ee Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 13 Jul 2023 20:32:40 +0530
Subject: [PATCH] Update Associated Press

---
 recipes/ap.recipe | 44 +++++++++++++++++---------------------------
 1 file changed, 17 insertions(+), 27 deletions(-)
diff --git a/recipes/ap.recipe b/recipes/ap.recipe
index 0eb30dc832..2bfa2d0015 100644
--- a/recipes/ap.recipe
+++ b/recipes/ap.recipe
@@ -2,35 +2,12 @@
 # vim:fileencoding=utf-8
 # License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
 
-from __future__ import absolute_import, division, print_function, unicode_literals
 import json
-import re
 
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 from calibre.utils.date import utcnow, parse_date
 
 
-def extract_article(raw):
-    ms = re.search(r"window\['titanium-state'\]", raw)
-    me = re.search(r"window\['titanium-cacheConfig'\]", raw)
-    raw = raw[ms.start():me.start()]
-    raw = raw[raw.find('{'):]
-    data = json.loads(raw)
-    data = tuple(data['content']['data'].values())[0]
-    story_html = '<h1>' + data['headline'] + '</h1>\n'
-    story_html += '<p>' + data['bylines'] + '</p>\n'
-    story_html += '<p>' + data['published'] + '</p>\n'
-    for m in data.get('media', ()):
-        sizes = m['imageRenderedSizes']
-        if sizes:
-            sz = 800 if 800 in sizes else sizes[0]
-            url = m['gcsBaseUrl'] + '{}{}'.format(sz, m['imageFileExtension'])
-            story_html += '\n<div><img src="' + url + '"/></div>\n'
-            story_html += '<div>' + m['caption'] + '</div>\n'
-    story_html += '\n<div>' + data['storyHTML'] + '</div>'
-    return '<html><body>' + story_html + '</body></html>'
-
-
 class AssociatedPress(BasicNewsRecipe):
 
     title = u'Associated Press'
@@ -44,6 +21,22 @@ class AssociatedPress(BasicNewsRecipe):
     remove_empty_feeds = False
     oldest_article = 1.5
 
+    keep_only_tags = [
+        classes('Page-headline Page-lead Page-storyBody Page-authorinfo'),
+    ]
+    remove_tags = [
+        classes('Page-actions Enhancement'),
+        dict(name='source'),
+    ]
+    remove_attributes = ['srcset']
+    extra_css = '''
+    .Figure-caption {
+    font-style: italic;
+    font-size: smaller;
+    margin-left: 1rem; margin-right: 1rem;
+    }
+    '''
+
     def parse_index(self):
         feeds = []
         limit = self.test[0] if self.test else 100
@@ -87,6 +80,3 @@ class AssociatedPress(BasicNewsRecipe):
                 articles.append({'title': title, 'url': url})
         self.log('')
         return articles
-
-    def preprocess_raw_html(self, raw_html, url):
-        return extract_article(raw_html)