Update Associated Press

This commit is contained in:
Kovid Goyal 2023-07-13 20:32:40 +05:30
parent f77c451716
commit 824a228e99
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -2,35 +2,12 @@
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import json import json
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre.utils.date import utcnow, parse_date from calibre.utils.date import utcnow, parse_date
def _text(value):
    """Return *value* unchanged, or an empty string when it is None."""
    return '' if value is None else value


def extract_article(raw):
    """Build a minimal HTML document from the JSON state embedded in a page.

    The page is expected to contain a JavaScript assignment to
    ``window['titanium-state']`` followed, later in the text, by a reference
    to ``window['titanium-cacheConfig']``.  The first JSON object found
    between those two markers is decoded, and the first entry of its
    ``content.data`` mapping is rendered as headline, byline, publication
    date, images with captions, and story body.

    :param raw: The page source as text.
    :return: An ``<html><body>...</body></html>`` string.
    :raises ValueError: If either marker, the JSON object, or the article
        entry cannot be found (malformed JSON also raises ``ValueError``).
    :raises KeyError: If the article entry lacks ``headline``, ``storyHTML``
        or, for a media item with sizes, ``gcsBaseUrl``/``imageFileExtension``.
    """
    start = raw.find("window['titanium-state']")
    end = raw.find("window['titanium-cacheConfig']")
    if start < 0 or end < 0:
        raise ValueError('Could not find the embedded article state in the page')

    blob = raw[start:end]
    brace = blob.find('{')
    if brace < 0:
        raise ValueError('No JSON object found in the embedded article state')
    # raw_decode stops at the end of the first JSON value, so trailing
    # JavaScript (such as a terminating semicolon) does not break parsing.
    state, _ = json.JSONDecoder().raw_decode(blob, brace)

    entries = tuple(state['content']['data'].values())
    if not entries:
        raise ValueError('The embedded article state contains no article data')
    data = entries[0]

    parts = [
        '<h1>' + data['headline'] + '</h1>\n',
        # Optional fields may be absent or null; render them as empty text.
        '<p>' + _text(data.get('bylines')) + '</p>\n',
        '<p>' + _text(data.get('published')) + '</p>\n',
    ]
    for media in data.get('media') or ():
        sizes = media.get('imageRenderedSizes')
        if not sizes:
            continue
        # Prefer the 800 rendition when offered, else the first one listed.
        size = 800 if 800 in sizes else sizes[0]
        url = media['gcsBaseUrl'] + '{}{}'.format(size, media['imageFileExtension'])
        parts.append('\n<div><img src="' + url + '"/></div>\n')
        parts.append('<div>' + _text(media.get('caption')) + '</div>\n')
    parts.append('\n<div>' + data['storyHTML'] + '</div>')
    return '<html><body>' + ''.join(parts) + '</body></html>'
class AssociatedPress(BasicNewsRecipe): class AssociatedPress(BasicNewsRecipe):
title = u'Associated Press' title = u'Associated Press'
@ -44,6 +21,22 @@ class AssociatedPress(BasicNewsRecipe):
remove_empty_feeds = False remove_empty_feeds = False
oldest_article = 1.5 oldest_article = 1.5
keep_only_tags = [
classes('Page-headline Page-lead Page-storyBody Page-authorinfo'),
]
remove_tags = [
classes('Page-actions Enhancement'),
dict(name='source'),
]
remove_attributes = ['srcset']
extra_css = '''
.Figure-caption {
font-style: italic;
font-size: smaller;
margin-left: 1rem; margin-right: 1rem;
}
'''
def parse_index(self): def parse_index(self):
feeds = [] feeds = []
limit = self.test[0] if self.test else 100 limit = self.test[0] if self.test else 100
@ -87,6 +80,3 @@ class AssociatedPress(BasicNewsRecipe):
articles.append({'title': title, 'url': url}) articles.append({'title': title, 'url': url})
self.log('') self.log('')
return articles return articles
def preprocess_raw_html(self, raw_html, url):
    """Return the article HTML that extract_article() builds from *raw_html*.

    The *url* argument is accepted but not used.
    """
    article_html = extract_article(raw_html)
    return article_html