Update Associated Press

This commit is contained in:
Kovid Goyal 2021-06-08 14:47:00 +05:30
parent 9040501684
commit cf2c096db0
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -3,45 +3,44 @@
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json import json
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
def classes(classes): def extract_article(raw):
q = frozenset(classes.split(' ')) ms = re.search(r"window\['titanium-state'\]", raw)
return dict( me = re.search(r"window\['titanium-cacheConfig'\]", raw)
attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)} raw = raw[ms.start():me.start()]
) raw = raw[raw.find('{'):]
data = json.loads(raw)
data = tuple(data['content']['data'].values())[0]
def new_tag(soup, name, attrs=()): story_html = '<h1>' + data['headline'] + '</h1>\n'
impl = getattr(soup, 'new_tag', None) story_html += '<p>' + data['bylines'] + '</p>\n'
if impl is not None: story_html += '<p>' + data['published'] + '</p>\n'
return impl(name, attrs=dict(attrs)) for m in data.get('media', ()):
return Tag(soup, name, attrs=attrs or None) sizes = m['imageRenderedSizes']
if sizes:
sz = 800 if 800 in sizes else sizes[0]
url = m['gcsBaseUrl'] + '{}{}'.format(sz, m['imageFileExtension'])
story_html += '\n<div><img src="' + url + '"/></div>\n'
story_html += '<div>' + m['caption'] + '</div>\n'
story_html += '\n<div>' + data['storyHTML'] + '</div>'
return '<html><body>' + story_html + '</body></html>'
class AssociatedPress(BasicNewsRecipe): class AssociatedPress(BasicNewsRecipe):
title = u'Associated Press' title = u'Associated Press'
description = 'Global news' description = 'Global news'
__author__ = 'Krittika Goyal' __author__ = 'Kovid Goyal'
use_embedded_content = False use_embedded_content = False
language = 'en' language = 'en'
encoding = 'utf-8' encoding = 'utf-8'
no_stylesheets = True no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
remove_empty_feeds = False remove_empty_feeds = False
keep_only_tags = [
classes('CardHeadline LeadFeature Article'),
]
remove_tags = [
classes('ad-placeholder modalImageButton modalVideoButton'),
dict(name=['button', 'svg']),
]
def parse_index(self): def parse_index(self):
feeds = [] feeds = []
@ -80,10 +79,5 @@ class AssociatedPress(BasicNewsRecipe):
self.log('') self.log('')
return articles return articles
def preprocess_html(self, soup, *a): def preprocess_raw_html(self, raw_html, url):
for meta in soup.findAll('meta', attrs=dict(name="twitter:image:alt")): return extract_article(raw_html)
for div in soup.findAll(**classes('LeadFeature')):
img = new_tag(soup, 'img')
img['src'] = meta['content']
div.insert(0, img)
return soup