This commit is contained in:
Kovid Goyal 2025-05-13 09:49:44 +05:30
commit 3ed06bcbac
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 51 additions and 5 deletions

View File

@ -110,14 +110,15 @@ def parse_article_json(root, abort_article):
lines.extend(serialize_image(block))
elif bt == 'text':
lines.extend(serialize_text(block))
return '<html><body id="main-content">' + '\n'.join(lines) + '</body></html>'
return '<html><body id="main-content"><article>' + '\n'.join(lines) + '</article></body></html>'
def parse_raw_html(html, abort_article):
q = '>window.__INITIAL_DATA__="{'
idx = html.find(q)
if idx < 0:
raise ValueError('Failed to find JSON')
print('Failed to find JSON')
return html
data = html[idx + len(q) - 2:]
idx = data.find('}";</script>')
data = data[:idx+2]
@ -301,3 +302,25 @@ class BBCNews(BasicNewsRecipe):
def preprocess_raw_html(self, raw_html, url):
    """Turn the downloaded page markup into article HTML.

    Delegates to ``parse_raw_html``, handing it the raw markup together with
    this recipe's ``abort_article`` callable. ``url`` is accepted as part of
    the hook signature but is not used here.

    Returns whatever ``parse_raw_html`` returns.
    """
    return parse_raw_html(raw_html, self.abort_article)
# Restrict the output to the <article> element; parse_article_json wraps the
# serialized content in one.
keep_only_tags = [{'name': 'article'}]

# Elements dropped from the article: interactive/embedded tags by name, and
# blocks selected by their data-component attribute.
remove_tags = [
    {'name': ['button', 'svg', 'iframe']},
    {'attrs': {'data-component': ['ad-slot', 'tags', 'links-block']}},
]

# Presentational attributes stripped from the remaining markup.
remove_attributes = ['style', 'height', 'width']
no_stylesheets = True

# Render figures, bylines, captions and image blocks in a smaller font.
extra_css = '''
figure, [data-component="byline-block"], [data-component="caption-block"], [data-component="image-block"] { font-size:small; }
'''
def preprocess_html(self, soup):
    """Drop placeholder images and reduce every other <img> to its ``src``.

    An <img> whose ``src`` ends with ``placeholder.png`` is removed from the
    tree. Every remaining <img> has its attributes replaced by a single
    ``src`` entry (an empty string when it had no ``src``).

    The ``soup`` object is modified in place and returned.
    """
    # A single pass does both jobs. The images are collected into a list
    # first so that removing one cannot disturb the iteration.
    for img in list(soup.findAll('img')):
        src = img.get('src', '')
        if src and src.endswith('placeholder.png'):
            img.decompose()
        else:
            img.attrs = {'src': src}
    return soup

View File

@ -110,14 +110,15 @@ def parse_article_json(root, abort_article):
lines.extend(serialize_image(block))
elif bt == 'text':
lines.extend(serialize_text(block))
return '<html><body id="main-content">' + '\n'.join(lines) + '</body></html>'
return '<html><body id="main-content"><article>' + '\n'.join(lines) + '</article></body></html>'
def parse_raw_html(html, abort_article):
q = '>window.__INITIAL_DATA__="{'
idx = html.find(q)
if idx < 0:
raise ValueError('Failed to find JSON')
print('Failed to find JSON')
return html
data = html[idx + len(q) - 2:]
idx = data.find('}";</script>')
data = data[:idx+2]
@ -133,7 +134,7 @@ if __name__ == '__main__':
class BBC(BasicNewsRecipe):
title = 'BBC News (fast)'
__author__ = 'Kovid Goyal'
__author__ = 'Kovid Goyal, unkn0wn'
description = 'Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.' # noqa: E501
oldest_article = 2
max_articles_per_feed = 100
@ -151,6 +152,28 @@ class BBC(BasicNewsRecipe):
ignore_duplicate_articles = {'title', 'url'}
resolve_internal_links = True
# Restrict the output to the <article> element; parse_article_json wraps the
# serialized content in one.
keep_only_tags = [{'name': 'article'}]

# Elements dropped from the article: interactive/embedded tags by name, and
# blocks selected by their data-component attribute.
remove_tags = [
    {'name': ['button', 'svg', 'iframe']},
    {'attrs': {'data-component': ['ad-slot', 'tags', 'links-block']}},
]

# Presentational attributes stripped from the remaining markup.
remove_attributes = ['style', 'height', 'width']
no_stylesheets = True

# Render figures, bylines, captions and image blocks in a smaller font.
extra_css = '''
figure, [data-component="byline-block"], [data-component="caption-block"], [data-component="image-block"] { font-size:small; }
'''
def preprocess_html(self, soup):
    """Drop placeholder images and reduce every other <img> to its ``src``.

    An <img> whose ``src`` ends with ``placeholder.png`` is removed from the
    tree. Every remaining <img> has its attributes replaced by a single
    ``src`` entry (an empty string when it had no ``src``).

    The ``soup`` object is modified in place and returned.
    """
    # A single pass does both jobs. The images are collected into a list
    # first so that removing one cannot disturb the iteration.
    for img in list(soup.findAll('img')):
        src = img.get('src', '')
        if src and src.endswith('placeholder.png'):
            img.decompose()
        else:
            img.attrs = {'src': src}
    return soup
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',