From 961d0a71b3b826f112924600ba6add631e17b48f Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Tue, 13 May 2025 09:44:41 +0530
Subject: [PATCH] Update BBC
---
recipes/bbc.recipe | 27 +++++++++++++++++++++++++--
recipes/bbc_fast.recipe | 29 ++++++++++++++++++++++++++---
2 files changed, 51 insertions(+), 5 deletions(-)
diff --git a/recipes/bbc.recipe b/recipes/bbc.recipe
index a872d50550..608eb7a809 100644
--- a/recipes/bbc.recipe
+++ b/recipes/bbc.recipe
@@ -110,14 +110,15 @@ def parse_article_json(root, abort_article):
lines.extend(serialize_image(block))
elif bt == 'text':
lines.extend(serialize_text(block))
- return '' + '\n'.join(lines) + ''
+ return '' + '\n'.join(lines) + ''
def parse_raw_html(html, abort_article):
q = '>window.__INITIAL_DATA__="{'
idx = html.find(q)
if idx < 0:
- raise ValueError('Failed to find JSON')
+ print('Failed to find JSON')
+ return html
data = html[idx + len(q) - 2:]
idx = data.find('}";')
data = data[:idx+2]
@@ -301,3 +302,25 @@ class BBCNews(BasicNewsRecipe):
def preprocess_raw_html(self, raw_html, url):
return parse_raw_html(raw_html, self.abort_article)
+
+ keep_only_tags = [  # content whitelist; presumably everything outside these tags is dropped (confirm against calibre BasicNewsRecipe docs)
+ dict(name='article')  # selects <article> elements by tag name
+ ]
+
+ remove_tags = [  # elements to strip; NOTE(review): presumably applied to the kept content, confirm ordering in calibre docs
+ dict(name=['button', 'svg', 'iframe']),  # matched by tag name
+ dict(attrs={'data-component': ['ad-slot', 'tags', 'links-block']})  # matched by value of the data-component attribute
+ ]
+
+ remove_attributes = ['style', 'height', 'width']  # attribute names to strip; presumably from every tag (confirm)
+ no_stylesheets = True  # NOTE(review): presumably stops the site's own stylesheets being used; confirm in BasicNewsRecipe docs
+ extra_css = '''
+ figure, [data-component="byline-block"], [data-component="caption-block"], [data-component="image-block"] { font-size:small; }
+ '''  # one CSS rule: small font for <figure> and for byline/caption/image data-component blocks
+
+ def preprocess_html(self, soup):  # tidy <img> tags in soup (presumably a BeautifulSoup tree, TODO confirm) and return that same object
+ for placeholder in soup.findAll('img', attrs={'src': lambda x: x and x.endswith('placeholder.png')}):  # <img> whose src is non-empty and ends with 'placeholder.png'
+ placeholder.decompose()  # remove the placeholder image from the tree
+ for img in soup.findAll('img'):  # second pass over the <img> tags that remain
+ img.attrs = {'src': img.get('src', '')}  # replace the attribute dict: only src survives, and a missing src becomes ''
+ return soup
diff --git a/recipes/bbc_fast.recipe b/recipes/bbc_fast.recipe
index a7fa84aaa5..c6c25a456f 100644
--- a/recipes/bbc_fast.recipe
+++ b/recipes/bbc_fast.recipe
@@ -110,14 +110,15 @@ def parse_article_json(root, abort_article):
lines.extend(serialize_image(block))
elif bt == 'text':
lines.extend(serialize_text(block))
- return '' + '\n'.join(lines) + ''
+ return '' + '\n'.join(lines) + ''
def parse_raw_html(html, abort_article):
q = '>window.__INITIAL_DATA__="{'
idx = html.find(q)
if idx < 0:
- raise ValueError('Failed to find JSON')
+ print('Failed to find JSON')
+ return html
data = html[idx + len(q) - 2:]
idx = data.find('}";')
data = data[:idx+2]
@@ -133,7 +134,7 @@ if __name__ == '__main__':
class BBC(BasicNewsRecipe):
title = 'BBC News (fast)'
- __author__ = 'Kovid Goyal'
+ __author__ = 'Kovid Goyal, unkn0wn'
description = 'Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.' # noqa: E501
oldest_article = 2
max_articles_per_feed = 100
@@ -151,6 +152,28 @@ class BBC(BasicNewsRecipe):
ignore_duplicate_articles = {'title', 'url'}
resolve_internal_links = True
+ keep_only_tags = [  # content whitelist; presumably everything outside these tags is dropped (confirm against calibre BasicNewsRecipe docs)
+ dict(name='article')  # selects <article> elements by tag name
+ ]
+
+ remove_tags = [  # elements to strip; NOTE(review): presumably applied to the kept content, confirm ordering in calibre docs
+ dict(name=['button', 'svg', 'iframe']),  # matched by tag name
+ dict(attrs={'data-component': ['ad-slot', 'tags', 'links-block']})  # matched by value of the data-component attribute
+ ]
+
+ remove_attributes = ['style', 'height', 'width']  # attribute names to strip; presumably from every tag (confirm)
+ no_stylesheets = True  # NOTE(review): presumably stops the site's own stylesheets being used; confirm in BasicNewsRecipe docs
+ extra_css = '''
+ figure, [data-component="byline-block"], [data-component="caption-block"], [data-component="image-block"] { font-size:small; }
+ '''  # one CSS rule: small font for <figure> and for byline/caption/image data-component blocks
+
+ def preprocess_html(self, soup):  # tidy <img> tags in soup (presumably a BeautifulSoup tree, TODO confirm) and return that same object
+ for placeholder in soup.findAll('img', attrs={'src': lambda x: x and x.endswith('placeholder.png')}):  # <img> whose src is non-empty and ends with 'placeholder.png'
+ placeholder.decompose()  # remove the placeholder image from the tree
+ for img in soup.findAll('img'):  # second pass over the <img> tags that remain
+ img.attrs = {'src': img.get('src', '')}  # replace the attribute dict: only src survives, and a missing src becomes ''
+ return soup
+
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',