Update The BBC

For some reason the site now serializes the article JSON as a dict serialized to a string.
Kovid Goyal 2021-12-19 08:30:13 +05:30
parent 559bba5fa9
commit 172ee5d531
2 changed files with 50 additions and 27 deletions
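
The practical effect, sketched below with a made-up payload rather than a captured BBC page, is that the blob assigned to window.__INITIAL_DATA__ is now a JSON string that itself contains JSON, so it has to be passed through json.loads() twice:

import json

# Hypothetical double-encoded blob, i.e. what now sits between
# window.__INITIAL_DATA__= and the trailing semicolon.
embedded = '"{\\"data\\": {\\"headline\\": \\"example\\"}}"'
inner_text = json.loads(embedded)  # first pass: JSON string literal -> inner JSON text
root = json.loads(inner_text)      # second pass: inner JSON text -> dict
assert root['data']['headline'] == 'example'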

View File

@@ -111,6 +111,23 @@ def parse_article_json(root, abort_article):
         elif bt == 'text':
             lines.extend(serialize_text(block))
     return '<html><body id="main-content">' + '\n'.join(lines) + '</body></html>'
+
+
+def parse_raw_html(html, abort_article):
+    q = '>window.__INITIAL_DATA__="{'
+    idx = html.find(q)
+    if idx < 0:
+        raise ValueError('Failed to find JSON')
+    data = html[idx + len(q) - 2:]
+    idx = data.find('}";</script>')
+    data = data[:idx+2]
+    data = json.loads(data)
+    root = json.loads(data)
+    return parse_article_json(root, abort_article)
+
+
+if __name__ == '__main__':
+    print(parse_raw_html(open('/t/raw.html').read(), print))
 # }}}
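
The -2/+2 offsets in parse_raw_html() keep the quote characters on either side of the braces, so the slice is itself a valid JSON string literal that can be decoded twice. A rough illustration against made-up markup, not a real BBC page:

import json

html = '<script>window.__INITIAL_DATA__="{\\"data\\": {}}";</script>'
q = '>window.__INITIAL_DATA__="{'
idx = html.find(q)
data = html[idx + len(q) - 2:]                # slice starts at the opening '"{'
data = data[:data.find('}";</script>') + 2]   # and ends just after the closing '}"'
print(json.loads(json.loads(data)))           # -> {'data': {}}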
@@ -269,12 +286,4 @@ class BBCNews(BasicNewsRecipe):
     resolve_internal_links = True

     def preprocess_raw_html(self, raw_html, url):
-        q = '>window.__INITIAL_DATA__={'
-        idx = raw_html.find(q)
-        if idx < 0:
-            raise ValueError('Failed to find JSON')
-        data = raw_html[idx + len(q) - 1:]
-        idx = data.find('};</script>')
-        data = data[:idx+1]
-        root = json.loads(data)
-        return parse_article_json(root, self.abort_article)
+        return parse_raw_html(raw_html, self.abort_article)

View File

@@ -12,18 +12,20 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 def serialize_image(block):
     yield '<div>'
     block = block['model']
-    media = block['media']
-    alt = prepare_string_for_xml(media.get('alt') or '', True)
+    img = block['image']
+    alt = prepare_string_for_xml(img.get('alt') or '', True)
     for q in ('originalSrc', 'src'):
-        if q in media:
-            src = prepare_string_for_xml(media[q])
+        if q in img:
+            src = prepare_string_for_xml(img[q])
             break
     else:
-        raise ValueError('No src found in media block: {}'.format(media))
+        raise ValueError('No src found in img block: {}'.format(img))
     yield '<img src="{}" alt="{}"/>'.format(src, alt)
     caption = block.get('caption')
-    if caption:
-        yield '<div>{}</div>'.format(prepare_string_for_xml(caption))
+    if caption and caption.get('type') == 'text':
+        yield '<div>'
+        yield from serialize_paragraph(caption)
+        yield '</div>'
     yield '</div>'
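
The switch from media to image and the structured caption suggest an image block now looks roughly like the sketch below; the key names come from the code above, the values are placeholders, and serialize_paragraph() is the recipe's existing helper for rendering a text model:

# Assumed shape of an image block after the change (illustrative only)
image_block = {
    'type': 'image',
    'model': {
        'image': {
            'originalSrc': 'https://example.invalid/photo.jpg',  # 'src' is the fallback key
            'alt': 'placeholder alt text',
        },
        'caption': {
            'type': 'text',
            # ...plus whatever fields serialize_paragraph() reads from a text model
        },
    },
}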
@@ -102,13 +104,30 @@ def parse_article_json(root, abort_article):
         lines.append('<h1>{}</h1>'.format(prepare_string_for_xml(article['headline'])))
     if article.get('contributor'):
         lines.extend(serialize_contributor(article['contributor']))
-    for block in article['blocks']:
+    for block in article['content']['model']['blocks']:
         bt = block.get('type')
         if bt == 'image':
             lines.extend(serialize_image(block))
         elif bt == 'text':
             lines.extend(serialize_text(block))
     return '<html><body id="main-content">' + '\n'.join(lines) + '</body></html>'
+
+
+def parse_raw_html(html, abort_article):
+    q = '>window.__INITIAL_DATA__="{'
+    idx = html.find(q)
+    if idx < 0:
+        raise ValueError('Failed to find JSON')
+    data = html[idx + len(q) - 2:]
+    idx = data.find('}";</script>')
+    data = data[:idx+2]
+    data = json.loads(data)
+    root = json.loads(data)
+    return parse_article_json(root, abort_article)
+
+
+if __name__ == '__main__':
+    print(parse_raw_html(open('/t/raw.html').read(), print))
 # }}}
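
Aside from the repeated parse_raw_html() addition, the only change here is that the block list has moved deeper into the article object. A hedged sketch of the nesting parse_article_json() now walks, with the key names taken from the code and everything else invented:

article = {
    'headline': 'Example headline',
    'content': {
        'model': {
            'blocks': [
                {'type': 'text', 'model': {}},   # rendered by serialize_text()
                {'type': 'image', 'model': {}},  # rendered by serialize_image()
            ],
        },
    },
}
for block in article['content']['model']['blocks']:
    print(block.get('type'))  # -> text, image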
@@ -124,10 +143,13 @@ class BBC(BasicNewsRecipe):
     publisher = 'BBC'
     category = 'news, UK, world'
     language = 'en_GB'
     masthead_url = 'https://news.bbcimg.co.uk/img/1_0_1/cream/hi/news/news-blocks.gif'
     conversion_options = {
         'comments': description, 'tags': category, 'language': language, 'publisher': publisher,
     }
     # Removes empty feeds - why keep them!?
     remove_empty_feeds = True
+    ignore_duplicate_articles = {'title', 'url'}
+    resolve_internal_links = True
     feeds = [
         ('Top Stories', 'https://feeds.bbci.co.uk/news/rss.xml'),
@@ -150,12 +172,4 @@ class BBC(BasicNewsRecipe):
     ]

     def preprocess_raw_html(self, raw_html, url):
-        q = '>window.__INITIAL_DATA__={'
-        idx = raw_html.find(q)
-        if idx < 0:
-            raise ValueError('Failed to find JSON')
-        data = raw_html[idx + len(q) - 1:]
-        idx = data.find('};</script>')
-        data = data[:idx+1]
-        root = json.loads(data)
-        return parse_article_json(root, self.abort_article)
+        return parse_raw_html(raw_html, self.abort_article)
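
With both recipes delegating to parse_raw_html(), the flow is the same in each: calibre passes every downloaded article page to preprocess_raw_html(), which double-decodes the __INITIAL_DATA__ payload and returns the HTML rebuilt by parse_article_json(), while the abort_article callback lets the parser skip pages without usable article data. A quick way to exercise either recipe after a change like this is to run the recipe file through calibre's ebook-convert with the --test flag.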