Merge branch 'bbc-sport-headline-block-fix' of https://github.com/claybdavis/calibre

2026-05-31 02:55:19 -04:00 · 2026-05-19 07:26:53 +05:30
parent 19d2488ca5 507feb15fa
commit d8490c2208
2 changed files with 70 additions and 0 deletions
@@ -85,6 +85,20 @@ def serialize_contributor(contributor):
        yield '<div>' + prepare_string_for_xml(contributor['subtitle']) + '</div>'


+def extract_text_block_plaintext(text_block):
+    # Return the plain text of a nested {type: text, model: {blocks: [...]}}
+    # block: the 'text' of every 'fragment' inside every 'paragraph', joined
+    # with no separator ('' when there is none). Recovers the headline from the
+    # BBC Sport 'headline' / 'topper' blocks that replace article['headline'].
+    chunks = []
+    for x in text_block.get('model', {}).get('blocks', []):
+        if x.get('type') == 'paragraph':
+            for f in x.get('model', {}).get('blocks', []):
+                if f.get('type') == 'fragment':
+                    chunks.append(f.get('model', {}).get('text', ''))
+    return ''.join(chunks)
+
+
 def parse_article_json(root, abort_article):
    data = root['data']
    has_media_experience = False
@@ -102,14 +116,35 @@ def parse_article_json(root, abort_article):
    lines = []
    if article.get('headline'):
        lines.append('<h1>{}</h1>'.format(prepare_string_for_xml(article['headline'])))
+    elif article.get('metadata', {}).get('seoHeadline'):
+        # BBC Sport articles no longer populate the top-level 'headline' field;
+        # the visible headline is on metadata.seoHeadline (and is plain text).
+        lines.append('<h1>{}</h1>'.format(prepare_string_for_xml(article['metadata']['seoHeadline'])))
    if article.get('contributor'):
        lines.extend(serialize_contributor(article['contributor']))
+    headline_emitted = bool(lines and lines[0].startswith('<h1>'))
    for block in article['content']['model']['blocks']:
        bt = block.get('type')
        if bt == 'image':
            lines.extend(serialize_image(block))
        elif bt == 'text':
            lines.extend(serialize_text(block))
+        elif bt == 'headline' and not headline_emitted:
+            # New BBC Sport block-type containing the headline as a nested text block.
+            inner = block.get('model', {}).get('blocks') or [{}]
+            text = extract_text_block_plaintext(inner[0])
+            if text:
+                lines.insert(0, '<h1>{}</h1>'.format(prepare_string_for_xml(text)))
+                headline_emitted = True
+        elif bt == 'topper' and not headline_emitted:
+            # 'high-impact' BBC Sport article variant: headline lives under
+            # block.model.heading.blocks[0] rather than block.model.blocks[0].
+            heading = block.get('model', {}).get('heading') or {}
+            inner = heading.get('blocks') or [{}]
+            text = extract_text_block_plaintext(inner[0])
+            if text:
+                lines.insert(0, '<h1>{}</h1>'.format(prepare_string_for_xml(text)))
+                headline_emitted = True
    return '<html><body id="main-content"><article>' + '\n'.join(lines) + '</article></body></html>'


@@ -85,6 +85,20 @@ def serialize_contributor(contributor):
        yield '<div>' + prepare_string_for_xml(contributor['subtitle']) + '</div>'


+def extract_text_block_plaintext(text_block):
+    # Return the plain text of a nested {type: text, model: {blocks: [...]}}
+    # block: the 'text' of every 'fragment' inside every 'paragraph', joined
+    # with no separator ('' when there is none). Recovers the headline from the
+    # BBC Sport 'headline' / 'topper' blocks that replace article['headline'].
+    chunks = []
+    for x in text_block.get('model', {}).get('blocks', []):
+        if x.get('type') == 'paragraph':
+            for f in x.get('model', {}).get('blocks', []):
+                if f.get('type') == 'fragment':
+                    chunks.append(f.get('model', {}).get('text', ''))
+    return ''.join(chunks)
+
+
 def parse_article_json(root, abort_article):
    data = root['data']
    has_media_experience = False
@@ -102,14 +116,35 @@ def parse_article_json(root, abort_article):
    lines = []
    if article.get('headline'):
        lines.append('<h1>{}</h1>'.format(prepare_string_for_xml(article['headline'])))
+    elif article.get('metadata', {}).get('seoHeadline'):
+        # BBC Sport articles no longer populate the top-level 'headline' field;
+        # the visible headline is on metadata.seoHeadline (and is plain text).
+        lines.append('<h1>{}</h1>'.format(prepare_string_for_xml(article['metadata']['seoHeadline'])))
    if article.get('contributor'):
        lines.extend(serialize_contributor(article['contributor']))
+    headline_emitted = bool(lines and lines[0].startswith('<h1>'))
    for block in article['content']['model']['blocks']:
        bt = block.get('type')
        if bt == 'image':
            lines.extend(serialize_image(block))
        elif bt == 'text':
            lines.extend(serialize_text(block))
+        elif bt == 'headline' and not headline_emitted:
+            # New BBC Sport block-type containing the headline as a nested text block.
+            inner = block.get('model', {}).get('blocks') or [{}]
+            text = extract_text_block_plaintext(inner[0])
+            if text:
+                lines.insert(0, '<h1>{}</h1>'.format(prepare_string_for_xml(text)))
+                headline_emitted = True
+        elif bt == 'topper' and not headline_emitted:
+            # 'high-impact' BBC Sport article variant: headline lives under
+            # block.model.heading.blocks[0] rather than block.model.blocks[0].
+            heading = block.get('model', {}).get('heading') or {}
+            inner = heading.get('blocks') or [{}]
+            text = extract_text_block_plaintext(inner[0])
+            if text:
+                lines.insert(0, '<h1>{}</h1>'.format(prepare_string_for_xml(text)))
+                headline_emitted = True
    return '<html><body id="main-content"><article>' + '\n'.join(lines) + '</article></body></html>'