diff --git a/recipes/bbc.recipe b/recipes/bbc.recipe index 68fb74e1a2..abce22c448 100644 --- a/recipes/bbc.recipe +++ b/recipes/bbc.recipe @@ -111,6 +111,23 @@ def parse_article_json(root, abort_article): elif bt == 'text': lines.extend(serialize_text(block)) return '
' + '\n'.join(lines) + '' + + +def parse_raw_html(html, abort_article): + q = '>window.__INITIAL_DATA__="{' + idx = html.find(q) + if idx < 0: + raise ValueError('Failed to find JSON') + data = html[idx + len(q) - 2:] + idx = data.find('}";') + data = data[:idx+2] + data = json.loads(data) + root = json.loads(data) + return parse_article_json(root, abort_article) + + +if __name__ == '__main__': + print(parse_raw_html(open('/t/raw.html').read(), print)) # }}} @@ -269,12 +286,4 @@ class BBCNews(BasicNewsRecipe): resolve_internal_links = True def preprocess_raw_html(self, raw_html, url): - q = '>window.__INITIAL_DATA__={' - idx = raw_html.find(q) - if idx < 0: - raise ValueError('Failed to find JSON') - data = raw_html[idx + len(q) - 1:] - idx = data.find('};') - data = data[:idx+1] - root = json.loads(data) - return parse_article_json(root, self.abort_article) + return parse_raw_html(raw_html, self.abort_article) diff --git a/recipes/bbc_fast.recipe b/recipes/bbc_fast.recipe index d6b467f377..312aa2dd7d 100644 --- a/recipes/bbc_fast.recipe +++ b/recipes/bbc_fast.recipe @@ -12,18 +12,20 @@ from calibre.web.feeds.recipes import BasicNewsRecipe def serialize_image(block): yield '