From 2e6f0e7a59216cbca7d6b75ce8f51d9ec499d62e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 26 Nov 2020 12:30:46 +0530 Subject: [PATCH] Update The BBC --- manual/news.rst | 4 +- recipes/bbc.recipe | 665 +++++++--------------------------------- recipes/bbc_fast.recipe | 191 ++++++++---- 3 files changed, 249 insertions(+), 611 deletions(-) diff --git a/manual/news.rst b/manual/news.rst index 67698d62f4..21e2eda5cd 100644 --- a/manual/news.rst +++ b/manual/news.rst @@ -142,9 +142,7 @@ to the recipe. Finally, lets replace some of the :term:`CSS` that we disabled ea extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' -With these additions, our recipe has become "production quality", indeed it is very close to the actual recipe used by calibre for the *BBC*, shown below: - -.. literalinclude:: ../../../recipes/bbc.recipe +With these additions, our recipe has become "production quality". This :term:`recipe` explores only the tip of the iceberg when it comes to the power of calibre. To explore more of the abilities of calibre we'll examine a more complex real life example in the next section. diff --git a/recipes/bbc.recipe b/recipes/bbc.recipe index a643ad5fe1..c80677f55f 100644 --- a/recipes/bbc.recipe +++ b/recipes/bbc.recipe @@ -1,72 +1,115 @@ -## -# Title: BBC News, Sport, and Blog Calibre Recipe -# Contact: mattst - jmstanfield@gmail.com -## -# License: GNU General Public License v3 - https://www.gnu.org/copyleft/gpl.html -# Copyright: mattst - jmstanfield@gmail.com -## -# Written: November 2011 -# Last Edited: 2011-11-19 -## +#!/usr/bin/env python +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2020, Kovid Goyal -__license__ = 'GNU General Public License v3 - https://www.gnu.org/copyleft/gpl.html' -__copyright__ = 'mattst - jmstanfield@gmail.com' +import json - -''' -BBC News, Sport, and Blog Calibre Recipe -''' - -# Import the regular expressions module. -import re - -# Import the BasicNewsRecipe class which this class extends. +from calibre import prepare_string_for_xml from calibre.web.feeds.recipes import BasicNewsRecipe -def classes(classes): - q = frozenset(classes.split(' ')) - return dict(attrs={ - 'class': lambda x: x and frozenset(x.split()).intersection(q)}) +# Article JSON parser {{{ +def serialize_image(block): + yield '
' + block = block['model'] + media = block['media'] + alt = prepare_string_for_xml(media.get('alt') or '', True) + src = prepare_string_for_xml(media['src']) + yield '{}'.format(src, alt) + caption = block.get('caption') + if caption: + yield '
{}
'.format(prepare_string_for_xml(caption)) + yield '
' + + +def block_tag(name, generator): + yield '<' + name + '>' + yield from generator + yield '' + + +def serialize_paragraph(block): + block = block['model'] + for x in block['blocks']: + xt = x['type'] + if xt == 'fragment': + styles = [] + model = x['model'] + for attr in model['attributes']: + if attr == 'bold': + styles.append('font-weight: bold') + elif attr in ('italic', 'italics'): + styles.append('font-style: italic') + if styles: + prefix = ''.format('; '.join(styles)) + suffix = '' + else: + prefix = suffix = '' + yield prefix + prepare_string_for_xml(model['text']) + suffix + elif xt == 'urlLink': + model = x['model'] + yield '{}'.format(prepare_string_for_xml(model['locator'], True), prepare_string_for_xml(model['text'])) + + +def serialize_list(block): + for x in block['model']['blocks']: + if x['type'] == 'listItem': + yield from block_tag('li', serialize_paragraph(x)) + + +def serialize_text(block): + block = block['model'] + for x in block['blocks']: + xt = x['type'] + if xt == 'paragraph': + yield from block_tag('p', serialize_paragraph(x)) + elif xt == 'unorderedList': + yield from block_tag('ul', serialize_list(x)) + elif xt == 'orderedList': + yield from block_tag('ol', serialize_list(x)) + else: + raise KeyError('Unknown block type: ' + x['type']) + + +def serialize_contributor(contributor): + if 'title' in contributor: + yield '

' + prepare_string_for_xml(contributor['title']) + '

' + if 'subtitle' in contributor: + yield '
' + prepare_string_for_xml(contributor['subtitle']) + '
' + + +def parse_article_json(root, abort_article): + data = root['data'] + has_media_experience = False + for key in data: + if key.startswith('article?'): + article = data[key]['data'] + break + elif key.startswith('media-experience?'): + has_media_experience = True + else: + if has_media_experience: + abort_article('Skipping video article') + return + raise KeyError('No article found in data keys: {}'.format(data.keys())) + lines = [] + if article.get('headline'): + lines.append('

{}

'.format(prepare_string_for_xml(article['headline']))) + if article.get('contributor'): + lines.extend(serialize_contributor(article['contributor'])) + for block in article['blocks']: + bt = block.get('type') + if bt == 'image': + lines.extend(serialize_image(block)) + elif bt == 'text': + lines.extend(serialize_text(block)) + return '' + '\n'.join(lines) + '' +# }}} class BBCNews(BasicNewsRecipe): - # - # **** IMPORTANT USERS READ ME **** - # - # First select the feeds you want then scroll down below the feeds list - # and select the values you want for the other user preferences, like - # oldest_article and such like. - # - # - # Select the BBC rss feeds which you want in your ebook. - # Selected feed have NO '#' at their start, de-selected feeds begin with a '#'. - # - # Eg. ("News Home", "https://feeds.bbci.co.uk/... - include feed. - # Eg. #("News Home", "https://feeds.bbci.co.uk/... - do not include feed. - # - # There are 68 feeds below which constitute the bulk of the available rss - # feeds on the BBC web site. These include 5 blogs by editors and - # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West - # Wales, Scotland Business), and 7 Welsh language feeds. - # - # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click) - # so if "oldest_article = 1.5" (only articles published in the last 36 hours) - # you may get some 'empty feeds' which will not then be included in the ebook. - # - # The 15 feeds currently selected below are simply my default ones. - # - # Note: With all 68 feeds selected, oldest_article set to 2, - # max_articles_per_feed set to 100, and simultaneous_downloads set to 10, - # the ebook creation took 29 minutes on my speedy 100 mbps net connection, - # fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx). - # More realistically with 15 feeds selected, oldest_article set to 1.5, - # max_articles_per_feed set to 100, and simultaneous_downloads set to 20, - # it took 6 minutes. If that's too slow increase 'simultaneous_downloads'. - # # Select / de-select the feeds you want in your ebook. - # feeds = [ ("News Home", "https://feeds.bbci.co.uk/news/rss.xml"), ("UK", "https://feeds.bbci.co.uk/news/uk/rss.xml"), @@ -175,11 +218,6 @@ class BBCNews(BasicNewsRecipe): # simultaneous_downloads = 20 - # Timeout for fetching files from the server in seconds. The default of - # 120 seconds, seems somewhat excessive. - # - timeout = 30 - # The format string for the date shown on the ebook's first page. # List of all values: https://docs.python.org/library/time.html # Default in news.py has a leading space so that's mirrored here. @@ -208,29 +246,14 @@ class BBCNews(BasicNewsRecipe): # # Author of this recipe. - __author__ = 'mattst' + __author__ = 'Kovid Goyal' # Specify English as the language of the RSS feeds (ISO-639 code). language = 'en_GB' - # Set tags. - tags = 'news, sport, blog' - # Set publisher and publication type. - publisher = 'BBC' publication_type = 'newspaper' - - # Disable stylesheets from site. - no_stylesheets = True - - # Specifies an override encoding for sites that have an incorrect charset - # specified. Default of 'None' says to auto-detect. Some other BBC recipes - # use 'utf8', which works fine (so use that if necessary) but auto-detecting - # with None is working fine, so stick with that for robustness. - encoding = None - - # Sets whether a feed has full articles embedded in it. The BBC feeds do - # not. + encoding = 'utf-8' use_embedded_content = False # Removes empty feeds - why keep them!? @@ -238,473 +261,13 @@ class BBCNews(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} resolve_internal_links = True - # Create a custom title which fits nicely in the Kindle title list. - # Requires "import time" above class declaration, and replacing - # title with custom_title in conversion_options (right column only). - # Example of string below: "BBC News - 14 Nov 2011" - # - # custom_title = "BBC News - " + time.strftime('%d %b %Y') - - # Conversion options for advanced users. Avoid setting 'linearize_tables' - # as that plays havoc with the 'old style' table based pages. - conversion_options = { - # 'title' : title, - # 'comments' : description, - # 'tags' : tags, - # 'language' : language, - # 'publisher' : publisher, - # 'authors' : publisher, - 'smarten_punctuation' : True - } - - # Specify extra CSS - overrides ALL other CSS (IE. Added last). - extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ - .introduction, .first { font-weight: bold; } \ - .cross-head { font-weight: bold; font-size: 125%; } \ - .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ - .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \ - .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ - .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \ - text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \ - .story-date, .published { font-size: 80%; } \ - table { width: 100%; } \ - td img { display: block; margin: 5px auto; } \ - ul { padding-top: 10px; } \ - ol { padding-top: 10px; } \ - li { padding-top: 5px; padding-bottom: 5px; } \ - h1 { text-align: center; font-size: 175%; font-weight: bold; } \ - h2 { text-align: center; font-size: 150%; font-weight: bold; } \ - h3 { text-align: center; font-size: 125%; font-weight: bold; } \ - h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }' - - # Remove various tag attributes to improve the look of the ebook pages. - remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', - 'valign', 'vspace', 'hspace', 'alt', 'width', 'height'] - - # Remove the (admittedly rarely used) line breaks, "
", which sometimes - # cause a section of the ebook to start in an unsightly fashion or, more - # frequently, a "
" will muck up the formatting of a correspondant's byline. - # "
" and "
" are far more frequently used on the table formatted - # style of pages, and really spoil the look of the ebook pages. - preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), - (re.compile(r'', re.IGNORECASE), lambda m: '')] - - # Create regular expressions for tag keeping and removal to make the matches more - # robust against minor changes and errors in the HTML, Eg. double spaces, leading - # and trailing spaces, missing hyphens, and such like. - # Python regular expression ('re' class) page: - # https://docs.python.org/library/re.html - - # *************************************** - # Regular expressions for keep_only_tags: - # *************************************** - - # The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML - # page which contains the main text of the article. Match storybody variants: 'storybody', - # 'story-body', 'story body','storybody ', etc. - storybody_reg_exp = '^.*story[_ -]*body.*$' - - # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title - # and published date. This is one level above the usual news pages which have the title - # and date within 'story-body'. This is annoying since 'blq_content' must also be kept, - # resulting in a lot of extra things to be removed by remove_tags. - blq_content_reg_exp = '^.*blq[_ -]*content.*$' - - # The BBC has an alternative page design structure, which I suspect is an out-of-date - # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack' - # (travel), and in some sport pages. These alternative pages are table based (which is - # why I think they are an out-of-date design) and account for -I'm guesstimaking- less - # than 1% of all articles. They use a table class 'storycontent' to hold the article - # and like blq_content (above) have required lots of extra removal by - # remove_tags. - story_content_reg_exp = '^.*story[_ -]*content.*$' - - # Keep the sections of the HTML which match the list below. The HTML page created by - # Calibre will fill with those sections which are matched. Note that the - # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to - # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body' - # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at - # all). If they are the other way around in keep_only_tags then blq_content_reg_exp - # will end up being discarded. - keep_only_tags = [dict(name='table', attrs={'class': re.compile(story_content_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'class': re.compile( - blq_content_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'id': re.compile( - blq_content_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'class': re.compile( - storybody_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'id': re.compile(storybody_reg_exp, re.IGNORECASE)})] - - # ************************************ - # Regular expressions for remove_tags: - # ************************************ - - # Regular expression to remove share-help and variant tags. The share-help class - # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious, - # twitter, email. Removed to avoid page clutter. - share_help_reg_exp = '^.*share[_ -]*help.*$' - - # Regular expression to remove embedded-hyper and variant tags. This class is used to - # display links to other BBC News articles on the same/similar subject. - embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$' - - # Regular expression to remove hypertabs and variant tags. This class is used to - # display a tab bar at the top of an article which allows the user to switch to - # an article (viewed on the same page) providing further info., 'in depth' analysis, - # an editorial, a correspondant's blog entry, and such like. The ability to handle - # a tab bar of this nature is currently beyond the scope of this recipe and - # possibly of Calibre itself (not sure about that - TO DO - check!). - hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$' - - # Regular expression to remove story-feature and variant tags. Eg. 'story-feature', - # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'. - # This class is used to add additional info. boxes, or small lists, outside of - # the main story. TO DO: Work out a way to incorporate these neatly. - story_feature_reg_exp = '^.*story[_ -]*feature.*$' - - # Regular expression to remove video and variant tags, Eg. 'videoInStoryB', - # 'videoInStoryC'. This class is used to embed video. - video_reg_exp = '^.*video.*$' - - # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'. - # This class is used to embed audio. - audio_reg_exp = '^.*audio.*$' - - # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'. - # This class is used to embed a photo slideshow. See also 'slideshow' - # below. - picture_gallery_reg_exp = '^.*picture.*$' - - # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'. - # This class is used to embed a slideshow (not necessarily photo) but both - # 'slideshow' and 'pictureGallery' are used for slideshows. - slideshow_reg_exp = '^.*slide[_ -]*show.*$' - - # Regular expression to remove social-links and variant tags. This class is used to - # display links to a BBC bloggers main page, used in various columnist's blogs - # (Eg. Nick Robinson, Robert Preston). - social_links_reg_exp = '^.*social[_ -]*links.*$' - - # Regular expression to remove quote and (multi) variant tags, Eg. 'quote', - # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually - # removed by 'story-feature' removal (as they are usually within them), but - # not always. The quotation removed is always (AFAICT) in the article text - # as well but a 2nd copy is placed in a quote tag to draw attention to it. - # The quote class tags may or may not appear in div's. - quote_reg_exp = '^.*quote.*$' - - # Regular expression to remove hidden and variant tags, Eg. 'hidden'. - # The purpose of these is unclear, they seem to be an internal link to a - # section within the article, but the text of the link (Eg. 'Continue reading - # the main story') never seems to be displayed anyway. Removed to avoid clutter. - # The hidden class tags may or may not appear in div's. - hidden_reg_exp = '^.*hidden.*$' - - # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'. - # Used on the site to display text about registered users entering - # comments. - comment_reg_exp = '^.*comment.*$' - - # Regular expression to remove form and variant tags, Eg. 'comment-form'. - # Used on the site to allow registered BBC users to fill in forms, typically - # for entering comments about an article. - form_reg_exp = '^.*form.*$' - - # Extra things to remove due to the addition of 'blq_content' in - # keep_only_tags. - - #
Used on sports pages for 'email' and 'print'. - story_actions_reg_exp = '^.*story[_ -]*actions.*$' - - #
Used on sports pages instead of 'share-help' (for - # social networking links). - bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$' - - #
- # NOTE: Don't remove class="content-group" that is needed. - # Used on sports pages to link to 'similar stories'. - secondary_content_reg_exp = '^.*secondary[_ -]*content.*$' - - #