Update BBC News

Fixes #1698322 [BBC News Script Needs Update](https://bugs.launchpad.net/calibre/+bug/1698322)
2025-07-09 03:04:10 -04:00 · 2017-06-16 14:34:20 +05:30 · 2017-06-16 14:34:20 +05:30 · 4cebbfd766
commit 4cebbfd766
parent f5446f68a7
1 changed file with 110 additions and 104 deletions
--- a/recipes/bbc.recipe
+++ b/recipes/bbc.recipe
@@ -24,7 +24,13 @@ import re
 from calibre.web.feeds.recipes import BasicNewsRecipe


-class BBCNewsSportBlog(BasicNewsRecipe):
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
+class BBCNews(BasicNewsRecipe):

    #
    #    **** IMPORTANT USERS READ ME ****
@@ -42,7 +48,7 @@ class BBCNewsSportBlog(BasicNewsRecipe):
    #
    # There are 68 feeds below which constitute the bulk of the available rss
    # feeds on the BBC web site. These include 5 blogs by editors and
-    # correspondants, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
+    # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
    # Wales, Scotland Business), and 7 Welsh language feeds.
    #
    # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
@@ -89,13 +95,9 @@ class BBCNewsSportBlog(BasicNewsRecipe):
        ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
        # ("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
        # ("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
-        ("Blog: Nick Robinson (Political Editor)",
-         "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
        # ("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
        # ("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
        # ("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
-        ("Blog: Rory Cellan-Jones (Technology correspondent)",
-         "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
        ("Sport Front Page",
         "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
        # ("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
@@ -233,6 +235,8 @@ class BBCNewsSportBlog(BasicNewsRecipe):

    # Removes empty feeds - why keep them!?
    remove_empty_feeds = True
+    ignore_duplicate_articles = {'title', 'url'}
+    resolve_internal_links = True

    # Create a custom title which fits nicely in the Kindle title list.
    # Requires "import time" above class declaration, and replacing
@@ -241,22 +245,17 @@ class BBCNewsSportBlog(BasicNewsRecipe):
    #
    # custom_title = "BBC News - " + time.strftime('%d %b %Y')

-    '''
-    # Conversion options for advanced users, but don't forget to comment out the
-    # current conversion_options below. Avoid setting 'linearize_tables' as that
-    # plays havoc with the 'old style' table based pages.
-    #
-    conversion_options = { 'title'       : title,
-                           'comments'    : description,
-                           'tags'        : tags,
-                           'language'    : language,
-                           'publisher'   : publisher,
-                           'authors'     : publisher,
+    # Conversion options for advanced users. Avoid setting 'linearize_tables'
+    # as that plays havoc with the 'old style' table based pages.
+    conversion_options = {
+        # 'title'       : title,
+        # 'comments'    : description,
+        # 'tags'        : tags,
+        # 'language'    : language,
+        # 'publisher'   : publisher,
+        # 'authors'     : publisher,
        'smarten_punctuation' : True
    }
-    '''
-
-    conversion_options = {'smarten_punctuation': True}

    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
@@ -513,7 +512,9 @@ class BBCNewsSportBlog(BasicNewsRecipe):
    # Remove 'storyextra' - links to relevant articles and external sites.
    storyextra_reg_exp = '^.*story[_ -]*extra.*$'

-    remove_tags = [dict(name='div',  attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}),
+    remove_tags = [
+        classes('sharetools share-tools--no-event-tag'),
+        dict(name='div',  attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}),
        dict(name='div',  attrs={'class': re.compile(
            share_help_reg_exp, re.IGNORECASE)}),
        dict(name='div',  attrs={'class': re.compile(
@@ -625,6 +626,11 @@ class BBCNewsSportBlog(BasicNewsRecipe):

        return print_url

+    def canonicalize_internal_url(self, url, is_link=True):
+        if url.endswith('?print=true'):
+            url = url.rpartition('?')[0]
+        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)
+
    # Remove articles in feeds based on a string in the article title or url.
    #
    # Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"