Update BBC News

Fixes #1698322 [BBC News Script Needs Update](https://bugs.launchpad.net/calibre/+bug/1698322)
This commit is contained in:
Kovid Goyal 2017-06-16 14:34:20 +05:30
parent f5446f68a7
commit 4cebbfd766
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -24,7 +24,13 @@ import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
class BBCNewsSportBlog(BasicNewsRecipe): def classes(classes):
q = frozenset(classes.split(' '))
return dict(attrs={
'class': lambda x: x and frozenset(x.split()).intersection(q)})
class BBCNews(BasicNewsRecipe):
# #
# **** IMPORTANT USERS READ ME **** # **** IMPORTANT USERS READ ME ****
@ -42,7 +48,7 @@ class BBCNewsSportBlog(BasicNewsRecipe):
# #
# There are 68 feeds below which constitute the bulk of the available rss # There are 68 feeds below which constitute the bulk of the available rss
# feeds on the BBC web site. These include 5 blogs by editors and # feeds on the BBC web site. These include 5 blogs by editors and
# correspondants, 16 sports feeds, 15 'sub' regional feeds (Eg. North West # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
# Wales, Scotland Business), and 7 Welsh language feeds. # Wales, Scotland Business), and 7 Welsh language feeds.
# #
# Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click) # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
@ -89,13 +95,9 @@ class BBCNewsSportBlog(BasicNewsRecipe):
("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"), ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
# ("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"), # ("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
# ("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"), # ("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
("Blog: Nick Robinson (Political Editor)",
"http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
# ("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"), # ("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
# ("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"), # ("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
# ("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"), # ("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
("Blog: Rory Cellan-Jones (Technology correspondent)",
"http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
("Sport Front Page", ("Sport Front Page",
"http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"), "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
# ("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"), # ("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
@ -233,6 +235,8 @@ class BBCNewsSportBlog(BasicNewsRecipe):
# Removes empty feeds - why keep them!? # Removes empty feeds - why keep them!?
remove_empty_feeds = True remove_empty_feeds = True
ignore_duplicate_articles = {'title', 'url'}
resolve_internal_links = True
# Create a custom title which fits nicely in the Kindle title list. # Create a custom title which fits nicely in the Kindle title list.
# Requires "import time" above class declaration, and replacing # Requires "import time" above class declaration, and replacing
@ -241,22 +245,17 @@ class BBCNewsSportBlog(BasicNewsRecipe):
# #
# custom_title = "BBC News - " + time.strftime('%d %b %Y') # custom_title = "BBC News - " + time.strftime('%d %b %Y')
''' # Conversion options for advanced users. Avoid setting 'linearize_tables'
# Conversion options for advanced users, but don't forget to comment out the # as that plays havoc with the 'old style' table based pages.
# current conversion_options below. Avoid setting 'linearize_tables' as that conversion_options = {
# plays havoc with the 'old style' table based pages. # 'title' : title,
# # 'comments' : description,
conversion_options = { 'title' : title, # 'tags' : tags,
'comments' : description, # 'language' : language,
'tags' : tags, # 'publisher' : publisher,
'language' : language, # 'authors' : publisher,
'publisher' : publisher, 'smarten_punctuation' : True
'authors' : publisher, }
'smarten_punctuation' : True
}
'''
conversion_options = {'smarten_punctuation': True}
# Specify extra CSS - overrides ALL other CSS (IE. Added last). # Specify extra CSS - overrides ALL other CSS (IE. Added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
@ -513,88 +512,90 @@ class BBCNewsSportBlog(BasicNewsRecipe):
# Remove 'storyextra' - links to relevant articles and external sites. # Remove 'storyextra' - links to relevant articles and external sites.
storyextra_reg_exp = '^.*story[_ -]*extra.*$' storyextra_reg_exp = '^.*story[_ -]*extra.*$'
remove_tags = [dict(name='div', attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}), remove_tags = [
dict(name='div', attrs={'class': re.compile( classes('sharetools share-tools--no-event-tag'),
share_help_reg_exp, re.IGNORECASE)}), dict(name='div', attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
embedded_hyper_reg_exp, re.IGNORECASE)}), share_help_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
hypertabs_reg_exp, re.IGNORECASE)}), embedded_hyper_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
video_reg_exp, re.IGNORECASE)}), hypertabs_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
audio_reg_exp, re.IGNORECASE)}), video_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
picture_gallery_reg_exp, re.IGNORECASE)}), audio_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
slideshow_reg_exp, re.IGNORECASE)}), picture_gallery_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
quote_reg_exp, re.IGNORECASE)}), slideshow_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
hidden_reg_exp, re.IGNORECASE)}), quote_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
comment_reg_exp, re.IGNORECASE)}), hidden_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
story_actions_reg_exp, re.IGNORECASE)}), comment_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
bookmark_list_reg_exp, re.IGNORECASE)}), story_actions_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id': re.compile( dict(name='div', attrs={'class': re.compile(
secondary_content_reg_exp, re.IGNORECASE)}), bookmark_list_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id': re.compile( dict(name='div', attrs={'id': re.compile(
featured_content_reg_exp, re.IGNORECASE)}), secondary_content_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id': re.compile( dict(name='div', attrs={'id': re.compile(
navigation_reg_exp, re.IGNORECASE)}), featured_content_reg_exp, re.IGNORECASE)}),
dict(name='form', attrs={'id': re.compile( dict(name='div', attrs={'id': re.compile(
form_reg_exp, re.IGNORECASE)}), navigation_reg_exp, re.IGNORECASE)}),
dict(attrs={'class': re.compile( dict(name='form', attrs={'id': re.compile(
quote_reg_exp, re.IGNORECASE)}), form_reg_exp, re.IGNORECASE)}),
dict(attrs={'class': re.compile( dict(attrs={'class': re.compile(
hidden_reg_exp, re.IGNORECASE)}), quote_reg_exp, re.IGNORECASE)}),
dict(attrs={'class': re.compile( dict(attrs={'class': re.compile(
social_links_reg_exp, re.IGNORECASE)}), hidden_reg_exp, re.IGNORECASE)}),
dict(attrs={'class': re.compile( dict(attrs={'class': re.compile(
comment_reg_exp, re.IGNORECASE)}), social_links_reg_exp, re.IGNORECASE)}),
dict(attrs={'class': re.compile( dict(attrs={'class': re.compile(
skip_reg_exp, re.IGNORECASE)}), comment_reg_exp, re.IGNORECASE)}),
dict(name='map', attrs={'id': re.compile( dict(attrs={'class': re.compile(
map_reg_exp, re.IGNORECASE)}), skip_reg_exp, re.IGNORECASE)}),
dict(name='map', attrs={'name': re.compile( dict(name='map', attrs={'id': re.compile(
map_reg_exp, re.IGNORECASE)}), map_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id': re.compile( dict(name='map', attrs={'name': re.compile(
social_bookmarks_reg_exp, re.IGNORECASE)}), map_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id': re.compile( dict(name='div', attrs={'id': re.compile(
blq_mast_reg_exp, re.IGNORECASE)}), social_bookmarks_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'id': re.compile(
sharesb_reg_exp, re.IGNORECASE)}), blq_mast_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={ dict(name='div', attrs={'class': re.compile(
'class': re.compile(o_reg_exp, re.IGNORECASE)}), sharesb_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={
promo_top_reg_exp, re.IGNORECASE)}), 'class': re.compile(o_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
promo_bottom_reg_exp, re.IGNORECASE)}), promo_top_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={ dict(name='div', attrs={'class': re.compile(
'class': re.compile(nlp_reg_exp, re.IGNORECASE)}), promo_bottom_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={
mva_or_mvb_reg_exp, re.IGNORECASE)}), 'class': re.compile(nlp_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
mvtb_reg_exp, re.IGNORECASE)}), mva_or_mvb_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
blq_toplink_reg_exp, re.IGNORECASE)}), mvtb_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
prods_services_01_reg_exp, re.IGNORECASE)}), blq_toplink_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
prods_services_02_reg_exp, re.IGNORECASE)}), prods_services_01_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
blq_misc_01_reg_exp, re.IGNORECASE)}), prods_services_02_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
blq_misc_02_reg_exp, re.IGNORECASE)}), blq_misc_01_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
puffbox_reg_exp, re.IGNORECASE)}), blq_misc_02_reg_exp, re.IGNORECASE)}),
dict(attrs={'class': re.compile( dict(name='div', attrs={'class': re.compile(
sibtbg_reg_exp, re.IGNORECASE)}), puffbox_reg_exp, re.IGNORECASE)}),
dict(attrs={'class': re.compile( dict(attrs={'class': re.compile(
storyextra_reg_exp, re.IGNORECASE)}) sibtbg_reg_exp, re.IGNORECASE)}),
] dict(attrs={'class': re.compile(
storyextra_reg_exp, re.IGNORECASE)})
]
# Uses url to create and return the 'printer friendly' version of the url. # Uses url to create and return the 'printer friendly' version of the url.
# In other words the 'print this page' address of the page. # In other words the 'print this page' address of the page.
@ -625,6 +626,11 @@ class BBCNewsSportBlog(BasicNewsRecipe):
return print_url return print_url
def canonicalize_internal_url(self, url, is_link=True):
    """Canonicalize ``url``, ignoring a trailing ``?print=true`` query.

    When ``url`` ends with ``?print=true`` the query part is dropped
    before the URL is handed to ``BasicNewsRecipe``'s own
    ``canonicalize_internal_url``; any other URL is passed through
    unchanged. ``is_link`` is forwarded as-is.

    NOTE(review): the suffix is presumably the one appended by this
    recipe's print-version logic, which is not visible here; confirm.
    """
    has_print_query = url.endswith('?print=true')
    if has_print_query:
        # Keep only the text in front of the last '?'.
        url, _sep, _query = url.rpartition('?')
    return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)
# Remove articles in feeds based on a string in the article title or url. # Remove articles in feeds based on a string in the article title or url.
# #
# Code logic written by: Starson17 - posted in: "Recipes - Re-usable code" # Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"