Update BBC News

Fixes #1698322 [BBC News Script Needs Update](https://bugs.launchpad.net/calibre/+bug/1698322)
This commit is contained in:
Kovid Goyal 2017-06-16 14:34:20 +05:30
parent f5446f68a7
commit 4cebbfd766
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -24,7 +24,13 @@ import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class BBCNewsSportBlog(BasicNewsRecipe):
def classes(classes):
    """Build a tag selector matching any of the given CSS class names.

    ``classes`` is a whitespace-separated string of class names. The
    return value is a dict of the form ``{'attrs': {'class': predicate}}``,
    which is the shape used for entries of ``remove_tags``.

    ``predicate(value)`` receives a tag's ``class`` attribute value. For a
    falsy value (``None`` or ``''``) it returns that value unchanged;
    otherwise it returns the frozenset of class names shared between the
    tag and the requested names (empty, hence falsy, when none match).
    """
    # Split on any whitespace, exactly as the attribute value is split
    # below; splitting on a single space only would leave tab- or
    # newline-separated names glued together and unmatchable.
    wanted = frozenset(classes.split())

    def has_wanted_class(value):
        return value and frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}
class BBCNews(BasicNewsRecipe):
#
# **** IMPORTANT USERS READ ME ****
@ -42,7 +48,7 @@ class BBCNewsSportBlog(BasicNewsRecipe):
#
# There are 68 feeds below which constitute the bulk of the available rss
# feeds on the BBC web site. These include 5 blogs by editors and
# correspondants, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
# correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
# Wales, Scotland Business), and 7 Welsh language feeds.
#
# Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
@ -89,13 +95,9 @@ class BBCNewsSportBlog(BasicNewsRecipe):
("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
# ("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
# ("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
("Blog: Nick Robinson (Political Editor)",
"http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
# ("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
# ("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
# ("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
("Blog: Rory Cellan-Jones (Technology correspondent)",
"http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
("Sport Front Page",
"http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
# ("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
@ -233,6 +235,8 @@ class BBCNewsSportBlog(BasicNewsRecipe):
# Removes empty feeds - why keep them!?
remove_empty_feeds = True
ignore_duplicate_articles = {'title', 'url'}
resolve_internal_links = True
# Create a custom title which fits nicely in the Kindle title list.
# Requires "import time" above class declaration, and replacing
@ -241,22 +245,17 @@ class BBCNewsSportBlog(BasicNewsRecipe):
#
# custom_title = "BBC News - " + time.strftime('%d %b %Y')
'''
# Conversion options for advanced users, but don't forget to comment out the
# current conversion_options below. Avoid setting 'linearize_tables' as that
# plays havoc with the 'old style' table based pages.
#
conversion_options = { 'title' : title,
'comments' : description,
'tags' : tags,
'language' : language,
'publisher' : publisher,
'authors' : publisher,
# Conversion options for advanced users. Avoid setting 'linearize_tables'
# as that plays havoc with the 'old style' table based pages.
conversion_options = {
# 'title' : title,
# 'comments' : description,
# 'tags' : tags,
# 'language' : language,
# 'publisher' : publisher,
# 'authors' : publisher,
'smarten_punctuation' : True
}
'''
conversion_options = {'smarten_punctuation': True}
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
@ -513,7 +512,9 @@ class BBCNewsSportBlog(BasicNewsRecipe):
# Remove 'storyextra' - links to relevant articles and external sites.
storyextra_reg_exp = '^.*story[_ -]*extra.*$'
remove_tags = [dict(name='div', attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}),
remove_tags = [
classes('sharetools share-tools--no-event-tag'),
dict(name='div', attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile(
share_help_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile(
@ -625,6 +626,11 @@ class BBCNewsSportBlog(BasicNewsRecipe):
return print_url
def canonicalize_internal_url(self, url, is_link=True):
    """Canonicalize ``url`` after dropping a trailing ``?print=true``.

    When ``url`` ends with the ``?print=true`` query string, that suffix
    is removed; the resulting URL is then handed to
    ``BasicNewsRecipe.canonicalize_internal_url`` together with
    ``is_link``, and that call's result is returned unchanged.
    """
    print_suffix = '?print=true'
    if url.endswith(print_suffix):
        # The suffix contains the URL's last '?', so slicing it off keeps
        # exactly the text that precedes that final '?'.
        url = url[:-len(print_suffix)]
    return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)
# Remove articles in feeds based on a string in the article title or url.
#
# Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"