mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update BBC News
Fixes #1698322 [BBC News Script Needs Update](https://bugs.launchpad.net/calibre/+bug/1698322)
This commit is contained in:
parent
f5446f68a7
commit
4cebbfd766
@ -24,7 +24,13 @@ import re
|
|||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
class BBCNewsSportBlog(BasicNewsRecipe):
|
def classes(classes):
|
||||||
|
q = frozenset(classes.split(' '))
|
||||||
|
return dict(attrs={
|
||||||
|
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
||||||
|
|
||||||
|
|
||||||
|
class BBCNews(BasicNewsRecipe):
|
||||||
|
|
||||||
#
|
#
|
||||||
# **** IMPORTANT USERS READ ME ****
|
# **** IMPORTANT USERS READ ME ****
|
||||||
@ -42,7 +48,7 @@ class BBCNewsSportBlog(BasicNewsRecipe):
|
|||||||
#
|
#
|
||||||
# There are 68 feeds below which constitute the bulk of the available rss
|
# There are 68 feeds below which constitute the bulk of the available rss
|
||||||
# feeds on the BBC web site. These include 5 blogs by editors and
|
# feeds on the BBC web site. These include 5 blogs by editors and
|
||||||
# correspondants, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
|
# correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
|
||||||
# Wales, Scotland Business), and 7 Welsh language feeds.
|
# Wales, Scotland Business), and 7 Welsh language feeds.
|
||||||
#
|
#
|
||||||
# Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
|
# Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
|
||||||
@ -89,13 +95,9 @@ class BBCNewsSportBlog(BasicNewsRecipe):
|
|||||||
("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
|
("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
|
||||||
# ("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
|
# ("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
|
||||||
# ("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
|
# ("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
|
||||||
("Blog: Nick Robinson (Political Editor)",
|
|
||||||
"http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
|
|
||||||
# ("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
|
# ("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
|
||||||
# ("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
|
# ("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
|
||||||
# ("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
|
# ("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
|
||||||
("Blog: Rory Cellan-Jones (Technology correspondent)",
|
|
||||||
"http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
|
|
||||||
("Sport Front Page",
|
("Sport Front Page",
|
||||||
"http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
|
"http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
|
||||||
# ("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
|
# ("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
|
||||||
@ -233,6 +235,8 @@ class BBCNewsSportBlog(BasicNewsRecipe):
|
|||||||
|
|
||||||
# Removes empty feeds - why keep them!?
|
# Removes empty feeds - why keep them!?
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
|
resolve_internal_links = True
|
||||||
|
|
||||||
# Create a custom title which fits nicely in the Kindle title list.
|
# Create a custom title which fits nicely in the Kindle title list.
|
||||||
# Requires "import time" above class declaration, and replacing
|
# Requires "import time" above class declaration, and replacing
|
||||||
@ -241,22 +245,17 @@ class BBCNewsSportBlog(BasicNewsRecipe):
|
|||||||
#
|
#
|
||||||
# custom_title = "BBC News - " + time.strftime('%d %b %Y')
|
# custom_title = "BBC News - " + time.strftime('%d %b %Y')
|
||||||
|
|
||||||
'''
|
# Conversion options for advanced users. Avoid setting 'linearize_tables'
|
||||||
# Conversion options for advanced users, but don't forget to comment out the
|
# as that plays havoc with the 'old style' table based pages.
|
||||||
# current conversion_options below. Avoid setting 'linearize_tables' as that
|
conversion_options = {
|
||||||
# plays havoc with the 'old style' table based pages.
|
# 'title' : title,
|
||||||
#
|
# 'comments' : description,
|
||||||
conversion_options = { 'title' : title,
|
# 'tags' : tags,
|
||||||
'comments' : description,
|
# 'language' : language,
|
||||||
'tags' : tags,
|
# 'publisher' : publisher,
|
||||||
'language' : language,
|
# 'authors' : publisher,
|
||||||
'publisher' : publisher,
|
'smarten_punctuation' : True
|
||||||
'authors' : publisher,
|
}
|
||||||
'smarten_punctuation' : True
|
|
||||||
}
|
|
||||||
'''
|
|
||||||
|
|
||||||
conversion_options = {'smarten_punctuation': True}
|
|
||||||
|
|
||||||
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
||||||
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
||||||
@ -513,88 +512,90 @@ class BBCNewsSportBlog(BasicNewsRecipe):
|
|||||||
# Remove 'storyextra' - links to relevant articles and external sites.
|
# Remove 'storyextra' - links to relevant articles and external sites.
|
||||||
storyextra_reg_exp = '^.*story[_ -]*extra.*$'
|
storyextra_reg_exp = '^.*story[_ -]*extra.*$'
|
||||||
|
|
||||||
remove_tags = [dict(name='div', attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}),
|
remove_tags = [
|
||||||
dict(name='div', attrs={'class': re.compile(
|
classes('sharetools share-tools--no-event-tag'),
|
||||||
share_help_reg_exp, re.IGNORECASE)}),
|
dict(name='div', attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
embedded_hyper_reg_exp, re.IGNORECASE)}),
|
share_help_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
hypertabs_reg_exp, re.IGNORECASE)}),
|
embedded_hyper_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
video_reg_exp, re.IGNORECASE)}),
|
hypertabs_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
audio_reg_exp, re.IGNORECASE)}),
|
video_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
picture_gallery_reg_exp, re.IGNORECASE)}),
|
audio_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
slideshow_reg_exp, re.IGNORECASE)}),
|
picture_gallery_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
quote_reg_exp, re.IGNORECASE)}),
|
slideshow_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
hidden_reg_exp, re.IGNORECASE)}),
|
quote_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
comment_reg_exp, re.IGNORECASE)}),
|
hidden_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
story_actions_reg_exp, re.IGNORECASE)}),
|
comment_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
bookmark_list_reg_exp, re.IGNORECASE)}),
|
story_actions_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'id': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
secondary_content_reg_exp, re.IGNORECASE)}),
|
bookmark_list_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'id': re.compile(
|
dict(name='div', attrs={'id': re.compile(
|
||||||
featured_content_reg_exp, re.IGNORECASE)}),
|
secondary_content_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'id': re.compile(
|
dict(name='div', attrs={'id': re.compile(
|
||||||
navigation_reg_exp, re.IGNORECASE)}),
|
featured_content_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='form', attrs={'id': re.compile(
|
dict(name='div', attrs={'id': re.compile(
|
||||||
form_reg_exp, re.IGNORECASE)}),
|
navigation_reg_exp, re.IGNORECASE)}),
|
||||||
dict(attrs={'class': re.compile(
|
dict(name='form', attrs={'id': re.compile(
|
||||||
quote_reg_exp, re.IGNORECASE)}),
|
form_reg_exp, re.IGNORECASE)}),
|
||||||
dict(attrs={'class': re.compile(
|
dict(attrs={'class': re.compile(
|
||||||
hidden_reg_exp, re.IGNORECASE)}),
|
quote_reg_exp, re.IGNORECASE)}),
|
||||||
dict(attrs={'class': re.compile(
|
dict(attrs={'class': re.compile(
|
||||||
social_links_reg_exp, re.IGNORECASE)}),
|
hidden_reg_exp, re.IGNORECASE)}),
|
||||||
dict(attrs={'class': re.compile(
|
dict(attrs={'class': re.compile(
|
||||||
comment_reg_exp, re.IGNORECASE)}),
|
social_links_reg_exp, re.IGNORECASE)}),
|
||||||
dict(attrs={'class': re.compile(
|
dict(attrs={'class': re.compile(
|
||||||
skip_reg_exp, re.IGNORECASE)}),
|
comment_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='map', attrs={'id': re.compile(
|
dict(attrs={'class': re.compile(
|
||||||
map_reg_exp, re.IGNORECASE)}),
|
skip_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='map', attrs={'name': re.compile(
|
dict(name='map', attrs={'id': re.compile(
|
||||||
map_reg_exp, re.IGNORECASE)}),
|
map_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'id': re.compile(
|
dict(name='map', attrs={'name': re.compile(
|
||||||
social_bookmarks_reg_exp, re.IGNORECASE)}),
|
map_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'id': re.compile(
|
dict(name='div', attrs={'id': re.compile(
|
||||||
blq_mast_reg_exp, re.IGNORECASE)}),
|
social_bookmarks_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'id': re.compile(
|
||||||
sharesb_reg_exp, re.IGNORECASE)}),
|
blq_mast_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={
|
dict(name='div', attrs={'class': re.compile(
|
||||||
'class': re.compile(o_reg_exp, re.IGNORECASE)}),
|
sharesb_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={
|
||||||
promo_top_reg_exp, re.IGNORECASE)}),
|
'class': re.compile(o_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
promo_bottom_reg_exp, re.IGNORECASE)}),
|
promo_top_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={
|
dict(name='div', attrs={'class': re.compile(
|
||||||
'class': re.compile(nlp_reg_exp, re.IGNORECASE)}),
|
promo_bottom_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={
|
||||||
mva_or_mvb_reg_exp, re.IGNORECASE)}),
|
'class': re.compile(nlp_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
mvtb_reg_exp, re.IGNORECASE)}),
|
mva_or_mvb_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
blq_toplink_reg_exp, re.IGNORECASE)}),
|
mvtb_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
prods_services_01_reg_exp, re.IGNORECASE)}),
|
blq_toplink_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
prods_services_02_reg_exp, re.IGNORECASE)}),
|
prods_services_01_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
blq_misc_01_reg_exp, re.IGNORECASE)}),
|
prods_services_02_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
blq_misc_02_reg_exp, re.IGNORECASE)}),
|
blq_misc_01_reg_exp, re.IGNORECASE)}),
|
||||||
dict(name='div', attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
puffbox_reg_exp, re.IGNORECASE)}),
|
blq_misc_02_reg_exp, re.IGNORECASE)}),
|
||||||
dict(attrs={'class': re.compile(
|
dict(name='div', attrs={'class': re.compile(
|
||||||
sibtbg_reg_exp, re.IGNORECASE)}),
|
puffbox_reg_exp, re.IGNORECASE)}),
|
||||||
dict(attrs={'class': re.compile(
|
dict(attrs={'class': re.compile(
|
||||||
storyextra_reg_exp, re.IGNORECASE)})
|
sibtbg_reg_exp, re.IGNORECASE)}),
|
||||||
]
|
dict(attrs={'class': re.compile(
|
||||||
|
storyextra_reg_exp, re.IGNORECASE)})
|
||||||
|
]
|
||||||
|
|
||||||
# Uses url to create and return the 'printer friendly' version of the url.
|
# Uses url to create and return the 'printer friendly' version of the url.
|
||||||
# In other words the 'print this page' address of the page.
|
# In other words the 'print this page' address of the page.
|
||||||
@ -625,6 +626,11 @@ class BBCNewsSportBlog(BasicNewsRecipe):
|
|||||||
|
|
||||||
return print_url
|
return print_url
|
||||||
|
|
||||||
|
def canonicalize_internal_url(self, url, is_link=True):
|
||||||
|
if url.endswith('?print=true'):
|
||||||
|
url = url.rpartition('?')[0]
|
||||||
|
return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)
|
||||||
|
|
||||||
# Remove articles in feeds based on a string in the article title or url.
|
# Remove articles in feeds based on a string in the article title or url.
|
||||||
#
|
#
|
||||||
# Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
|
# Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user