mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #5619 (Problem with BBC news feeds)
This commit is contained in:
parent
25c4013b04
commit
8d8e40fed5
@ -1,38 +1,47 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
'''
|
'''
|
||||||
bbc.co.uk
|
news.bbc.co.uk
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
import re
|
||||||
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
|
||||||
class BBC(BasicNewsRecipe):
|
class BBC(BasicNewsRecipe):
|
||||||
title = u'The BBC'
|
title = 'The BBC'
|
||||||
__author__ = 'Kovid Goyal ans Sujata Raman'
|
__author__ = 'Darko Miletic'
|
||||||
description = 'Global news and current affairs from the British Broadcasting Corporation'
|
description = 'Global news and current affairs from the British Broadcasting Corporation'
|
||||||
language = 'en'
|
oldest_article = 2
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
no_stylesheets = True
|
||||||
|
#delay = 1
|
||||||
|
use_embedded_content = False
|
||||||
|
encoding = 'utf8'
|
||||||
|
publisher = 'BBC'
|
||||||
|
category = 'news, UK, world'
|
||||||
|
language = 'en_GB'
|
||||||
|
publication_type = 'newsportal'
|
||||||
|
extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
|
||||||
|
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
|
||||||
|
|
||||||
no_stylesheets = True
|
conversion_options = {
|
||||||
remove_tags = [dict(name='div', attrs={'class':'footer'}),
|
'comments' : description
|
||||||
{'id' : ['popstory','blq-footer']},
|
,'tags' : category
|
||||||
{'class' : ['arrup','links','relatedbbcsites','arr','promobottombg','bbccom_visibility_hidden', 'sharesb', 'sib606', 'mvtb', 'storyextra', 'sidebar1', 'bbccom_text','promotopbg', 'gppromo','promotopbg','bbccom_display_none']},
|
,'language' : language
|
||||||
]
|
,'publisher' : publisher
|
||||||
|
,'linearize_tables': True
|
||||||
|
}
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':'mainwrapper'})]
|
keep_only_tags = [
|
||||||
|
dict(attrs={'id' :['meta-information','story-body']})
|
||||||
extra_css = '''
|
,dict(attrs={'class':['mxb' ,'storybody' ]})
|
||||||
body{font-family:Arial,Helvetica,sans-serif; font-size:small; align:left}
|
]
|
||||||
h1{font-size:large;}
|
remove_tags = [
|
||||||
.sh{font-size:large; font-weight:bold}
|
dict(name=['object','link','table'])
|
||||||
.cap{font-size:xx-small; }
|
,dict(attrs={'class':['caption','caption full-width','story-actions','hidden','sharesb','audioInStoryC']})
|
||||||
.lu{font-size:xx-small; }
|
]
|
||||||
.ds{font-size:xx-small; }
|
remove_tags_after = dict(attrs={'class':'sharesb'})
|
||||||
.mvb{font-size:xx-small;}
|
remove_attributes = ['width','height']
|
||||||
.by1{font-size:x-small; color:#666666}
|
|
||||||
.byd{font-size:x-small;}
|
|
||||||
'''
|
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
|
('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
|
||||||
@ -50,22 +59,3 @@ class BBC(BasicNewsRecipe):
|
|||||||
('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
|
('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
|
||||||
]
|
]
|
||||||
|
|
||||||
def postprocess_html(self, soup, first):
|
|
||||||
|
|
||||||
for tag in soup.findAll(name= 'img', alt=""):
|
|
||||||
tag.extract()
|
|
||||||
|
|
||||||
for item in soup.findAll(align = "right"):
|
|
||||||
del item['align']
|
|
||||||
|
|
||||||
for tag in soup.findAll(name=['table', 'tr', 'td']):
|
|
||||||
tag.name = 'div'
|
|
||||||
|
|
||||||
return soup
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# def print_version(self, url):
|
|
||||||
# return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
|||||||
'''
|
'''
|
||||||
news.bbc.co.uk
|
news.bbc.co.uk
|
||||||
'''
|
'''
|
||||||
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
|
||||||
class BBC(BasicNewsRecipe):
|
class BBC(BasicNewsRecipe):
|
||||||
@ -18,22 +18,28 @@ class BBC(BasicNewsRecipe):
|
|||||||
encoding = 'utf8'
|
encoding = 'utf8'
|
||||||
publisher = 'BBC'
|
publisher = 'BBC'
|
||||||
category = 'news, UK, world'
|
category = 'news, UK, world'
|
||||||
language = 'en'
|
language = 'en_GB'
|
||||||
extra_css = ' body{ font-family: sans-serif; } .headline{font-size: xx-large; font-weight: bold} .ibox{display: block; margin: 20px 50px; padding: 10px; border: 1px solid } '
|
publication_type = 'newsportal'
|
||||||
|
extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
|
||||||
|
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
'comments' : description
|
'comments' : description
|
||||||
,'tags' : category
|
,'tags' : category
|
||||||
,'language' : language
|
,'language' : language
|
||||||
,'publisher' : publisher
|
,'publisher' : publisher
|
||||||
|
,'linearize_tables': True
|
||||||
}
|
}
|
||||||
|
|
||||||
remove_tags_before = dict(name='div',attrs={'class':'headline'})
|
keep_only_tags = [
|
||||||
remove_tags_after = dict(name='div', attrs={'class':'footer'})
|
dict(attrs={'id' :['meta-information','story-body']})
|
||||||
remove_tags = [
|
,dict(attrs={'class':['mxb' ,'storybody' ]})
|
||||||
dict(name=['object','link','script','iframe'])
|
|
||||||
,dict(name='div', attrs={'class':'footer'})
|
|
||||||
]
|
]
|
||||||
|
remove_tags = [
|
||||||
|
dict(name=['object','link','table','img'])
|
||||||
|
,dict(attrs={'class':['caption','caption full-width','story-actions','hidden','sharesb','audioInStoryC']})
|
||||||
|
]
|
||||||
|
remove_tags_after = dict(attrs={'class':'sharesb'})
|
||||||
|
remove_attributes = ['width','height']
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
|
('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
|
||||||
@ -51,10 +57,3 @@ class BBC(BasicNewsRecipe):
|
|||||||
('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
|
('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
|
||||||
]
|
]
|
||||||
|
|
||||||
def print_version(self, url):
|
|
||||||
emp,sep,rstrip = url.partition('http://')
|
|
||||||
return 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + rstrip
|
|
||||||
|
|
||||||
def get_article_url(self, article):
|
|
||||||
return article.get('guid', None)
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user