Update Telegraph UK

This commit is contained in:
Kovid Goyal 2016-05-11 16:57:47 +05:30
parent 77ef24afde
commit 7128a9327c

View File

@ -4,8 +4,13 @@ __copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
telegraph.co.uk
'''
import json
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
    """Build a BeautifulSoup-style matcher for a set of CSS class names.

    ``classes`` is a whitespace-separated string of class names. The result
    is a dict of the form ``{'attrs': {'class': matcher}}``. ``matcher``
    takes the value of a tag's ``class`` attribute and returns:

    * the value itself when it is falsy (``None`` or ``''``), or
    * the frozenset of names shared between that value and ``classes``
      (empty, hence falsy, when nothing matches).
    """
    # split() rather than split(' ') so repeated spaces or tabs in the
    # argument do not put an empty string into the wanted set.
    wanted = frozenset(classes.split())

    def matches(value):
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': matches})
class TelegraphUK(BasicNewsRecipe):
title = 'Telegraph.co.uk'
__author__ = 'Darko Miletic and Sujata Raman'
@ -13,41 +18,16 @@ class TelegraphUK(BasicNewsRecipe):
# Class-level recipe settings. Their exact meaning is defined by
# BasicNewsRecipe, which is imported from calibre and not visible here.
# NOTE(review): oldest_article is presumably a number of days — confirm
# against the BasicNewsRecipe documentation.
oldest_article = 2
category = 'news, politics, UK'
publisher = 'Telegraph Media Group ltd.'
compress_news_images = True
max_articles_per_feed = 100
no_stylesheets = True
language = 'en_GB'
encoding = 'utf-8'
# Set of article fields; presumably the keys used to detect duplicate
# articles — TODO confirm.
ignore_duplicate_articles = {'title', 'url'}
remove_empty_feeds = True
use_embedded_content = False
# CSS text with rules for h1, h2, links and the .story, .byline and
# .imageExtras classes (fonts, sizes and colours).
# NOTE(review): presumably applied by calibre to the downloaded articles —
# confirm.
extra_css = '''
h1{font-family :Arial,Helvetica,sans-serif; font-size:large; }
h2{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#444444;}
.story{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
.byline{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
a{color:#234B7B; }
.imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
'''
# Maps option names to the class attributes defined above. `description` is
# not defined in the visible part of this listing; it is presumably a class
# attribute declared earlier — TODO confirm.
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
# Selectors matching <div> elements with class 'storyHead' or 'byline', and
# the <div> with id 'mainBodyArea'.
# NOTE(review): keep_only_tags and remove_tags are assigned again further
# down in this listing; this pair looks like the earlier revision — confirm
# which assignment is live.
keep_only_tags = [
dict(name='div', attrs={'class':['storyHead','byline']})
,dict(name='div', attrs={'id':'mainBodyArea'})
]
# Selectors for <div>, <ul> and <span> elements identified by the listed
# class names (related links, image index/navigation, share bar, comment
# counters, judging by the names).
remove_tags = [dict(name='div', attrs={
'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']})
,dict(name='ul' , attrs={'class':['shareThis shareBottom']})
,dict(name='span', attrs={'class':['num','placeComment']})
]
feeds = [
(u'UK News' , u'http://www.telegraph.co.uk/news/uknews/rss')
(u'UK News' , u'http://www.telegraph.co.uk/news/uknews/rss')
,(u'World News' , u'http://www.telegraph.co.uk/news/worldnews/rss')
,(u'Politics' , u'http://www.telegraph.co.uk/news/newstopics/politics/rss')
,(u'Finance' , u'http://www.telegraph.co.uk/finance/rss')
@ -59,12 +39,17 @@ class TelegraphUK(BasicNewsRecipe):
,(u'Comment' , u'http://www.telegraph.co.uk/comment/rss')
,(u'Travel' , u'http://www.telegraph.co.uk/travel/rss')
,(u'How about that?', u'http://www.telegraph.co.uk/news/newstopics/howaboutthat/rss')
]
def populate_article_metadata(self, article, soup, first):
    """Register the first image of an article as its table-of-contents thumbnail.

    Does nothing unless ``first`` is true and this object provides
    ``add_toc_thumbnail``. Otherwise the first ``<img>`` found in ``soup`` is
    looked up and its ``src`` is passed to
    ``self.add_toc_thumbnail(article, src)``.

    An ``<img>`` without a usable ``src`` is skipped instead of raising
    ``KeyError``.
    """
    if not first or not hasattr(self, 'add_toc_thumbnail'):
        return
    img = soup.find('img')
    if img is None:
        return
    # .get() instead of ['src']: an <img> may carry no src attribute at all.
    src = img.get('src')
    if src:
        self.add_toc_thumbnail(article, src)
]
# Selectors matching elements that carry any of the listed CSS classes, plus
# the element marked itemprop="articleBody".
# NOTE(review): presumably BasicNewsRecipe keeps only the matched elements
# of each fetched page — confirm against the calibre recipe API.
keep_only_tags = [
classes('lead-asset-image-container headline__heading footer-author'),
dict(itemprop='articleBody'),
]
# Selectors matching <link>, <meta> and <style> tags and anything with the
# 'videoPlayer' class.
remove_tags = [
dict(name=['link', 'meta', 'style']),
classes('videoPlayer'),
]
# Evaluates to ['width', 'height'].
remove_attributes = 'width height'.split()
def get_article_url(self, article):
url = article.get('link', None)
@ -72,3 +57,11 @@ class TelegraphUK(BasicNewsRecipe):
url = None
return url
def preprocess_html(self, soup):
    """Point responsive images at a rendition wider than 700 pixels.

    Every ``<img>`` carrying a ``data-frz-src-array`` attribute is examined.
    The attribute is parsed as JSON after its single quotes are replaced by
    double quotes, and is expected to yield a list of dicts with ``src`` and
    ``width`` entries. The first entry whose width exceeds 700 supplies the
    image's new ``src``.

    Malformed data (unparsable JSON, a non-list payload, a non-numeric width,
    a missing ``src``) is skipped, so one bad attribute cannot abort
    processing of the whole page.

    Returns the same ``soup`` object, modified in place.
    """
    for img in soup.findAll('img', attrs={'data-frz-src-array': True}):
        try:
            candidates = json.loads(img['data-frz-src-array'].replace("'", '"'))
        except ValueError:
            # Not valid JSON even after quote normalisation: leave as is.
            continue
        if not isinstance(candidates, list):
            continue
        for item in candidates:
            if not isinstance(item, dict):
                continue
            try:
                width = int(item.get('width', 0))
            except (TypeError, ValueError):
                continue
            src = item.get('src')
            if width > 700 and src:
                img['src'] = src
                break
    return soup