From 6fc515a66a6d62d88bfcb401277727e30fcb8ba8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 19 Jan 2014 18:22:48 +0530 Subject: [PATCH] Update Daily Express --- recipes/daily_express.recipe | 88 ++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/recipes/daily_express.recipe b/recipes/daily_express.recipe index f356771592..8a28de2cff 100644 --- a/recipes/daily_express.recipe +++ b/recipes/daily_express.recipe @@ -1,72 +1,70 @@ +# vim:fileencoding=UTF-8 +from __future__ import unicode_literals import re from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1376229553(BasicNewsRecipe): - title = u'Daily Express' + title = 'Daily Express' __author__ = 'Dave Asbury' # 9-9-13 added article author and now use (re.compile(r'>[\w].+? News<' # 16-11-13 cover adjustment + # 19.1.14 changes due to website changes breaking recipe encoding = 'utf-8' remove_empty_feeds = True - #remove_javascript = True + remove_javascript = True no_stylesheets = True oldest_article = 1 - max_articles_per_feed = 10 + max_articles_per_feed = 2 #auto_cleanup = True compress_news_images = True compress_news_images_max_size = 30 ignore_duplicate_articles = {'title', 'url'} masthead_url = 'http://cdn.images.dailyexpress.co.uk/img/page/express_logo.png' - - preprocess_regexps = [ - - (re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: ''), - (re.compile(r'Related articles', re.IGNORECASE | re.DOTALL), lambda match: ''), - (re.compile(r'Add Your Comment<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - (re.compile(r'>More [\w].+?<', re.IGNORECASE), lambda match: '><'), - (re.compile(r'>[\w].+? News<', re.IGNORECASE), lambda match: '><'), - #(re.compile(r'Health News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - #(re.compile(r'Car News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - #(re.compile(r'TV & Radio News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - #(re.compile(r'Food & Recipe News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - #(re.compile(r'More City & Business<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - #(re.compile(r'Travel News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - #(re.compile(r'Garden News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - #(re.compile(r'Fashion & Beauty News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - #(re.compile(r'More Personal Finance<', re.IGNORECASE | re.DOTALL), lambda match: '<'), - #(re.compile(r'

More UK

', re.IGNORECASE | re.DOTALL), lambda match: ''), - - ] + #conversion_options = { 'linearize_tables' : True } remove_tags = [ dict(attrs={'class' : 'quote'}), - #dict(attrs={'class' : 'author'}), + dict(attrs={'class' : 'mainFooter cf'}), dict(name='footer'), dict(attrs={'id' : 'header_addons'}), - dict(attrs={'class' : 'hoverException'}), + dict(attrs={'class' : 'hoverException'}), dict(name='_li'),dict(name='li'), - dict(attrs={'class' : 'box related-articles clear'}), + dict(attrs={'class' : 'box related-articles clear'}), dict(attrs={'class' : 'news-list'}), dict(attrs={'class' : 'sponsored-section'}), dict(attrs={'class' : 'pull-quote on-right'}), dict(attrs={'class' : 'pull-quote on-left'}), ] - keep_only_tags = [ - dict(name='h1'), - dict(attrs={'class' : 'publish-info'}), - dict(name='h3', limit=2), - dict(attrs={'class' : 'clearfix hR new-style'}), - ] + remove_tags_after = [dict(attrs={'class' : 'clearfix hR new-style'})] + extra_css = ''' + h1{font-weight:bold;font-size:175%;} + h2{font-weight:normal;font-size:75%;} + #p{font-size:14px;} + #body{font-size:14px;} + .photo-caption {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:40%;} + .publish-info {font-size:50%;} + .photo img {display: block;margin-left: auto;margin-right: auto;width:100%;} + ''' - feeds = [(u'UK News', u'http://www.express.co.uk/posts/rss/1/uk'), - (u'World News',u'http://www.express.co.uk/posts/rss/78/world'), - (u'Finance',u'http://www.express.co.uk/posts/rss/21/finance'), - (u'Sport',u'http://www.express.co.uk/posts/rss/65/sport'), - (u'Entertainment',u'http://www.express.co.uk/posts/rss/18/entertainment'), - (u'Lifestyle',u'http://www.express.co.uk/posts/rss/8/life&style'), - (u'Fun',u'http://www.express.co.uk/posts/rss/110/fun'), - ] + feeds = [ + ('UK News', 'http://www.express.co.uk/posts/rss/1/uk'), + ('World News','http://www.express.co.uk/posts/rss/78/world'), + ('Finance','http://www.express.co.uk/posts/rss/21/finance'), + ('Sport','http://www.express.co.uk/posts/rss/65/sport'), + ('Entertainment','http://www.express.co.uk/posts/rss/18/entertainment'), + ('Lifestyle','http://www.express.co.uk/posts/rss/8/life&style'), + ('Fun','http://www.express.co.uk/posts/rss/110/fun'), + ] + + def preprocess_raw_html(self, raw_html, url): + for pat, f in [ + (re.compile(r'', re.DOTALL), lambda m: ''), + (re.compile(r'',re.DOTALL), lambda m: ''), + (re.compile(r'',re.DOTALL), lambda m: ''), + ]: + raw_html = pat.sub(f, raw_html) + return raw_html def get_cover_url(self): soup = self.index_to_soup('http://www.express.co.uk/ourpaper/') @@ -87,12 +85,4 @@ class AdvancedUserRecipe1376229553(BasicNewsRecipe): return cover_url - extra_css = ''' - h1{font-weight:bold;font-size:175%;} - h2{font-weight:normal;font-size:75%;} - #p{font-size:14px;} - #body{font-size:14px;} - .photo-caption {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:40%;} - .publish-info {font-size:50%;} - .photo img {display: block;margin-left: auto;margin-right: auto;width:100%;} - ''' +