diff --git a/recipes/daily_express.recipe b/recipes/daily_express.recipe index 208fbc7172..dc55d3f0b8 100644 --- a/recipes/daily_express.recipe +++ b/recipes/daily_express.recipe @@ -1,10 +1,9 @@ -from calibre.web.feeds.news import BasicNewsRecipe import re +from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1376229553(BasicNewsRecipe): title = u'Daily Express' __author__ = 'Dave Asbury' - language = 'en_GB' - # 13-08-17 remove quoted text from article + # 9-9-13 added article author and now use (re.compile(r'>[\w].+? News<' encoding = 'utf-8' remove_empty_feeds = True #remove_javascript = True @@ -15,33 +14,49 @@ class AdvancedUserRecipe1376229553(BasicNewsRecipe): compress_news_images = True compress_news_images_max_size = 30 ignore_duplicate_articles = {'title', 'url'} + masthead_url = 'http://cdn.images.dailyexpress.co.uk/img/page/express_logo.png' + + preprocess_regexps = [ + + (re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: ''), + (re.compile(r'Related articles', re.IGNORECASE | re.DOTALL), lambda match: ''), + (re.compile(r'Add Your Comment<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + (re.compile(r'>More [\w].+?<', re.IGNORECASE), lambda match: '><'), + (re.compile(r'>[\w].+? News<', re.IGNORECASE), lambda match: '><'), + #(re.compile(r'Health News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + #(re.compile(r'Car News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + #(re.compile(r'TV & Radio News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + #(re.compile(r'Food & Recipe News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + #(re.compile(r'More City & Business<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + #(re.compile(r'Travel News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + #(re.compile(r'Garden News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + #(re.compile(r'Fashion & Beauty News<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + #(re.compile(r'More Personal Finance<', re.IGNORECASE | re.DOTALL), lambda match: '<'), + #(re.compile(r'

More UK

', re.IGNORECASE | re.DOTALL), lambda match: ''), + + ] remove_tags = [ dict(attrs={'class' : 'quote'}), - dict(attrs={'class' : 'author'}), + #dict(attrs={'class' : 'author'}), dict(name='footer'), dict(attrs={'id' : 'header_addons'}), dict(attrs={'class' : 'hoverException'}), dict(name='_li'),dict(name='li'), dict(attrs={'class' : 'box related-articles clear'}), dict(attrs={'class' : 'news-list'}), + dict(attrs={'class' : 'sponsored-section'}), + dict(attrs={'class' : 'pull-quote on-right'}), + dict(attrs={'class' : 'pull-quote on-left'}), + ] keep_only_tags = [ dict(name='h1'), dict(attrs={'class' : 'publish-info'}), - # dict(name='h3'), - #dict(name='section',attrs={'class' : 'photo'}), - #dict(name='section',attrs={'class' : 'text-description'}), - - dict(attrs={'class' : 'clearfix hR new-style'}), + dict(name='h3', limit=2), + dict(attrs={'class' : 'clearfix hR new-style'}), ] - preprocess_regexps = [ - (re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: '')] - - preprocess_regexps = [ - (re.compile(r'

More UK

', re.IGNORECASE | re.DOTALL), lambda match: '')] - feeds = [(u'UK News', u'http://www.express.co.uk/posts/rss/1/uk'), (u'World News',u'http://www.express.co.uk/posts/rss/78/world'), (u'Finance',u'http://www.express.co.uk/posts/rss/21/finance'),