diff --git a/recipes/daily_express.recipe b/recipes/daily_express.recipe
index f356771592..8a28de2cff 100644
--- a/recipes/daily_express.recipe
+++ b/recipes/daily_express.recipe
@@ -1,72 +1,70 @@
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1376229553(BasicNewsRecipe):
- title = u'Daily Express'
+ title = 'Daily Express'
__author__ = 'Dave Asbury'
# 9-9-13 added article author and now use (re.compile(r'>[\w].+? News<'
# 16-11-13 cover adjustment
+ # 19.1.14 changes due to website changes breaking recipe
encoding = 'utf-8'
remove_empty_feeds = True
- #remove_javascript = True
+ remove_javascript = True
no_stylesheets = True
oldest_article = 1
- max_articles_per_feed = 10
+ max_articles_per_feed = 2
#auto_cleanup = True
compress_news_images = True
compress_news_images_max_size = 30
ignore_duplicate_articles = {'title', 'url'}
masthead_url = 'http://cdn.images.dailyexpress.co.uk/img/page/express_logo.png'
-
- preprocess_regexps = [
-
- (re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: ''),
- (re.compile(r'Related articles', re.IGNORECASE | re.DOTALL), lambda match: ''),
- (re.compile(r'Add Your Comment<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- (re.compile(r'>More [\w].+?<', re.IGNORECASE), lambda match: '><'),
- (re.compile(r'>[\w].+? News<', re.IGNORECASE), lambda match: '><'),
- #(re.compile(r'Health News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- #(re.compile(r'Car News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- #(re.compile(r'TV & Radio News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- #(re.compile(r'Food & Recipe News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- #(re.compile(r'More City & Business<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- #(re.compile(r'Travel News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- #(re.compile(r'Garden News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- #(re.compile(r'Fashion & Beauty News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- #(re.compile(r'More Personal Finance<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
- #(re.compile(r'More UK', re.IGNORECASE | re.DOTALL), lambda match: ''),
-
- ]
+ #conversion_options = { 'linearize_tables' : True }
remove_tags = [
dict(attrs={'class' : 'quote'}),
- #dict(attrs={'class' : 'author'}),
+ dict(attrs={'class' : 'mainFooter cf'}),
dict(name='footer'),
dict(attrs={'id' : 'header_addons'}),
- dict(attrs={'class' : 'hoverException'}),
+ dict(attrs={'class' : 'hoverException'}),
dict(name='_li'),dict(name='li'),
- dict(attrs={'class' : 'box related-articles clear'}),
+ dict(attrs={'class' : 'box related-articles clear'}),
dict(attrs={'class' : 'news-list'}),
dict(attrs={'class' : 'sponsored-section'}),
dict(attrs={'class' : 'pull-quote on-right'}),
dict(attrs={'class' : 'pull-quote on-left'}),
]
- keep_only_tags = [
- dict(name='h1'),
- dict(attrs={'class' : 'publish-info'}),
- dict(name='h3', limit=2),
- dict(attrs={'class' : 'clearfix hR new-style'}),
- ]
+ remove_tags_after = [dict(attrs={'class' : 'clearfix hR new-style'})]
+ extra_css = '''
+ h1{font-weight:bold;font-size:175%;}
+ h2{font-weight:normal;font-size:75%;}
+ #p{font-size:14px;}
+ #body{font-size:14px;}
+ .photo-caption {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:40%;}
+ .publish-info {font-size:50%;}
+ .photo img {display: block;margin-left: auto;margin-right: auto;width:100%;}
+ '''
- feeds = [(u'UK News', u'http://www.express.co.uk/posts/rss/1/uk'),
- (u'World News',u'http://www.express.co.uk/posts/rss/78/world'),
- (u'Finance',u'http://www.express.co.uk/posts/rss/21/finance'),
- (u'Sport',u'http://www.express.co.uk/posts/rss/65/sport'),
- (u'Entertainment',u'http://www.express.co.uk/posts/rss/18/entertainment'),
- (u'Lifestyle',u'http://www.express.co.uk/posts/rss/8/life&style'),
- (u'Fun',u'http://www.express.co.uk/posts/rss/110/fun'),
- ]
+ feeds = [
+ ('UK News', 'http://www.express.co.uk/posts/rss/1/uk'),
+ ('World News','http://www.express.co.uk/posts/rss/78/world'),
+ ('Finance','http://www.express.co.uk/posts/rss/21/finance'),
+ ('Sport','http://www.express.co.uk/posts/rss/65/sport'),
+ ('Entertainment','http://www.express.co.uk/posts/rss/18/entertainment'),
+ ('Lifestyle','http://www.express.co.uk/posts/rss/8/life&style'),
+ ('Fun','http://www.express.co.uk/posts/rss/110/fun'),
+ ]
+
+ def preprocess_raw_html(self, raw_html, url):  # applies each (pattern, replacement) pair listed below to raw_html and returns the result
+ for pat, f in [
+ (re.compile(r'', re.DOTALL), lambda m: ''),  # NOTE(review): the pattern is empty as shown here, so this substitution leaves the text unchanged; the original pattern may have been lost - confirm against the upstream recipe
+ (re.compile(r'',re.DOTALL), lambda m: ''),  # NOTE(review): empty pattern here as well - confirm
+ (re.compile(r'',re.DOTALL), lambda m: ''),  # NOTE(review): empty pattern here as well - confirm
+ ]:
+ raw_html = pat.sub(f, raw_html)  # substitutions run in list order; each one sees the output of the previous one
+ return raw_html  # the url argument is not used in this method
def get_cover_url(self):
soup = self.index_to_soup('http://www.express.co.uk/ourpaper/')
@@ -87,12 +85,4 @@ class AdvancedUserRecipe1376229553(BasicNewsRecipe):
return cover_url
- extra_css = '''
- h1{font-weight:bold;font-size:175%;}
- h2{font-weight:normal;font-size:75%;}
- #p{font-size:14px;}
- #body{font-size:14px;}
- .photo-caption {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:40%;}
- .publish-info {font-size:50%;}
- .photo img {display: block;margin-left: auto;margin-right: auto;width:100%;}
- '''
+