diff --git a/resources/recipes/economist.recipe b/resources/recipes/economist.recipe index 4ae0bb8b05..a6d0e08eea 100644 --- a/resources/recipes/economist.recipe +++ b/resources/recipes/economist.recipe @@ -24,9 +24,10 @@ class Economist(BasicNewsRecipe): oldest_article = 7.0 cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg' remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), - dict(attrs={'class':['dblClkTrk']})] - remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body') + dict(attrs={'class':['dblClkTrk', 'ec-article-info']})] + keep_only_tags = [dict(id='ec-article-body')] needs_subscription = True + no_stylesheets = True preprocess_regexps = [(re.compile('.*', re.DOTALL), lambda x:'')] @@ -87,7 +88,7 @@ class Economist(BasicNewsRecipe): continue a = tag.find('a', href=True) if a is not None: - url=a['href'].replace('displaystory', 'PrinterFriendly').strip() + url=a['href'].split('?')[0]+'/print' if url.startswith('Printer'): url = '/'+url if url.startswith('/'): diff --git a/resources/recipes/economist_free.recipe b/resources/recipes/economist_free.recipe index cdcd457501..1a783521f6 100644 --- a/resources/recipes/economist_free.recipe +++ b/resources/recipes/economist_free.recipe @@ -17,8 +17,9 @@ class Economist(BasicNewsRecipe): oldest_article = 7.0 cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg' remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), - dict(attrs={'class':['dblClkTrk']})] - remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body') + dict(attrs={'class':['dblClkTrk', 'ec-article-info']})] + keep_only_tags = [dict(id='ec-article-body')] + no_stylesheets = True preprocess_regexps = [(re.compile('.*', re.DOTALL), lambda x:'')] @@ -88,19 +89,20 @@ class Economist(BasicNewsRecipe): br = browser() ret = br.open(url) raw = ret.read() - url = br.geturl().replace('displaystory', 'PrinterFriendly').strip() + url = br.geturl().split('?')[0]+'/print' root = html.fromstring(raw) - matches = root.xpath('//*[@class = "article-section"]') + matches = root.xpath('//*[@class = "ec-article-info"]') feedtitle = 'Miscellaneous' if matches: - feedtitle = string.capwords(html.tostring(matches[0], method='text', - encoding=unicode)) + feedtitle = string.capwords(html.tostring(matches[-1], method='text', + encoding=unicode).split('|')[-1].strip()) return (i, feedtitle, url, title, description, author, published) def eco_article_found(self, req, result): from calibre.web.feeds import Article i, feedtitle, link, title, description, author, published = result - self.log('Found print version for article:', title) + self.log('Found print version for article:', title, 'in', feedtitle, + 'at', link) a = Article(i, title, link, author, description, published, '')