diff --git a/recipes/atlantic.recipe b/recipes/atlantic.recipe index 54449b2412..83a006f8e0 100644 --- a/recipes/atlantic.recipe +++ b/recipes/atlantic.recipe @@ -5,9 +5,14 @@ __copyright__ = '2008, Kovid Goyal ' ''' theatlantic.com ''' -import re +import html5lib +from lxml import html from calibre.web.feeds.news import BasicNewsRecipe +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={'class':lambda x:x and frozenset(x.split()).intersection(q)}) + class TheAtlantic(BasicNewsRecipe): title = 'The Atlantic' @@ -18,7 +23,7 @@ class TheAtlantic(BasicNewsRecipe): encoding = 'utf-8' keep_only_tags = [ - {'attrs':{'class':['article-header', 'article-body', 'article-magazine', 'metadata', 'article-cover-content']}}, + classes('article-header article-body article-magazine metadata article-cover-content lead-img'), ] remove_tags = [ {'name': ['meta', 'link', 'noscript']}, @@ -26,12 +31,18 @@ class TheAtlantic(BasicNewsRecipe): {'attrs':{'class':lambda x: x and 'article-tools' in x}}, {'src':lambda x:x and 'spotxchange.com' in x}, ] + remove_tags_after = classes('article-body') + no_stylesheets = True remove_attributes = ['style'] - preprocess_regexps = [ - (re.compile(r'', re.DOTALL), lambda m: ''), - (re.compile(r'^.*