diff --git a/recipes/american_thinker.recipe b/recipes/american_thinker.recipe
index 6ecca8549c..663753264b 100644
--- a/recipes/american_thinker.recipe
+++ b/recipes/american_thinker.recipe
@@ -3,7 +3,15 @@ __copyright__ = '2010, Walt Anthony '
 '''
 www.americanthinker.com
 '''
+import html5lib
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.cleantext import clean_xml_chars
+from lxml import etree
+
+def CSSSelect(expr):
+    from cssselect import HTMLTranslator
+    from lxml.etree import XPath
+    return XPath(HTMLTranslator().css_to_xpath(expr))
 
 class AmericanThinker(BasicNewsRecipe):
     title = u'American Thinker'
@@ -18,7 +26,7 @@ class AmericanThinker(BasicNewsRecipe):
 
     ignore_duplicate_articles = {'title', 'url'}
     remove_javascript = True
-    no_stylesheets = True
+    remove_tags_before = dict(name='h1')
 
     conversion_options = {
         'comment' : description
@@ -27,7 +35,14 @@ class AmericanThinker(BasicNewsRecipe):
         , 'language' : language
         , 'linearize_tables' : True
     }
-    auto_claenup = True
+
+    def preprocess_raw_html(self, raw, url):
+        root = html5lib.parse(
+            clean_xml_chars(raw), treebuilder='lxml',
+            namespaceHTMLElements=False)
+        for x in CSSSelect('.article_body.bottom')(root):
+            x.getparent().remove(x)
+        return etree.tostring(root, encoding=unicode)
 
     feeds = [(u'http://feeds.feedburner.com/americanthinker'),
              (u'http://feeds.feedburner.com/AmericanThinkerBlog')