From 72ac7359286c540f90003ad1d3ac3966b41c697b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 20:23:26 -0600 Subject: [PATCH] News download: Use the algorithms from Readability to automatically clean up downloaded HTML. You can turn this on in your own recipes by adding auto_cleanup=True to the recipe. It is turned on by default for basic recipes created via the GUI. --- src/calibre/gui2/dialogs/user_profiles.py | 1 + src/calibre/web/feeds/news.py | 47 +++++++++++++++++++---- src/calibre/web/fetch/simple.py | 10 +++-- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/src/calibre/gui2/dialogs/user_profiles.py b/src/calibre/gui2/dialogs/user_profiles.py index 92d20a6f03..b81b5271bc 100644 --- a/src/calibre/gui2/dialogs/user_profiles.py +++ b/src/calibre/gui2/dialogs/user_profiles.py @@ -219,6 +219,7 @@ class %(classname)s(%(base_class)s): title = %(title)s oldest_article = %(oldest_article)d max_articles_per_feed = %(max_articles)d + auto_cleanup = True feeds = %(feeds)s '''%dict(classname=classname, title=repr(title), diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index d7d9b0643a..06bde76c6a 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -138,6 +138,12 @@ class BasicNewsRecipe(Recipe): #: Reverse the order of articles in each feed reverse_article_order = False + #: Automatically extract all the text from downloaded article pages. Uses + #: the algorithms from the readability project. Setting this to True means + #: that you do not have to worry about cleaning up the downloaded HTML + #: manually (though manual cleanup will always be superior). + auto_cleanup = False + #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files #: It will be inserted into `