From c98eb806f568b7a78758f728f66ab98990236c31 Mon Sep 17 00:00:00 2001 From: Sergiy Kibrik Date: Tue, 11 Nov 2014 10:10:24 +0200 Subject: [PATCH] recipes: lwn_weekly: improve table handling The site uses table layout a lot, both for page formatting and within the article's text, yet we clean up all tags before & after the article text, and remove what's left of the tables in between, also removing the useful tables often embedded within articles. A better way seems to be to keep only the parts we are actually interested in: PageHeadline (the article's title) and ArticleText, and not to linearize tables within the ArticleText tag, thus preserving useful tables. Signed-off-by: Sergiy Kibrik --- recipes/lwn_weekly.recipe | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/recipes/lwn_weekly.recipe b/recipes/lwn_weekly.recipe index 407a0fea94..e1f0ecdd8b 100644 --- a/recipes/lwn_weekly.recipe +++ b/recipes/lwn_weekly.recipe @@ -30,8 +30,7 @@ class WeeklyLWN(BasicNewsRecipe): # masthead_url = 'http://lwn.net/images/lcorner.png' publication_type = 'magazine' - remove_tags_before = dict(attrs={'class':'PageHeadline'}) - remove_tags_after = dict(attrs={'class':'ArticleText'}) + keep_only_tags = [dict(attrs={'class':['PageHeadline','ArticleText']})] remove_tags = [dict(name=['h2', 'form'])] preprocess_regexps = [ @@ -40,7 +39,6 @@ class WeeklyLWN(BasicNewsRecipe): ] conversion_options = { - 'linearize_tables' : True, 'no_inline_navbars': True, }