From 7445edba1b6976f10604a68806cdf0c431206487 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 28 Sep 2011 09:16:25 -0600 Subject: [PATCH] Fix Chicago Tribune multipage articles --- recipes/chicago_tribune.recipe | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/recipes/chicago_tribune.recipe b/recipes/chicago_tribune.recipe index 8fa006b5da..6842dcff2a 100644 --- a/recipes/chicago_tribune.recipe +++ b/recipes/chicago_tribune.recipe @@ -8,21 +8,25 @@ from calibre.web.feeds.news import BasicNewsRecipe class ChicagoTribune(BasicNewsRecipe): title = 'Chicago Tribune' - __author__ = 'Kovid Goyal and Sujata Raman' + __author__ = 'Kovid Goyal and Sujata Raman, a.peter' description = 'Politics, local and business news from Chicago' - language = 'en' + language = 'en' + version = 2 - use_embedded_content = False - no_stylesheets = True - remove_javascript = True + use_embedded_content = False + no_stylesheets = True + remove_javascript = True + recursions = 1 keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}), dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}), ] - remove_tags_after = [ {'class':['photo_article',]} ] + remove_tags_after = [{'class':['photo_article',]}] - remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]}, - {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]}, + match_regexps = [r'page=[0-9]+'] + + remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer",'gallery-subcontent','subFooter']}, + {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent",'nextgen-share-tools','outbrainTools', 'google-ad-story-bottom']}, dict(name='font',attrs={'id':["cr-other-headlines"]})] extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} @@ -37,7 +41,7 @@ class ChicagoTribune(BasicNewsRecipe): .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;} .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;} body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' + ''' feeds = [ ('Latest news', 'http://feeds.chicagotribune.com/chicagotribune/news/'), ('Local news', 'http://feeds.chicagotribune.com/chicagotribune/news/local/'), @@ -76,8 +80,12 @@ class ChicagoTribune(BasicNewsRecipe): print article.get('feedburner_origlink', article.get('guid', article.get('link'))) return article.get('feedburner_origlink', article.get('guid', article.get('link'))) - def postprocess_html(self, soup, first_fetch): + # Remove the navigation bar. It was kept until now to be able to follow + # the links to further pages. But now we don't need them anymore. + for nav in soup.findAll(attrs={'class':['toppaginate','article-nav clearfix']}): + nav.extract() + for t in soup.findAll(['table', 'tr', 'td']): t.name = 'div' @@ -88,4 +96,3 @@ class ChicagoTribune(BasicNewsRecipe): return soup -