def preprocess_html(self, soup):
    """Reduce an article page to its story, merging in a second page if any.

    The story is the ``div`` with class ``triline``.  When the page also
    has a ``p`` with class ``pagenav`` that contains a link, the link target
    is fetched with ``self.index_to_soup`` and its ``div`` with class
    ``story-content`` is placed after the first page's story, so multipage
    articles end up in a single document.

    :param soup: parsed HTML of the downloaded article page.
    :return: a fresh skeleton document whose body holds only the story, or
        ``soup`` itself, untouched, when no story container is present.
    """
    story = soup.find(name='div', attrs={'class': 'triline'})
    if story is None:
        # Without a story there is nothing to extract, and inserting None
        # into the skeleton below would raise; hand the page back as-is.
        return soup

    page2_link = soup.find('p', 'pagenav')
    atag = page2_link.find('a', href=True) if page2_link else None
    if atag:
        page2_url = atag['href']
        if page2_url.startswith('story'):
            # Bare "story..." hrefs are relative to the todays-paper folder.
            page2_url = ('http://www.nationalpost.com/todays-paper/'
                         + page2_url)
        elif page2_url.startswith('/todays-paper/story.html'):
            # Root-relative href already begins with '/', so the host is
            # prepended without a trailing slash to avoid "com//todays-...".
            page2_url = 'http://www.nationalpost.com' + page2_url
        # Any other href is fetched exactly as found.

        page2_soup = self.index_to_soup(page2_url)
        page2_content = None
        if page2_soup:
            page2_content = page2_soup.find('div', 'story-content')
        if page2_content:
            # Fresh container holding page one followed by page two.
            full_story = BeautifulSoup('<div></div>')
            full_story.insert(0, story)
            full_story.insert(1, page2_content)
            story = full_story

    # Minimal skeleton document; it must contain a <body> because the story
    # is inserted into the element looked up on the next line.
    soup = BeautifulSoup(
        '<html><head><title>t</title></head><body></body></html>')
    body = soup.find(name='body')
    body.insert(0, story)
    return soup