mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix multipage articles in The National Post
This commit is contained in:
parent
5c243cda3b
commit
c6692c859e
@ -70,11 +70,28 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
feeds.append((current_section, current_articles))
|
feeds.append((current_section, current_articles))
|
||||||
|
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def preprocess_html(self, soup):
    """Reduce a downloaded article page to its story content.

    Looks up the ``div`` with class ``triline`` in *soup*. If the page also
    has a ``p.pagenav`` element containing a link, that link is followed
    and the ``div.story-content`` of the linked page is appended after the
    first page's story.

    Returns a new minimal HTML document whose body holds the story. If
    *soup* has no ``div.triline``, *soup* is returned unchanged instead of
    failing while inserting ``None`` into the new body.
    """
    story = soup.find(name='div', attrs={'class':'triline'})
    if story is None:
        # No story container on this page: nothing to extract.
        return soup

    page2_content = None
    page2_link = soup.find('p','pagenav')
    atag = page2_link.find('a',href=True) if page2_link else None
    if atag:
        page2_url = atag['href']
        # Relative hrefs are made absolute; any other href is used as-is.
        if page2_url.startswith('story'):
            page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
        elif page2_url.startswith('/todays-paper/story.html'):
            # The href already begins with '/', so the host is added
            # without a trailing slash to avoid 'nationalpost.com//...'.
            page2_url = 'http://www.nationalpost.com'+page2_url
        # NOTE(review): index_to_soup is defined outside this block;
        # presumably it fetches and parses the URL - confirm.
        page2_soup = self.index_to_soup(page2_url)
        if page2_soup:
            page2_content = page2_soup.find('div','story-content')

    if page2_content:
        # Combine both pages: first-page story, then second-page content.
        full_story = BeautifulSoup('<div></div>')
        full_story.insert(0,story)
        full_story.insert(1,page2_content)
        story = full_story

    # Replace the original page with a bare document holding only the story.
    soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
    body = soup.find(name='body')
    body.insert(0, story)
    return soup
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user