Fix Economist not downloading all articles from Technology Quarterly

Kovid Goyal 2011-09-03 08:49:59 -06:00
parent 2425e2c09b
commit 384e24a06a
2 changed files with 34 additions and 52 deletions
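
The bug: the old code walked the h5 subsection headings and, for each heading, took only the single findNextSibling() node with class 'article'; a fallback pass caught sections with no usable headings, but in sections such as Technology Quarterly, where several article nodes sit under one h5, every article after the first per heading was silently dropped. The rewrite inverts the traversal: it iterates every article node and looks backwards with findPreviousSibling('h5') for the nearest heading, carrying it along as a 'subsection: title' prefix, so no article can be skipped.

A minimal standalone sketch of that backwards-looking traversal, on a hypothetical HTML fragment modelled on the section pages; bs4 is assumed here, while the recipe itself uses the older BeautifulSoup 3 spellings (findAll, findPreviousSibling):

from bs4 import BeautifulSoup

# Hypothetical fragment: two articles under one h5, mimicking the
# Technology Quarterly layout that broke the old h5-driven loop.
html = '''
<div class="section">
  <h5>Monitor</h5>
  <div class="article"><a href="/node/1">First article</a></div>
  <div class="article"><a href="/node/2">Second article</a></div>
  <h5>Difference engine</h5>
  <div class="article"><a href="/node/3">Third article</a></div>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
section = soup.find('div', attrs={'class': 'section'})

subsection = ''
for node in section.find_all(attrs={'class': 'article'}):
    # The nearest preceding h5 names the subsection. Several article
    # nodes may share it; that is exactly the case a single
    # findNextSibling() call per h5 cannot cover.
    subsec = node.find_previous_sibling('h5')
    if subsec is not None:
        subsection = subsec.get_text(strip=True)
    prefix = (subsection + ': ') if subsection else ''
    a = node.find('a', href=True)
    if a is not None:
        print(prefix + a.get_text(strip=True))

Running this prints all three articles ("Monitor: First article", "Monitor: Second article", "Difference engine: Third article"); the old per-h5 loop would have yielded only one article per heading.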

View File

@@ -77,32 +77,23 @@ class Economist(BasicNewsRecipe):
                 continue
             self.log('Found section: %s'%section_title)
             articles = []
-            for h5 in section.findAll('h5'):
-                article_title = self.tag_to_string(h5).strip()
-                if not article_title:
-                    continue
-                data = h5.findNextSibling(attrs={'class':'article'})
-                if data is None: continue
-                a = data.find('a', href=True)
-                if a is None: continue
-                url = a['href']
-                if url.startswith('/'): url = 'http://www.economist.com'+url
-                url += '/print'
-                article_title += ': %s'%self.tag_to_string(a).strip()
-                articles.append({'title':article_title, 'url':url,
-                    'description':'', 'date':''})
-            if not articles:
-                # We have last or first section
-                for art in section.findAll(attrs={'class':'article'}):
-                    a = art.find('a', href=True)
-                    if a is not None:
-                        url = a['href']
-                        if url.startswith('/'): url = 'http://www.economist.com'+url
-                        url += '/print'
-                        title = self.tag_to_string(a)
-                        if title:
-                            articles.append({'title':title, 'url':url,
-                                'description':'', 'date':''})
+            subsection = ''
+            for node in section.findAll(attrs={'class':'article'}):
+                subsec = node.findPreviousSibling('h5')
+                if subsec is not None:
+                    subsection = self.tag_to_string(subsec)
+                prefix = (subsection+': ') if subsection else ''
+                a = node.find('a', href=True)
+                if a is not None:
+                    url = a['href']
+                    if url.startswith('/'): url = 'http://www.economist.com'+url
+                    url += '/print'
+                    title = self.tag_to_string(a)
+                    if title:
+                        title = prefix + title
+                        self.log('\tFound article:', title)
+                        articles.append({'title':title, 'url':url,
+                            'description':'', 'date':''})
             if articles:
                 if section_title not in feeds:

View File

@@ -69,32 +69,23 @@ class Economist(BasicNewsRecipe):
                 continue
             self.log('Found section: %s'%section_title)
             articles = []
-            for h5 in section.findAll('h5'):
-                article_title = self.tag_to_string(h5).strip()
-                if not article_title:
-                    continue
-                data = h5.findNextSibling(attrs={'class':'article'})
-                if data is None: continue
-                a = data.find('a', href=True)
-                if a is None: continue
-                url = a['href']
-                if url.startswith('/'): url = 'http://www.economist.com'+url
-                url += '/print'
-                article_title += ': %s'%self.tag_to_string(a).strip()
-                articles.append({'title':article_title, 'url':url,
-                    'description':'', 'date':''})
-            if not articles:
-                # We have last or first section
-                for art in section.findAll(attrs={'class':'article'}):
-                    a = art.find('a', href=True)
-                    if a is not None:
-                        url = a['href']
-                        if url.startswith('/'): url = 'http://www.economist.com'+url
-                        url += '/print'
-                        title = self.tag_to_string(a)
-                        if title:
-                            articles.append({'title':title, 'url':url,
-                                'description':'', 'date':''})
+            subsection = ''
+            for node in section.findAll(attrs={'class':'article'}):
+                subsec = node.findPreviousSibling('h5')
+                if subsec is not None:
+                    subsection = self.tag_to_string(subsec)
+                prefix = (subsection+': ') if subsection else ''
+                a = node.find('a', href=True)
+                if a is not None:
+                    url = a['href']
+                    if url.startswith('/'): url = 'http://www.economist.com'+url
+                    url += '/print'
+                    title = self.tag_to_string(a)
+                    if title:
+                        title = prefix + title
+                        self.log('\tFound article:', title)
+                        articles.append({'title':title, 'url':url,
+                            'description':'', 'date':''})
             if articles:
                 if section_title not in feeds: