Fix #1054098 (Financial times UK issues with duplicate articles - updated recipe)

This commit is contained in:
Kovid Goyal 2012-09-21 20:49:11 +05:30
parent 4ad6a16eaa
commit 5c425b2335

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.ft.com/uk-edition
'''
@ -51,10 +51,15 @@ class FinancialTimes(BasicNewsRecipe):
return br
keep_only_tags = [
dict(name='div', attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
,dict(name='div', attrs={'class':'standfirst'})
,dict(name='div', attrs={'id' :'storyContent'})
,dict(name='div', attrs={'class':['ft-story-body','index-detail']})
dict(name='div' , attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
,dict(name='div' , attrs={'class':'standfirst'})
,dict(name='div' , attrs={'id' :'storyContent'})
,dict(name='div' , attrs={'class':['ft-story-body','index-detail']})
,dict(name='div' , attrs={'class':['ft-story-body','index-detail']})
,dict(name='h2' , attrs={'class':'entry-title'} )
,dict(name='span', attrs={'class':lambda x: x and 'posted-on' in x.split()} )
,dict(name='span', attrs={'class':'author_byline'} )
,dict(name='div' , attrs={'class':'entry-content'} )
]
remove_tags = [
dict(name='div', attrs={'id':'floating-con'})
@ -83,9 +88,8 @@ class FinancialTimes(BasicNewsRecipe):
if self.test and count > 2:
return articles
rawlink = item['href']
if rawlink.startswith('http://'):
url = rawlink
else:
if not rawlink.startswith('http://'):
url = self.PREFIX + rawlink
urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
title = self.tag_to_string(item)
@ -106,20 +110,20 @@ class FinancialTimes(BasicNewsRecipe):
wide = soup.find('div',attrs={'class':'wide'})
if not wide:
return feeds
strest = wide.findAll('h3', attrs={'class':'section'})
if not strest:
allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()})
if not allsections:
return feeds
st = wide.findAll('h4',attrs={'class':'section-no-arrow'})
if st:
st.extend(strest)
count = 0
for item in st:
for item in allsections:
count = count + 1
if self.test and count > 2:
return feeds
ftitle = self.tag_to_string(item)
fitem = item.h3
if not fitem:
fitem = item.h4
ftitle = self.tag_to_string(fitem)
self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
feedarts = self.get_artlinks(item.parent.ul)
feedarts = self.get_artlinks(item.ul)
feeds.append((ftitle,feedarts))
return feeds
@ -166,7 +170,8 @@ class FinancialTimes(BasicNewsRecipe):
except:
print "Retrying download..."
count += 1
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
tfile = PersistentTemporaryFile('_fa.html')
tfile.write(html)
tfile.close()
self.temp_files.append(tfile)
return tfile.name