diff --git a/recipes/financial_times_uk.recipe b/recipes/financial_times_uk.recipe
index 16295905bc..4e5b522ae9 100644
--- a/recipes/financial_times_uk.recipe
+++ b/recipes/financial_times_uk.recipe
@@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010-2011, Darko Miletic '
+__copyright__ = '2010-2012, Darko Miletic '
 '''
 www.ft.com/uk-edition
 '''
@@ -51,10 +51,14 @@ class FinancialTimes(BasicNewsRecipe):
         return br

     keep_only_tags = [
-                        dict(name='div', attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
-                       ,dict(name='div', attrs={'class':'standfirst'})
-                       ,dict(name='div', attrs={'id' :'storyContent'})
-                       ,dict(name='div', attrs={'class':['ft-story-body','index-detail']})
+                        dict(name='div' , attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
+                       ,dict(name='div' , attrs={'class':'standfirst'})
+                       ,dict(name='div' , attrs={'id' :'storyContent'})
+                       ,dict(name='div' , attrs={'class':['ft-story-body','index-detail']})
+                       ,dict(name='h2'  , attrs={'class':'entry-title'} )
+                       ,dict(name='span', attrs={'class':lambda x: x and 'posted-on' in x.split()} )
+                       ,dict(name='span', attrs={'class':'author_byline'} )
+                       ,dict(name='div' , attrs={'class':'entry-content'} )
                      ]
     remove_tags = [
                       dict(name='div', attrs={'id':'floating-con'})
@@ -83,10 +87,9 @@ class FinancialTimes(BasicNewsRecipe):
                 if self.test and count > 2:
                    return articles
                 rawlink = item['href']
-                if rawlink.startswith('http://'):
-                   url = rawlink
-                else:
-                   url = self.PREFIX + rawlink
+                url = rawlink
+                if not rawlink.startswith('http://'):
+                   url = self.PREFIX + rawlink
                 urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
                 title = self.tag_to_string(item)
                 date = strftime(self.timefmt)
@@ -106,20 +109,20 @@ class FinancialTimes(BasicNewsRecipe):
        wide = soup.find('div',attrs={'class':'wide'})
        if not wide:
           return feeds
-       strest = wide.findAll('h3', attrs={'class':'section'})
-       if not strest:
+       allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()})
+       if not allsections:
           return feeds
-       st = wide.findAll('h4',attrs={'class':'section-no-arrow'})
-       if st:
-          st.extend(strest)
        count = 0
-       for item in st:
+       for item in allsections:
            count = count + 1
            if self.test and count > 2:
               return feeds
-           ftitle = self.tag_to_string(item)
+           fitem = item.h3
+           if not fitem:
+              fitem = item.h4
+           ftitle = self.tag_to_string(fitem)
            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
-           feedarts = self.get_artlinks(item.parent.ul)
+           feedarts = self.get_artlinks(item.ul)
            feeds.append((ftitle,feedarts))
        return feeds

@@ -166,7 +169,8 @@ class FinancialTimes(BasicNewsRecipe):
                 except:
                     print "Retrying download..."
                 count += 1
-        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
-        self.temp_files[-1].write(html)
-        self.temp_files[-1].close()
-        return self.temp_files[-1].name
+        tfile = PersistentTemporaryFile('_fa.html')
+        tfile.write(html)
+        tfile.close()
+        self.temp_files.append(tfile)
+        return tfile.name