This commit is contained in:
Kovid Goyal 2015-06-29 11:02:36 +05:30
parent 9f1ec2d86c
commit c6dbbf488f

View File

@ -5,7 +5,6 @@ www.ft.com/intl/uk-edition
''' '''
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from collections import OrderedDict from collections import OrderedDict
@ -31,14 +30,6 @@ class FinancialTimes(BasicNewsRecipe):
INDEX = 'http://www.ft.com/intl/uk-edition' INDEX = 'http://www.ft.com/intl/uk-edition'
PREFIX = 'http://www.ft.com' PREFIX = 'http://www.ft.com'
# Conversion settings for this recipe: the metadata entries reuse the
# description/category/publisher/language names already in scope, and
# table linearization is switched on.
conversion_options = {
    'comment': description,
    'tags': category,
    'publisher': publisher,
    'language': language,
    'linearize_tables': True,
}
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open(self.INDEX) br.open(self.INDEX)
@ -54,11 +45,11 @@ class FinancialTimes(BasicNewsRecipe):
dict(name='div' , attrs={'class':['master-row editorialSection']}) dict(name='div' , attrs={'class':['master-row editorialSection']})
] ]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'id':'floating-con'}) dict(name='div', attrs={'id':'floating-con'}),
,dict(name=['meta','iframe','base','object','embed','link']) dict(name=['meta','iframe','base','object','embed','link']),
,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image','promobox']}) dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image','promobox']}),
,dict(name='div', attrs={'class':lambda x: x and 'insideArticleRelatedTopics' in x.split()}) dict(name='div', attrs={'class':lambda x: x and 'insideArticleRelatedTopics' in x.split()})
] ]
remove_attributes = ['width','height','lang'] remove_attributes = ['width','height','lang']
extra_css = """ extra_css = """
@ -73,36 +64,11 @@ class FinancialTimes(BasicNewsRecipe):
.byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif} .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}
""" """
def get_artlinks(self, elem):
    """Collect article entries from every hyperlink found under *elem*.

    Each ``<a href=...>`` is turned into an absolute URL (links that are not
    already absolute get ``self.PREFIX`` prepended), opened once so that
    redirects are resolved, and recorded with its link text as the title.
    Links that cannot be opened are skipped. When ``self.test`` is set, only
    the first two links are examined.

    Returns a list of dicts with the keys ``title``, ``date``, ``url`` and
    ``description``.
    """
    articles = []
    for count, item in enumerate(elem.findAll('a', href=True), 1):
        if self.test and count > 2:
            break
        rawlink = item['href']
        # Treat https links as absolute too; prefixing them would produce a
        # malformed URL such as "http://www.ft.comhttps://...".
        if rawlink.startswith(('http://', 'https://')):
            url = rawlink
        else:
            url = self.PREFIX + rawlink
        try:
            # Resolve redirects so the stored URL is the final location.
            urlverified = self.browser.open_novisit(url).geturl()
        except Exception:
            # Best effort: a link that fails to open is dropped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
        articles.append({
            'title': self.tag_to_string(item),
            'date': strftime(self.timefmt),
            'url': urlverified,
            'description': '',
        })
    return articles
def parse_index(self): def parse_index(self):
feeds = OrderedDict() feeds = OrderedDict()
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
#dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div')) # dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
#self.timefmt = ' [%s]'%dates # self.timefmt = ' [%s]'%dates
section_title = 'Untitled' section_title = 'Untitled'
for column in soup.findAll('div', attrs={'class':'feedBoxes clearfix'}): for column in soup.findAll('div', attrs={'class':'feedBoxes clearfix'}):
@ -110,11 +76,13 @@ class FinancialTimes(BasicNewsRecipe):
sectiontitle=self.tag_to_string(section.find('h4')) sectiontitle=self.tag_to_string(section.find('h4'))
if '...' not in sectiontitle: if '...' not in sectiontitle:
section_title=sectiontitle section_title=sectiontitle
self.log('Found section:', sectiontitle)
for article in section.ul.findAll('li'): for article in section.ul.findAll('li'):
articles = [] articles = []
title=self.tag_to_string(article.a) title=self.tag_to_string(article.a)
url=article.a['href'] url=article.a['href']
articles.append({'title':title, 'url':url, 'description':'', 'date':''}) articles.append({'title':title, 'url':url, 'description':'', 'date':''})
self.log('\tFound article:', title)
if articles: if articles:
if section_title not in feeds: if section_title not in feeds:
@ -134,6 +102,9 @@ class FinancialTimes(BasicNewsRecipe):
it.attrs = [] it.attrs = []
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
for img in soup.findAll('img', src=True):
if 'track/track.js' in img['src']:
img.extract()
for item in soup.findAll('a'): for item in soup.findAll('a'):
limg = item.find('img') limg = item.find('img')
if item.string is not None: if item.string is not None:
@ -146,9 +117,6 @@ class FinancialTimes(BasicNewsRecipe):
else: else:
str = self.tag_to_string(item) str = self.tag_to_string(item)
item.replaceWith(str) item.replaceWith(str)
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup return soup
def get_obfuscated_article(self, url): def get_obfuscated_article(self, url):