commit c6dbbf488f (parent 9f1ec2d86c)

@@ -5,7 +5,6 @@ www.ft.com/intl/uk-edition
 '''
 
 from calibre.ptempfile import PersistentTemporaryFile
-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from collections import OrderedDict
 
@@ -31,14 +30,6 @@ class FinancialTimes(BasicNewsRecipe):
     INDEX = 'http://www.ft.com/intl/uk-edition'
     PREFIX = 'http://www.ft.com'
 
-    conversion_options = {
-                          'comment' : description
-                        , 'tags' : category
-                        , 'publisher' : publisher
-                        , 'language' : language
-                        , 'linearize_tables' : True
-                        }
-
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         br.open(self.INDEX)
@@ -54,11 +45,11 @@ class FinancialTimes(BasicNewsRecipe):
                     dict(name='div' , attrs={'class':['master-row editorialSection']})
                    ]
     remove_tags = [
-                   dict(name='div', attrs={'id':'floating-con'})
-                   ,dict(name=['meta','iframe','base','object','embed','link'])
-                   ,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image','promobox']})
-                   ,dict(name='div', attrs={'class':lambda x: x and 'insideArticleRelatedTopics' in x.split()})
+                   dict(name='div', attrs={'id':'floating-con'}),
+                   dict(name=['meta','iframe','base','object','embed','link']),
+                   dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image','promobox']}),
+                   dict(name='div', attrs={'class':lambda x: x and 'insideArticleRelatedTopics' in x.split()})
                   ]
     remove_attributes = ['width','height','lang']
 
     extra_css = """
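
The last remove_tags entry above filters on a callable rather than a fixed class name. A minimal standalone sketch of what that matcher accepts, assuming the class attribute is handed to it as a single space-separated string (BeautifulSoup 3 behaviour, which recipes of this vintage target); the sample strings are illustrative only:

    # Matches only when the class name appears as a whole word, not as a substring,
    # and tolerates tags that have no class attribute at all (x is None).
    matcher = lambda x: x and 'insideArticleRelatedTopics' in x.split()

    print(matcher('insideArticleRelatedTopics clearfix'))  # True  -> tag removed
    print(matcher('insideArticleRelatedTopicsTeaser'))     # False -> substring only
    print(matcher(None))                                   # falsy -> tag kept
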
@@ -73,36 +64,11 @@ class FinancialTimes(BasicNewsRecipe):
                .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}
                """
 
-    def get_artlinks(self, elem):
-        articles = []
-        count = 0
-        for item in elem.findAll('a',href=True):
-            count = count + 1
-            if self.test and count > 2:
-                return articles
-            rawlink = item['href']
-            url = rawlink
-            if not rawlink.startswith('http://'):
-                url = self.PREFIX + rawlink
-            try:
-                urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
-            except:
-                continue
-            title = self.tag_to_string(item)
-            date = strftime(self.timefmt)
-            articles.append({
-                              'title' :title
-                             ,'date' :date
-                             ,'url' :urlverified
-                             ,'description':''
-                            })
-        return articles
-
     def parse_index(self):
         feeds = OrderedDict()
         soup = self.index_to_soup(self.INDEX)
-        #dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
-        #self.timefmt = ' [%s]'%dates
+        # dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
+        # self.timefmt = ' [%s]'%dates
         section_title = 'Untitled'
 
         for column in soup.findAll('div', attrs={'class':'feedBoxes clearfix'}):
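
This hunk shows parse_index accumulating sections into an OrderedDict keyed by section title. The tail of the method is outside the hunk, but calibre's BasicNewsRecipe expects parse_index to return a list of (section title, article list) tuples, so presumably the mapping is flattened roughly as below; this is a hedged sketch, not the file's actual code, and the `ans` name is made up:

    # Hypothetical tail of parse_index(): turn the OrderedDict into the
    # [(section_title, [article_dict, ...]), ...] shape BasicNewsRecipe expects,
    # preserving the order in which sections were first seen on the index page.
    ans = [(section, articles) for section, articles in feeds.items()]
    return ans
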
@@ -110,11 +76,13 @@ class FinancialTimes(BasicNewsRecipe):
                 sectiontitle=self.tag_to_string(section.find('h4'))
                 if '...' not in sectiontitle:
                     section_title=sectiontitle
+                self.log('Found section:', sectiontitle)
                 for article in section.ul.findAll('li'):
                     articles = []
                     title=self.tag_to_string(article.a)
                     url=article.a['href']
                     articles.append({'title':title, 'url':url, 'description':'', 'date':''})
+                    self.log('\tFound article:', title)
 
                     if articles:
                         if section_title not in feeds:
@@ -134,6 +102,9 @@ class FinancialTimes(BasicNewsRecipe):
             it.attrs = []
         for item in soup.findAll(style=True):
             del item['style']
+        for img in soup.findAll('img', src=True):
+            if 'track/track.js' in img['src']:
+                img.extract()
         for item in soup.findAll('a'):
             limg = item.find('img')
             if item.string is not None:
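
The three added lines drop FT's tracking beacons from the parsed page before images are processed. A self-contained sketch of the same idea using BeautifulSoup directly (in the recipe, `soup` arrives already parsed; the bs4 import, sample HTML, and beacon URL here are made up for illustration):

    from bs4 import BeautifulSoup

    html = ('<p>Story text'
            '<img src="http://media.ft.com/track/track.js?page=1">'
            '<img src="/cms/s/0/photo.jpg"></p>')
    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img', src=True):
        if 'track/track.js' in img['src']:
            img.extract()   # pull the tracking beacon out of the tree
    print(soup)             # the real photo <img> survives, the beacon is gone
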
@@ -146,9 +117,6 @@ class FinancialTimes(BasicNewsRecipe):
             else:
                 str = self.tag_to_string(item)
                 item.replaceWith(str)
-        for item in soup.findAll('img'):
-            if not item.has_key('alt'):
-                item['alt'] = 'image'
         return soup
 
     def get_obfuscated_article(self, url):