Update Financial Times

Fixes #1711934 [Financial times download uk/us is broken](https://bugs.launchpad.net/calibre/+bug/1711934)
This commit is contained in:
Kovid Goyal 2017-08-21 00:31:35 +05:30
parent 331190c369
commit e25426ad5a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 36 additions and 62 deletions

View File

@ -5,11 +5,10 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
'''
www.ft.com/uk-edition
www.ft.com/todaysnewspaper/uk
'''
from calibre.web.feeds.news import BasicNewsRecipe
from collections import OrderedDict
from urllib import unquote
@ -34,22 +33,26 @@ class FinancialTimes(BasicNewsRecipe):
encoding = 'utf8'
publication_type = 'newspaper'
handle_gzip = True
compress_news_images = True
scale_news_images_to_device = True
ignore_duplicate_articles = {'url'}
LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F'
LOGOUT = 'https://myaccount.ft.com/logout'
INDEX = 'http://www.ft.com/uk-edition'
PREFIX = 'http://www.ft.com'
INDEX = 'https://www.ft.com/todaysnewspaper/uk'
PREFIX = 'https://www.ft.com'
keep_only_tags = [
classes(
'article__header--wrapper article__time-byline article__body'
'n-content-image barrier-grid__heading article__time-byline topper__headline topper__standfirst')
classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body')
]
remove_tags = [
classes('n-content-related-box tour-tip')
classes('n-content-related-box tour-tip n-content-recommended n-content-video')
]
remove_attributes = ['width', 'height', 'lang', 'style']
extra_css = '''
body {font-family: Georgia,serif;}
img {display:block;}
'''
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
@ -81,33 +84,14 @@ class FinancialTimes(BasicNewsRecipe):
return cover
def parse_index(self):
feeds = OrderedDict()
articles = []
soup = self.index_to_soup(self.INDEX)
# dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
# self.timefmt = ' [%s]'%dates
section_title = 'Untitled'
for column in soup.findAll('div', attrs={'class': 'feedBoxes clearfix'}):
for section in column.findAll('div', attrs={'class': 'feedBox'}):
sectiontitle = self.tag_to_string(section.find('h4'))
if '...' not in sectiontitle:
section_title = sectiontitle
self.log('Found section:', sectiontitle)
for article in section.ul.findAll('li'):
articles = []
title = self.tag_to_string(article.a)
url = article.a['href']
articles.append(
{'title': title, 'url': url, 'description': '', 'date': ''})
self.log('\tFound article:', title)
if articles:
if section_title not in feeds:
feeds[section_title] = []
feeds[section_title] += articles
ans = [(key, val) for key, val in feeds.iteritems()]
return ans
for article in soup.findAll('a', href=True, attrs={'data-trackable':'main-link'}):
url = self.PREFIX + article['href']
title = self.tag_to_string(article)
articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
return [("Articles", articles)]
def preprocess_html(self, soup):
for img in soup.findAll('img', srcset=True):

View File

@ -5,11 +5,10 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
'''
www.ft.com/international-edition
www.ft.com/todaysnewspaper/international
'''
from calibre.web.feeds.news import BasicNewsRecipe
from collections import OrderedDict
from urllib import unquote
@ -34,20 +33,26 @@ class FinancialTimes(BasicNewsRecipe):
encoding = 'utf8'
publication_type = 'newspaper'
handle_gzip = True
compress_news_images = True
scale_news_images_to_device = True
ignore_duplicate_articles = {'url'}
LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F'
LOGOUT = 'https://myaccount.ft.com/logout'
INDEX = 'http://www.ft.com/international-edition'
PREFIX = 'http://www.ft.com'
INDEX = 'https://www.ft.com/todaysnewspaper/international'
PREFIX = 'https://www.ft.com'
keep_only_tags = [
classes('article__header--wrapper article__time-byline article__body n-content-image barrier-grid__heading')
classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body')
]
remove_tags = [
classes('n-content-related-box tour-tip')
classes('n-content-related-box tour-tip n-content-recommended n-content-video')
]
remove_attributes = ['width', 'height', 'lang', 'style']
extra_css = '''
body {font-family: Georgia,serif;}
img {display:block;}
'''
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
@ -63,29 +68,14 @@ class FinancialTimes(BasicNewsRecipe):
return br
def parse_index(self):
feeds = OrderedDict()
articles = []
soup = self.index_to_soup(self.INDEX)
section_title = 'Untitled'
for column in soup.findAll('div', attrs={'class': 'feedBoxes clearfix'}):
for section in column.findAll('div', attrs={'class': 'feedBox'}):
sectiontitle = self.tag_to_string(section.find('h4'))
if '...' not in sectiontitle:
section_title = sectiontitle
for article in section.ul.findAll('li'):
articles = []
title = self.tag_to_string(article.a)
url = article.a['href']
articles.append(
{'title': title, 'url': url, 'description': '', 'date': ''})
if articles:
if section_title not in feeds:
feeds[section_title] = []
feeds[section_title] += articles
ans = [(key, val) for key, val in feeds.iteritems()]
return ans
for article in soup.findAll('a', href=True, attrs={'data-trackable':'main-link'}):
url = self.PREFIX + article['href']
title = self.tag_to_string(article)
articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
return [("Articles", articles)]
def preprocess_html(self, soup):
for img in soup.findAll('img', srcset=True):