From e25426ad5a4f67cf180e1de91bac766274055b5a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Aug 2017 00:31:35 +0530 Subject: [PATCH] Update Financial Times Fixes #1711934 [Financial times download uk/us is broken](https://bugs.launchpad.net/calibre/+bug/1711934) --- recipes/financial_times_uk.recipe | 52 +++++++++++-------------------- recipes/financial_times_us.recipe | 46 +++++++++++---------------- 2 files changed, 36 insertions(+), 62 deletions(-) diff --git a/recipes/financial_times_uk.recipe b/recipes/financial_times_uk.recipe index 490624bfd2..b3cae18735 100644 --- a/recipes/financial_times_uk.recipe +++ b/recipes/financial_times_uk.recipe @@ -5,11 +5,10 @@ __license__ = 'GPL v3' __copyright__ = '2010-2017, Darko Miletic ' ''' -www.ft.com/uk-edition +www.ft.com/todaysnewspaper/uk ''' from calibre.web.feeds.news import BasicNewsRecipe -from collections import OrderedDict from urllib import unquote @@ -34,22 +33,26 @@ class FinancialTimes(BasicNewsRecipe): encoding = 'utf8' publication_type = 'newspaper' handle_gzip = True + compress_news_images = True + scale_news_images_to_device = True + ignore_duplicate_articles = {'url'} LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F' LOGOUT = 'https://myaccount.ft.com/logout' - INDEX = 'http://www.ft.com/uk-edition' - PREFIX = 'http://www.ft.com' + INDEX = 'https://www.ft.com/todaysnewspaper/uk' + PREFIX = 'https://www.ft.com' keep_only_tags = [ - classes( - 'article__header--wrapper article__time-byline article__body' - 'n-content-image barrier-grid__heading article__time-byline topper__headline topper__standfirst') + classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body') ] remove_tags = [ - classes('n-content-related-box tour-tip') + classes('n-content-related-box tour-tip n-content-recommended n-content-video') ] - remove_attributes = ['width', 'height', 'lang', 'style'] + extra_css = ''' + body {font-family: Georgia,serif;} + img {display:block;} + ''' def get_browser(self): br = BasicNewsRecipe.get_browser(self) @@ -81,33 +84,14 @@ class FinancialTimes(BasicNewsRecipe): return cover def parse_index(self): - feeds = OrderedDict() + articles = [] soup = self.index_to_soup(self.INDEX) - # dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div')) - # self.timefmt = ' [%s]'%dates - section_title = 'Untitled' - for column in soup.findAll('div', attrs={'class': 'feedBoxes clearfix'}): - for section in column.findAll('div', attrs={'class': 'feedBox'}): - sectiontitle = self.tag_to_string(section.find('h4')) - if '...' not in sectiontitle: - section_title = sectiontitle - self.log('Found section:', sectiontitle) - for article in section.ul.findAll('li'): - articles = [] - title = self.tag_to_string(article.a) - url = article.a['href'] - articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - self.log('\tFound article:', title) - - if articles: - if section_title not in feeds: - feeds[section_title] = [] - feeds[section_title] += articles - - ans = [(key, val) for key, val in feeds.iteritems()] - return ans + for article in soup.findAll('a', href=True, attrs={'data-trackable':'main-link'}): + url = self.PREFIX + article['href'] + title = self.tag_to_string(article) + articles.append({'title': title, 'url': url, 'description': '', 'date': ''}) + return [("Articles", articles)] def preprocess_html(self, soup): for img in soup.findAll('img', srcset=True): diff --git a/recipes/financial_times_us.recipe b/recipes/financial_times_us.recipe index 6b6c8338a9..6642861e55 100644 --- a/recipes/financial_times_us.recipe +++ b/recipes/financial_times_us.recipe @@ -5,11 +5,10 @@ __license__ = 'GPL v3' __copyright__ = '2010-2017, Darko Miletic ' ''' -www.ft.com/international-edition +www.ft.com/todaysnewspaper/international ''' from calibre.web.feeds.news import BasicNewsRecipe -from collections import OrderedDict from urllib import unquote @@ -34,20 +33,26 @@ class FinancialTimes(BasicNewsRecipe): encoding = 'utf8' publication_type = 'newspaper' handle_gzip = True + compress_news_images = True + scale_news_images_to_device = True + ignore_duplicate_articles = {'url'} LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F' LOGOUT = 'https://myaccount.ft.com/logout' - INDEX = 'http://www.ft.com/international-edition' - PREFIX = 'http://www.ft.com' + INDEX = 'https://www.ft.com/todaysnewspaper/international' + PREFIX = 'https://www.ft.com' keep_only_tags = [ - classes('article__header--wrapper article__time-byline article__body n-content-image barrier-grid__heading') + classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body') ] remove_tags = [ - classes('n-content-related-box tour-tip') + classes('n-content-related-box tour-tip n-content-recommended n-content-video') ] - remove_attributes = ['width', 'height', 'lang', 'style'] + extra_css = ''' + body {font-family: Georgia,serif;} + img {display:block;} + ''' def get_browser(self): br = BasicNewsRecipe.get_browser(self) @@ -63,29 +68,14 @@ class FinancialTimes(BasicNewsRecipe): return br def parse_index(self): - feeds = OrderedDict() + articles = [] soup = self.index_to_soup(self.INDEX) - section_title = 'Untitled' - for column in soup.findAll('div', attrs={'class': 'feedBoxes clearfix'}): - for section in column.findAll('div', attrs={'class': 'feedBox'}): - sectiontitle = self.tag_to_string(section.find('h4')) - if '...' not in sectiontitle: - section_title = sectiontitle - for article in section.ul.findAll('li'): - articles = [] - title = self.tag_to_string(article.a) - url = article.a['href'] - articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - - if articles: - if section_title not in feeds: - feeds[section_title] = [] - feeds[section_title] += articles - - ans = [(key, val) for key, val in feeds.iteritems()] - return ans + for article in soup.findAll('a', href=True, attrs={'data-trackable':'main-link'}): + url = self.PREFIX + article['href'] + title = self.tag_to_string(article) + articles.append({'title': title, 'url': url, 'description': '', 'date': ''}) + return [("Articles", articles)] def preprocess_html(self, soup): for img in soup.findAll('img', srcset=True):