Update Financial Times

Fixes #1629015 [Private bug](https://bugs.launchpad.net/calibre/+bug/1629015)
Kovid Goyal 2016-10-30 00:20:29 +05:30
parent fa7d6f3bdd
commit 29de031ff1
3 changed files with 65 additions and 227 deletions

recipes/financial_times.recipe

@@ -5,6 +5,13 @@ www.ft.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
+from urllib import unquote
+
+
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
 class FinancialTimes_rss(BasicNewsRecipe):
@@ -20,15 +27,13 @@ class FinancialTimes_rss(BasicNewsRecipe):
     use_embedded_content = False
     needs_subscription = True
     encoding = 'utf8'
+    ignore_duplicate_articles = {'title'}
+    remove_empty_feeds = True
     publication_type = 'newspaper'
     masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
     LOGIN = 'https://accounts.ft.com/login'
     INDEX = 'http://www.ft.com'
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
-    }

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         br.open(self.INDEX)
@@ -40,57 +45,26 @@ class FinancialTimes_rss(BasicNewsRecipe):
         br.submit()
         return br

-    keep_only_tags = [dict(name='div', attrs={'class': [
-        'fullstory fullstoryHeader', 'fullstory fullstoryBody', 'ft-story-header', 'ft-story-body', 'index-detail']})]
-    remove_tags = [
-        dict(name='div', attrs={'id': 'floating-con'}), dict(name=['meta', 'iframe', 'base', 'object', 'embed', 'link']), dict(
-            attrs={'class': ['storyTools', 'story-package', 'screen-copy', 'story-package separator', 'expandable-image']})
-    ]
-    remove_attributes = ['width', 'height', 'lang']
-    extra_css = """
-        body{font-family: Georgia,Times,"Times New Roman",serif}
-        h2{font-size:large}
-        .ft-story-header{font-size: x-small}
-        .container{font-size:x-small;}
-        h3{font-size:x-small;color:#003399;}
-        .copyright{font-size: x-small}
-        img{margin-top: 0.8em; display: block}
-        .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small}
-        .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}
-    """
+    keep_only_tags = [
+        classes('article__header--wrapper article__time-byline article__body n-content-image')
+    ]
+
+    remove_tags = [
+        classes('n-content-related-box tour-tip')
+    ]
+
+    remove_attributes = ['width', 'height', 'lang', 'style']

     feeds = [
         (u'UK', u'http://www.ft.com/rss/home/uk'),
         (u'US', u'http://www.ft.com/rss/home/us'),
         (u'Asia', u'http://www.ft.com/rss/home/asia'),
         (u'Middle East', u'http://www.ft.com/rss/home/middleeast')
     ]

     def preprocess_html(self, soup):
-        items = ['promo-box', 'promo-title',
-                 'promo-headline', 'promo-image',
-                 'promo-intro', 'promo-link', 'subhead']
-        for item in items:
-            for it in soup.findAll(item):
-                it.name = 'div'
-                it.attrs = []
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll('a'):
-            limg = item.find('img')
-            if item.string is not None:
-                str = item.string
-                item.replaceWith(str)
-            else:
-                if limg:
-                    item.name = 'div'
-                    item.attrs = []
-                else:
-                    str = self.tag_to_string(item)
-                    item.replaceWith(str)
-        for item in soup.findAll('img'):
-            if not item.has_key('alt'):  # noqa
-                item['alt'] = 'image'
+        for img in soup.findAll('img', srcset=True):
+            src = img['srcset'].split(',')[0].strip()
+            src = unquote(src.rpartition('/')[2].partition('?')[0])
+            img['src'] = src
         return soup
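Note: the classes() helper added to all three recipes replaces the old verbose dict(name=..., attrs=...) matchers. It builds an attrs dict for soup.findAll whose 'class' predicate fires when a tag shares at least one class name with the query. A standalone sketch of that behaviour (the class strings fed to pred below are invented examples, not taken from ft.com):

def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})

pred = classes('article__body n-content-image')['attrs']['class']
print(bool(pred('article__body article__body--wide')))  # True: shares 'article__body'
print(bool(pred('n-content-related-box')))              # False: no overlap
print(bool(pred(None)))                                 # False: tag has no class attribute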

recipes/financial_times_uk.recipe

@@ -4,9 +4,15 @@ __copyright__ = '2010-2015, Darko Miletic <darko.miletic at gmail.com>'
 www.ft.com/uk-edition
 '''
-from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe
 from collections import OrderedDict
+from urllib import unquote
+
+
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
 class FinancialTimes(BasicNewsRecipe):
@@ -23,13 +29,20 @@ class FinancialTimes(BasicNewsRecipe):
     needs_subscription = True
     encoding = 'utf8'
     publication_type = 'newspaper'
-    articles_are_obfuscated = True
-    temp_files = []
     masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
     LOGIN = 'https://accounts.ft.com/login?location=http%3A%2F%2Fwww.ft.com%2Fhome%2Fuk'
     INDEX = 'http://www.ft.com/uk-edition'
     PREFIX = 'http://www.ft.com'
+
+    keep_only_tags = [
+        classes('article__header--wrapper article__time-byline article__body n-content-image')
+    ]
+    remove_tags = [
+        classes('n-content-related-box tour-tip')
+    ]
+    remove_attributes = ['width', 'height', 'lang', 'style']

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         br.open(self.INDEX)
@@ -41,42 +54,6 @@ class FinancialTimes(BasicNewsRecipe):
         br.submit()
         return br

-    keep_only_tags = [
-        dict(name='p', attrs={
-            'class': lambda x: x and 'lastUpdated' in x.split()}),
-        dict(name='div', attrs={
-            'class': lambda x: x and 'syndicationHeadline' in x.split()}),
-        dict(name='p', attrs={'class': lambda x: x and 'byline' in x.split()}),
-        dict(name='div', attrs={'class': [
-            'fullstory fullstoryBody', 'fullstory fullstoryBody specialArticle', 'ft-story-header', 'ft-story-body', 'index-detail']})
-    ]
-    remove_tags = [
-        dict(name='style', attrs={'id': 'antiClickjack'}),
-        dict(name='div', attrs={'id': 'floating-con'}),
-        dict(name=['meta', 'iframe', 'base', 'object', 'embed', 'link']),
-        dict(attrs={'class': ['storyTools', 'story-package', 'screen-copy',
-                              'story-package separator', 'expandable-image', 'promobox']}),
-        dict(name='div', attrs={
-            'class': lambda x: x and 'insideArticleRelatedTopics' in x.split()}),
-        dict(name='div', attrs={
-            'class': lambda x: x and 'ft-new-story-tools-box' in x.split()}),
-        dict(name='div', attrs={
-            'class': ['railMiniVideo', 'ftbf-syndicationIndicator']})
-    ]
-    remove_attributes = ['width', 'height', 'lang']
-    extra_css = """
-        body{font-family: Georgia,Times,"Times New Roman",serif}
-        h2{font-size:large}
-        .ft-story-header{font-size: x-small}
-        .container{font-size:x-small;}
-        h3{font-size:x-small;color:#003399;}
-        .copyright{font-size: x-small}
-        img{margin-top: 0.8em; display: block}
-        .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small}
-        .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}
-    """

     def parse_index(self):
         feeds = OrderedDict()
         soup = self.index_to_soup(self.INDEX)
@@ -107,44 +84,8 @@ class FinancialTimes(BasicNewsRecipe):
         return ans

     def preprocess_html(self, soup):
-        items = ['promo-box', 'promo-title',
-                 'promo-headline', 'promo-image',
-                 'promo-intro', 'promo-link', 'subhead']
-        for item in items:
-            for it in soup.findAll(item):
-                it.name = 'div'
-                it.attrs = []
-        for item in soup.findAll(style=True):
-            del item['style']
-        for img in soup.findAll('img', src=True):
-            if 'track/track.js' in img['src']:
-                img.extract()
-        for item in soup.findAll('a'):
-            limg = item.find('img')
-            if item.string is not None:
-                str = item.string
-                item.replaceWith(str)
-            else:
-                if limg:
-                    item.name = 'div'
-                    item.attrs = []
-                else:
-                    str = self.tag_to_string(item)
-                    item.replaceWith(str)
+        for img in soup.findAll('img', srcset=True):
+            src = img['srcset'].split(',')[0].strip()
+            src = unquote(src.rpartition('/')[2].partition('?')[0])
+            img['src'] = src
         return soup
-
-    def get_obfuscated_article(self, url):
-        count = 0
-        while (count < 10):
-            try:
-                response = self.browser.open(url)
-                html = response.read()
-                count = 10
-            except:
-                print "Retrying download..."
-                count += 1
-        tfile = PersistentTemporaryFile('_fa.html')
-        tfile.write(html)
-        tfile.close()
-        self.temp_files.append(tfile)
-        return tfile.name
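Note: the rewritten preprocess_html takes the first candidate from an img tag's srcset and recovers the original image URL that FT's image proxy percent-encodes into the last path segment; cutting at '?' drops both the query string and the trailing width descriptor. A worked sketch with a made-up srcset value in that style (the exact URL shape is an assumption for illustration):

from urllib import unquote  # Python 2, matching the recipes

# Hypothetical srcset in the style of FT's image proxy.
srcset = (
    'https://www.ft.com/__origami/service/image/v2/images/raw/'
    'http%3A%2F%2Fim.ft-static.com%2Fcontent%2Fimages%2Fexample.jpg'
    '?source=next&width=700 700w, '
    'https://www.ft.com/__origami/service/image/v2/images/raw/'
    'http%3A%2F%2Fim.ft-static.com%2Fcontent%2Fimages%2Fexample.jpg'
    '?source=next&width=300 300w'
)

src = srcset.split(',')[0].strip()   # first candidate, still proxy-wrapped
# The last path segment is the percent-encoded original URL; partitioning
# at '?' removes the query and the ' 700w' descriptor that follows it.
src = unquote(src.rpartition('/')[2].partition('?')[0])
print(src)  # http://im.ft-static.com/content/images/example.jpg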

recipes/financial_times_us.recipe

@@ -4,10 +4,15 @@ __copyright__ = '2010-2015, Darko Miletic <darko.miletic at gmail.com>'
 www.ft.com/international-edition
 '''
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from collections import OrderedDict
+from urllib import unquote
+
+
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
 class FinancialTimes(BasicNewsRecipe):
@@ -24,16 +29,19 @@ class FinancialTimes(BasicNewsRecipe):
     needs_subscription = True
     encoding = 'utf8'
     publication_type = 'newspaper'
-    articles_are_obfuscated = True
-    temp_files = []
     masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
     LOGIN = 'https://accounts.ft.com/login?location=http%3A%2F%2Fwww.ft.com%2Fhome%2Fuk'
     INDEX = 'http://www.ft.com/international-edition'
     PREFIX = 'http://www.ft.com'
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
-    }
+    keep_only_tags = [
+        classes('article__header--wrapper article__time-byline article__body n-content-image')
+    ]
+    remove_tags = [
+        classes('n-content-related-box tour-tip')
+    ]
+    remove_attributes = ['width', 'height', 'lang', 'style']

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
@@ -46,55 +54,6 @@ class FinancialTimes(BasicNewsRecipe):
         br.submit()
         return br

-    keep_only_tags = [
-        dict(name='p', attrs={
-            'class': lambda x: x and 'lastUpdated' in x.split()}),
-        dict(name='div', attrs={
-            'class': lambda x: x and 'syndicationHeadline' in x.split()}),
-        dict(name='p', attrs={'class': lambda x: x and 'byline' in x.split()}),
-        dict(name='div', attrs={'class': [
-            'fullstory fullstoryBody', 'ft-story-header', 'ft-story-body', 'index-detail']})
-    ]
-    remove_tags = [
-        dict(name='style', attrs={'id': 'antiClickjack'}), dict(name='div', attrs={'id': 'floating-con'}), dict(name=['meta', 'iframe', 'base', 'object', 'embed', 'link']), dict(attrs={'class': ['storyTools', 'story-package', 'screen-copy', 'story-package separator', 'expandable-image', 'promobox']}), dict(name='div', attrs={'class': lambda x: x and 'insideArticleRelatedTopics' in x.split()}), dict(name='div', attrs={'class': lambda x: x and 'ft-new-story-tools-box' in x.split()}), dict(name='div', attrs={'class': ['railMiniVideo', 'ftbf-syndicationIndicator']})  # noqa
-    ]
-    remove_attributes = ['width', 'height', 'lang']
-    extra_css = """
-        body{font-family: Georgia,Times,"Times New Roman",serif}
-        h2{font-size:large}
-        .ft-story-header{font-size: x-small}
-        .container{font-size:x-small;}
-        h3{font-size:x-small;color:#003399;}
-        .copyright{font-size: x-small}
-        img{margin-top: 0.8em; display: block}
-        .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small}
-        .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}
-    """
-
-    def get_artlinks(self, elem):
-        articles = []
-        count = 0
-        for item in elem.findAll('a', href=True):
-            count = count + 1
-            if self.test and count > 2:
-                return articles
-            rawlink = item['href']
-            url = rawlink
-            if not rawlink.startswith('http://'):
-                url = self.PREFIX + rawlink
-            try:
-                # resolve redirect.
-                urlverified = self.browser.open_novisit(url).geturl()
-            except:
-                continue
-            title = self.tag_to_string(item)
-            date = strftime(self.timefmt)
-            articles.append({
-                'title': title, 'date': date, 'url': urlverified, 'description': ''
-            })
-        return articles

     def parse_index(self):
         feeds = OrderedDict()
         soup = self.index_to_soup(self.INDEX)
@@ -121,44 +80,8 @@ class FinancialTimes(BasicNewsRecipe):
         return ans

     def preprocess_html(self, soup):
-        items = ['promo-box', 'promo-title',
-                 'promo-headline', 'promo-image',
-                 'promo-intro', 'promo-link', 'subhead']
-        for item in items:
-            for it in soup.findAll(item):
-                it.name = 'div'
-                it.attrs = []
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll('a'):
-            limg = item.find('img')
-            if item.string is not None:
-                str = item.string
-                item.replaceWith(str)
-            else:
-                if limg:
-                    item.name = 'div'
-                    item.attrs = []
-                else:
-                    str = self.tag_to_string(item)
-                    item.replaceWith(str)
-        for item in soup.findAll('img'):
-            if not item.has_key('alt'):  # noqa
-                item['alt'] = 'image'
+        for img in soup.findAll('img', srcset=True):
+            src = img['srcset'].split(',')[0].strip()
+            src = unquote(src.rpartition('/')[2].partition('?')[0])
+            img['src'] = src
         return soup
-
-    def get_obfuscated_article(self, url):
-        count = 0
-        while (count < 10):
-            try:
-                response = self.browser.open(url)
-                html = response.read()
-                count = 10
-            except:
-                print "Retrying download..."
-                count += 1
-        tfile = PersistentTemporaryFile('_fa.html')
-        tfile.write(html)
-        tfile.close()
-        self.temp_files.append(tfile)
-        return tfile.name
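Note: all three recipes now share the same two-stage cleanup, with keep_only_tags applied before remove_tags. A rough standalone approximation of that interplay, using bs4 and invented markup (calibre's own fetcher uses its bundled parser and considerably more machinery, so this is only a sketch):

from bs4 import BeautifulSoup

# Invented markup echoing the FT class names targeted above.
html = '''<html><body>
<div class="n-layout-header">Site chrome</div>
<div class="article__header--wrapper"><h1>Headline</h1></div>
<div class="article__body"><p>Story text.</p>
<div class="n-content-related-box">Related links</div></div>
<div class="o-footer">Site footer</div>
</body></html>'''

KEEP = 'article__header--wrapper article__time-byline article__body n-content-image'
REMOVE = 'n-content-related-box tour-tip'

def select(root, names):
    # bs4 exposes class as a list of names; match on any overlap,
    # like the classes() helper in the recipes.
    q = frozenset(names.split())
    return [t for t in root.find_all(True)
            if q & frozenset(t.get('class') or ())]

soup = BeautifulSoup(html, 'html.parser')
out = BeautifulSoup('<body></body>', 'html.parser')
for tag in select(soup, KEEP):    # stage 1: keep only the article parts
    out.body.append(tag)
for tag in select(out, REMOVE):   # stage 2: prune boxes inside what was kept
    tag.decompose()
print(out.prettify())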