# financial_times.recipe -- code ADDED by this patch, reconstructed from the
# whitespace-mangled diff and cleaned up.  The listed names are attributes /
# methods of FinancialTimes_rss(BasicNewsRecipe); shown at module level here
# so the unit is self-contained.

def classes(class_names):
    """Return a BeautifulSoup ``attrs`` matcher for recipe tag filters.

    *class_names* is a space-separated string; the matcher accepts any tag
    whose ``class`` attribute shares at least one name with it, and rejects
    tags with no ``class`` at all.  (Parameter renamed: the patch called it
    ``classes``, shadowing the function's own name.)
    """
    wanted = frozenset(class_names.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(wanted)})


# New recipe attributes introduced by the patch (replacing the old
# fullstory/ft-story-* selectors and the hand-written extra_css):
ignore_duplicate_articles = {'title'}
remove_empty_feeds = True
keep_only_tags = [
    classes('article__header--wrapper article__time-byline article__body n-content-image')
]
remove_tags = [
    classes('n-content-related-box tour-tip')
]
remove_attributes = ['width', 'height', 'lang', 'style']
feeds = [
    (u'UK', u'http://www.ft.com/rss/home/uk'),
    (u'US', u'http://www.ft.com/rss/home/us'),
    (u'Asia', u'http://www.ft.com/rss/home/asia'),
    (u'Middle East', u'http://www.ft.com/rss/home/middleeast'),
]


def preprocess_html(self, soup):
    """Rewrite FT's lazy-loaded images so the downloader can fetch them.

    For each ``<img srcset=...>`` take the first candidate, strip the FT
    image-service wrapper (the last path segment minus its query string)
    and percent-decode it into a plain ``src``.
    """
    # Function-scope compatibility import: the patch's module-level
    # ``from urllib import unquote`` is Python-2 only and breaks on py3.
    try:
        from urllib.parse import unquote
    except ImportError:
        from urllib import unquote
    for img in soup.findAll('img', srcset=True):
        # A srcset candidate is "URL [descriptor]".  The patch kept the
        # whole candidate, so a trailing width descriptor (e.g. " 700w")
        # leaked into src whenever the URL had no '?' query to cut at.
        candidate = img['srcset'].split(',')[0].strip()
        url = candidate.split()[0]
        img['src'] = unquote(url.rpartition('/')[2].partition('?')[0])
    return soup
# financial_times_uk.recipe -- code ADDED by this patch, reconstructed from
# the whitespace-mangled diff and cleaned up.  These are attributes / methods
# of FinancialTimes(BasicNewsRecipe); shown at module level so the unit is
# self-contained.  (The hunk also deletes the old get_obfuscated_article
# retry loop, which used a bare ``except:`` and a py2 ``print`` statement.)

def classes(names):
    """Build a BeautifulSoup ``attrs`` filter from a space-separated list
    of CSS class names.

    Matches any tag carrying at least one of the given classes; tags
    without a ``class`` attribute never match.  (Parameter renamed from
    ``classes``, which shadowed the function name.)
    """
    selected = frozenset(names.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(selected)})


# New tag filters replacing the old fullstory/ft-story-* selectors:
keep_only_tags = [
    classes('article__header--wrapper article__time-byline article__body n-content-image')
]
remove_tags = [
    classes('n-content-related-box tour-tip')
]
remove_attributes = ['width', 'height', 'lang', 'style']


def preprocess_html(self, soup):
    """Turn FT's ``srcset``-only lazy images into downloadable ``src`` URLs.

    The FT image service percent-encodes the real image URL into the last
    path segment; decode that segment (query string dropped) and store it
    as ``src``.
    """
    # Local py2/py3 import: the patch's ``from urllib import unquote`` at
    # module level only exists on Python 2.
    try:
        from urllib.parse import unquote
    except ImportError:
        from urllib import unquote
    for img in soup.findAll('img', srcset=True):
        # Keep only the URL part of the first candidate; the patch left
        # the " NNNw" width descriptor attached when there was no query
        # string to partition on.
        first = img['srcset'].split(',')[0].strip().split()[0]
        img['src'] = unquote(first.rpartition('/')[2].partition('?')[0])
    return soup
# financial_times_us.recipe -- code ADDED by this patch, reconstructed from
# the whitespace-mangled diff and cleaned up.  These are attributes / methods
# of FinancialTimes(BasicNewsRecipe); shown at module level so the unit is
# self-contained.  (This range also covers the DELETED get_artlinks helper
# and get_obfuscated_article retry loop -- bare ``except:``, py2 ``print`` --
# which the patch correctly drops along with articles_are_obfuscated.)

def classes(css_names):
    """Return a ``dict(attrs=...)`` filter usable in keep_only_tags /
    remove_tags.

    A tag matches when its ``class`` attribute intersects the given
    space-separated *css_names*; class-less tags never match.  (Parameter
    renamed -- the patch's ``classes(classes)`` shadowed the function.)
    """
    targets = frozenset(css_names.split())

    def has_any(value):
        return value and frozenset(value.split()).intersection(targets)

    return dict(attrs={'class': has_any})


# New tag filters replacing the removed conversion_options / selector lists:
keep_only_tags = [
    classes('article__header--wrapper article__time-byline article__body n-content-image')
]
remove_tags = [
    classes('n-content-related-box tour-tip')
]
remove_attributes = ['width', 'height', 'lang', 'style']


def preprocess_html(self, soup):
    """Resolve lazy-loaded FT images: pick the first ``srcset`` candidate,
    unwrap the image service's percent-encoded payload (last path segment,
    query string removed) and assign it to ``src``.
    """
    # Compatibility import; ``from urllib import unquote`` (as the patch
    # adds at module level) is Python-2 only.
    try:
        from urllib.parse import unquote
    except ImportError:
        from urllib import unquote
    for img in soup.findAll('img', srcset=True):
        # Split off the width/density descriptor ("... 700w") that the
        # patch left attached whenever the URL carried no query string.
        url_part = img['srcset'].split(',')[0].strip().split()[0]
        img['src'] = unquote(url_part.rpartition('/')[2].partition('?')[0])
    return soup