diff --git a/Changelog.yaml b/Changelog.yaml
index 853f6a010e..953d0ff858 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -69,6 +69,7 @@
   improved recipes:
     - Le Temps
     - Perfil
+    - Financial Times UK

   new recipes:
     - title: "Daytona Beach Journal"
diff --git a/recipes/financial_times_uk.recipe b/recipes/financial_times_uk.recipe
index cf219cfda1..6fe1ac6acd 100644
--- a/recipes/financial_times_uk.recipe
+++ b/recipes/financial_times_uk.recipe
@@ -1,15 +1,17 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
-ft.com
+www.ft.com/uk-edition
 '''
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

 class FinancialTimes(BasicNewsRecipe):
-    title = u'Financial Times - UK printed edition'
+    title = 'Financial Times - UK printed edition'
     __author__ = 'Darko Miletic'
-    description = 'Financial world news'
+    description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."
+    publisher = 'The Financial Times Ltd.'
+    category = 'news, finances, politics, UK, World'
     oldest_article = 2
     language = 'en_GB'
     max_articles_per_feed = 250
@@ -17,14 +19,24 @@ class FinancialTimes(BasicNewsRecipe):
     use_embedded_content = False
     needs_subscription = True
     encoding = 'utf8'
-    simultaneous_downloads= 1
-    delay = 1
+    publication_type = 'newspaper'
+    cover_url = strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf')
+    masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
     LOGIN = 'https://registration.ft.com/registration/barrier/login'
     INDEX = 'http://www.ft.com/uk-edition'
     PREFIX = 'http://www.ft.com'

+    conversion_options = {
+        'comment' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+        , 'linearize_tables' : True
+    }
+
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
+        br.open(self.INDEX)
         if self.username is not None and self.password is not None:
             br.open(self.LOGIN)
             br.select_form(name='loginForm')
@@ -33,29 +45,34 @@ class FinancialTimes(BasicNewsRecipe):
             br.submit()
         return br

-    keep_only_tags = [ dict(name='div', attrs={'id':'cont'}) ]
-    remove_tags_after = dict(name='p', attrs={'class':'copyright'})
+    keep_only_tags = [dict(name='div', attrs={'class':['fullstory fullstoryHeader','fullstory fullstoryBody','ft-story-header','ft-story-body','index-detail']})]
     remove_tags = [
                       dict(name='div', attrs={'id':'floating-con'})
                      ,dict(name=['meta','iframe','base','object','embed','link'])
+                     ,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image']})
                   ]
     remove_attributes = ['width','height','lang']

     extra_css = """
-                body{font-family:Arial,Helvetica,sans-serif;}
-                h2{font-size:large;}
-                .ft-story-header{font-size:xx-small;}
-                .ft-story-body{font-size:small;}
-                a{color:#003399;}
+                body{font-family: Georgia,Times,"Times New Roman",serif}
+                h2{font-size:large}
+                .ft-story-header{font-size: x-small}
                 .container{font-size:x-small;}
                 h3{font-size:x-small;color:#003399;}
                 .copyright{font-size: x-small}
+                img{margin-top: 0.8em; display: block}
+                .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small}
+                .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}
                 """

     def get_artlinks(self, elem):
         articles = []
         for item in elem.findAll('a',href=True):
-            url = self.PREFIX + item['href']
+            rawlink = item['href']
+            if rawlink.startswith('http://'):
+                url = rawlink
+            else:
+                url = self.PREFIX + rawlink
             title = self.tag_to_string(item)
             date = strftime(self.timefmt)
             articles.append({
@@ -65,7 +82,7 @@ class FinancialTimes(BasicNewsRecipe):
                 ,'description':''
                 })
         return articles
-
+
     def parse_index(self):
         feeds = []
         soup = self.index_to_soup(self.INDEX)
@@ -80,11 +97,34 @@ class FinancialTimes(BasicNewsRecipe):
             strest.insert(0,st)
         for item in strest:
             ftitle = self.tag_to_string(item)
-            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
+            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
             feedarts = self.get_artlinks(item.parent.ul)
             feeds.append((ftitle,feedarts))
         return feeds

     def preprocess_html(self, soup):
-        return self.adeify_images(soup)
-
+        items = ['promo-box','promo-title',
+                 'promo-headline','promo-image',
+                 'promo-intro','promo-link','subhead']
+        for item in items:
+            for it in soup.findAll(item):
+                it.name = 'div'
+                it.attrs = []
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('a'):
+            limg = item.find('img')
+            if item.string is not None:
+                str = item.string
+                item.replaceWith(str)
+            else:
+                if limg:
+                    item.name = 'div'
+                    item.attrs = []
+                else:
+                    str = self.tag_to_string(item)
+                    item.replaceWith(str)
+        for item in soup.findAll('img'):
+            if not item.has_key('alt'):
+                item['alt'] = 'image'
+        return soup
diff --git a/recipes/icons/financial_times_uk.png b/recipes/icons/financial_times_uk.png
new file mode 100644
index 0000000000..2a769d9dbb
Binary files /dev/null and b/recipes/icons/financial_times_uk.png differ