From 1af4092851cde94998ea58ed79b8f90000612180 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 13 Feb 2018 08:08:08 +0530 Subject: [PATCH] Add a configurable parameter to the NYT web edition recipe to skip articles older than specified number of days --- recipes/nytimes.recipe | 15 +++++++++++++-- recipes/nytimes_sub.recipe | 15 +++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe index e500d10c51..2743d2ca1d 100644 --- a/recipes/nytimes.recipe +++ b/recipes/nytimes.recipe @@ -11,6 +11,8 @@ from calibre.utils.date import strptime from calibre.web.feeds.news import BasicNewsRecipe is_web_edition = True +oldest_web_edition_article = 7 # days + # The sections to download when downloading the web edition, comment out # the section you are not interested in web_sections = [ @@ -155,12 +157,21 @@ class NewYorkTimes(BasicNewsRecipe): p = article.find(**classes('summary')) if p is not None: desc = self.tag_to_string(p) - yield {'title': title, 'url': url, 'description': desc} + date = '' + d = date_from_url(url) + if d is not None: + date = format_date(d) + today = datetime.date.today() + delta = today - d + if delta.days > oldest_web_edition_article: + self.log.debug('\tSkipping article', title, 'as it is too old') + continue + yield {'title': title, 'url': url, 'description': desc, 'date': date} def parse_web_section(self, soup, slug): def log(article): - self.log('\t', article['title'], ':', article['url']) + self.log('\t', article['title'] + article['date'], ':', article['url']) if article.get('description'): self.log('\t\t', article['description']) diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 73ef9674ba..01de3e0e14 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -11,6 +11,8 @@ from calibre.utils.date import strptime from calibre.web.feeds.news import BasicNewsRecipe is_web_edition = False +oldest_web_edition_article = 7 # days + # The sections to download when downloading the web edition, comment out # the section you are not interested in web_sections = [ @@ -155,12 +157,21 @@ class NewYorkTimes(BasicNewsRecipe): p = article.find(**classes('summary')) if p is not None: desc = self.tag_to_string(p) - yield {'title': title, 'url': url, 'description': desc} + date = '' + d = date_from_url(url) + if d is not None: + date = format_date(d) + today = datetime.date.today() + delta = today - d + if delta.days > oldest_web_edition_article: + self.log.debug('\tSkipping article', title, 'as it is too old') + continue + yield {'title': title, 'url': url, 'description': desc, 'date': date} def parse_web_section(self, soup, slug): def log(article): - self.log('\t', article['title'], ':', article['url']) + self.log('\t', article['title'] + article['date'], ':', article['url']) if article.get('description'): self.log('\t\t', article['description'])