From 24fb0356c1ef722eda4e67da5859c2baa6542069 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Wed, 18 Sep 2024 11:50:05 +0530
Subject: [PATCH] Update nytfeeds.recipe add more feeds, skip video links.

---
 recipes/nytfeeds.recipe | 44 ++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe
index cbe7b18051..5e09f0a5b6 100644
--- a/recipes/nytfeeds.recipe
+++ b/recipes/nytfeeds.recipe
@@ -208,17 +208,27 @@ class nytFeeds(BasicNewsRecipe):
         img { display:block; margin:0 auto; }
     '''

+    # https://www.nytimes.com/rss
+    # https://developer.nytimes.com/docs/rss-api/1/overview
     feeds = [
-        ('World', 'https://rss.nytimes.com/services/xml/rss/nyt/World.xml'),
-        ('US', 'https://rss.nytimes.com/services/xml/rss/nyt/US.xml'),
-        ('Business', 'https://rss.nytimes.com/services/xml/rss/nyt/Business.xml'),
-        ('Technology', 'https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml'),
-        ('Science', 'https://rss.nytimes.com/services/xml/rss/nyt/Science.xml'),
-        ('Arts', 'https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml'),
-        ('Fashion & Style', 'https://rss.nytimes.com/services/xml/rss/nyt/FashionandStyle.xml'),
-        ('TMagazine', 'https://rss.nytimes.com/services/xml/rss/nyt/tmagazine.xml'),
-        ('Travel', 'https://www.nytimes.com/services/xml/rss/nyt/Travel.xml'),
-        ('Sunday Review', 'https://rss.nytimes.com/services/xml/rss/nyt/sunday-review.xml'),
+        # to filter out all opinions from other sections first
+        'https://rss.nytimes.com/services/xml/rss/nyt/Opinion.xml',
+
+        'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/World.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/US.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/Business.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/YourMoney.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/Climate.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/Health.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/FashionandStyle.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/tmagazine.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/books.xml',
+        'https://www.nytimes.com/services/xml/rss/nyt/Travel.xml',
+        'http://nytimes.com/timeswire/feeds/'
     ]

     def get_browser(self, *args, **kwargs):
@@ -231,6 +241,10 @@ class nytFeeds(BasicNewsRecipe):
         return br

     def preprocess_raw_html(self, raw_html, url):
+        if '/interactive/' in url:
+            return '<html><body><p><em>'\
+                + 'This is an interactive article, which is supposed to be read in a browser.'\
+                + '</p></em></body></html>'
         data = extract_json(raw_html)
         return '\n'.join(article_parse(data))

@@ -239,9 +253,15 @@ class nytFeeds(BasicNewsRecipe):
         if w and isinstance(w, str):
             res = '-' + w
             for img in soup.findAll('img', attrs={'src':True}):
-                ext = img['src'].split('?')[0].split('.')[-1]
-                img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
+                if '-article' in img['src']:
+                    ext = img['src'].split('?')[0].split('.')[-1]
+                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
         for c in soup.findAll('div', attrs={'class':'cap'}):
             for p in c.findAll(['p', 'div']):
                 p.name = 'span'
         return soup
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not re.search(r'/video/|live|/athletic/', url):
+            return url