This commit is contained in:
Kovid Goyal 2024-09-18 11:54:22 +05:30
commit 685fc41ce8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -208,17 +208,27 @@ class nytFeeds(BasicNewsRecipe):
img { display:block; margin:0 auto; } img { display:block; margin:0 auto; }
''' '''
# https://www.nytimes.com/rss
# https://developer.nytimes.com/docs/rss-api/1/overview
feeds = [ feeds = [
('World', 'https://rss.nytimes.com/services/xml/rss/nyt/World.xml'), # to filter out all opinions from other sections first
('US', 'https://rss.nytimes.com/services/xml/rss/nyt/US.xml'), 'https://rss.nytimes.com/services/xml/rss/nyt/Opinion.xml',
('Business', 'https://rss.nytimes.com/services/xml/rss/nyt/Business.xml'),
('Technology', 'https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml'), 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
('Science', 'https://rss.nytimes.com/services/xml/rss/nyt/Science.xml'), 'https://rss.nytimes.com/services/xml/rss/nyt/World.xml',
('Arts', 'https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml'), 'https://rss.nytimes.com/services/xml/rss/nyt/US.xml',
('Fashion & Style', 'https://rss.nytimes.com/services/xml/rss/nyt/FashionandStyle.xml'), 'https://rss.nytimes.com/services/xml/rss/nyt/Business.xml',
('TMagazine', 'https://rss.nytimes.com/services/xml/rss/nyt/tmagazine.xml'), 'https://rss.nytimes.com/services/xml/rss/nyt/YourMoney.xml',
('Travel', 'https://www.nytimes.com/services/xml/rss/nyt/Travel.xml'), 'https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
('Sunday Review', 'https://rss.nytimes.com/services/xml/rss/nyt/sunday-review.xml'), 'https://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
'https://rss.nytimes.com/services/xml/rss/nyt/Climate.xml',
'https://rss.nytimes.com/services/xml/rss/nyt/Health.xml',
'https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml',
'https://rss.nytimes.com/services/xml/rss/nyt/FashionandStyle.xml',
'https://rss.nytimes.com/services/xml/rss/nyt/tmagazine.xml',
'https://rss.nytimes.com/services/xml/rss/nyt/books.xml',
'https://www.nytimes.com/services/xml/rss/nyt/Travel.xml',
'http://nytimes.com/timeswire/feeds/'
] ]
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
@ -231,6 +241,10 @@ class nytFeeds(BasicNewsRecipe):
return br return br
def preprocess_raw_html(self, raw_html, url):
    """Turn the downloaded page source into the HTML used for the article.

    URLs containing ``/interactive/`` are not parsed at all: a fixed
    placeholder page is returned instead. For every other URL, ``raw_html``
    is passed to ``extract_json`` and the pieces produced by
    ``article_parse`` for that data are joined with newlines.

    :param raw_html: page source as downloaded (presumably ``str`` - confirm
        against the caller).
    :param url: URL the page was downloaded from.
    :return: HTML markup as a single string.
    """
    if '/interactive/' in url:
        # Tags are closed in the reverse order they were opened (<em> before
        # <p>) so the placeholder is well-formed HTML.
        return (
            '<html><body><p><em>'
            'This is an interactive article, which is supposed to be read in a browser.'
            '</em></p></body></html>'
        )
    data = extract_json(raw_html)
    return '\n'.join(article_parse(data))
@ -239,9 +253,15 @@ class nytFeeds(BasicNewsRecipe):
if w and isinstance(w, str): if w and isinstance(w, str):
res = '-' + w res = '-' + w
for img in soup.findAll('img', attrs={'src':True}): for img in soup.findAll('img', attrs={'src':True}):
ext = img['src'].split('?')[0].split('.')[-1] if '-article' in img['src']:
img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext ext = img['src'].split('?')[0].split('.')[-1]
img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
for c in soup.findAll('div', attrs={'class':'cap'}): for c in soup.findAll('div', attrs={'class':'cap'}):
for p in c.findAll(['p', 'div']): for p in c.findAll(['p', 'div']):
p.name = 'span' p.name = 'span'
return soup return soup
def get_article_url(self, article):
    """Return the URL to download for *article*, or None to reject it.

    The URL comes from ``BasicNewsRecipe.get_article_url``. None is returned
    when the URL contains ``/video/``, ``live`` or ``/athletic/``; presumably
    the feed machinery then skips the article - confirm against the calibre
    API.

    :param article: feed entry object, forwarded unchanged to the base class.
    :return: the article URL, or a falsy value when there is nothing to fetch.
    """
    url = BasicNewsRecipe.get_article_url(self, article)
    if not url:
        # re.search() raises TypeError on a non-string, so a missing URL
        # (assumed possible from the base implementation - confirm) is
        # propagated unchanged instead of being pattern-matched.
        return url
    # NOTE(review): the bare 'live' alternative matches that substring
    # anywhere in the URL (e.g. 'olive'), not only a '/live/' path segment -
    # confirm this is intended.
    if re.search(r'/video/|live|/athletic/', url):
        return None
    return url