From d793ed36472e99fffd1ff0b7c7b7a3f275f584ce Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Wed, 25 Sep 2024 13:56:47 +0530
Subject: [PATCH 1/2] Update New York Times (Web)

---
 recipes/icons/iht.png      | Bin 759 -> 0 bytes
 recipes/icons/nytimes.png  | Bin 301 -> 416 bytes
 recipes/iht.recipe         |  30 ----------
 recipes/nytfeeds.recipe    |  12 ++--
 recipes/nytimes.recipe     | 116 +++++++++++++++++++++++++++++++------
 recipes/nytimes_sub.recipe |  45 +++++++++-----
 6 files changed, 136 insertions(+), 67 deletions(-)
 delete mode 100644 recipes/icons/iht.png
 delete mode 100644 recipes/iht.recipe
diff --git a/recipes/icons/iht.png b/recipes/icons/iht.png
deleted file mode 100644
index a9d598da97068054857a788ed2716a6d6e03aaf6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 759
zcmeAS@N?(olHy`uVBq!ia0vp^3LwnE3?yBabRA=0U{nk632_B-b8~Zzjg9~R{|{tu
z*s$T|&6_~Bh=_=WhK8G)o0OE)fddBw1qFe!K)}z>50r%{ke8Pa4-W^b6cQ4$wYBZ;
z?gonQ-@m`GuuxG^5y%FrpE+|TkgTq*K6&ya4-b!>on3i(d1z?plP6DrMwytHczb&n
z7Z;0*i*s{x1GPv>N&@X=XJ_Z-<*lu)UA%ZPP`kFawz9IagoH#~T%545@S;VFI5|1F
zxVTJBO=DtWcJ11wrlwX_Rt9uKYip~Xo*vM&i4!MIpFUk*Uq3Q35@;+?WarMEKr?~r
zCr_Rn5fK6OG6x68rAwFAty|aI+k5x!U7#Z$K70tY&cMJxKtLcMAV5Jup`f6^!NEa;
zQzaHC5nU4G7tB!bUO?b{mce<Sh>cD66bw273id@$dbs-cH-~*Emozw>D3JI0J;mn0
zw6}LUCe8h&@O{R&*K_YB{QQvh_+!M6pC^BA{QQ1J%Ac=OUSGeH@TBBVO5x`P-=k+-
zul(NeG0s^47%{4zE{-7;x3*qNPI{BT(E4z4r}4JJ-21n--8H`RU+GxaE4%v7&y3?v
z-mdUy3N`pO{oj+6_stelH;5U(`MjT#;ZlBGgXaTV_kWr3C)OMMbkpYdvD>I~`ejzC
z$diOm^X(Y2RHF4aul~cQw*M5Hz@N{8Nk2Zj?7icmy&+)V<KzDF0X<Rz+l@B}@6}nj
zVj`2m0mpeOBWh-?YxFy(wdlC*4CUvIaRM<Ptau&1G~ZCl)Sh;%>5S27?F|vK+zZxe
zZg^C|z}VMV7{B2jua@*)hxU_dC)bt*-0tkStFVE4vhB8~UoY8Nv3Rggdu^9;`=VhL
zi{?S$8FM!+{yt;r=Hg{8Gh(OQe0qJ7)X7t-YidmUb~fEDoysg4*}@$9PW9>R=p6yv
gk&${Wu6ve<btQ#GoIPA528<U5Pgg&ebxsLQ07`g882|tP

diff --git a/recipes/icons/nytimes.png b/recipes/icons/nytimes.png
index 9ae9985ee4663dd2aa8177fbd2f0ff78a7cc9f07..2d170d68f4a5ce7fc46817242854f5fe5ab8d616 100644
GIT binary patch
literal 416
zcmeAS@N?(olHy`uVBq!ia0vp^3LwnE3?yBabR7dyHUT~%uI>dsLPA2qU?3tQ0%XI1
zgoFf;4HN`Q!Ud(Iq~Po;{Iezk%@HmM@(X78sMRBxc;&=Qz6~}$kCyLZt5&=_EqgZu
z10#c{i(`mI@6pLOC$%c@xZDiCdFKEB*DO0;3GC?)p3Lt3GI)ZYa`OM+e<zaSub9nT
zW8cK;`z0$uM1R+#^rn(^yIPm;c>2ReOyN+N_5#KrjW+vTf!qZR2fyu^@I7#eXR1u{
z@+mhK@R=<YTIu~T;<04>tTem4qWR8@hg(<<tesND_~C<CHG4e6_228{A7sqdyO5Aj
z$275dNrTKOmSY!W7`fJ+%w}K0>ubDll77S1Cht<_H@rOJ*<!H|H~wQf=M!5d&2gfb
zNmb{7A=9Kvw<UYdtLWP6PnoOy<ragw-4W5^lZh_-csH;8-WfdiE2Dkt{ZAtAuFb8#
k*e)sVRPbf`kJrB#4JQk)F!JQ(0R|O=r>mdKI;Vst02fl1tpET3

literal 301
zcmV+|0n+}7P)<h;3K|Lk000e1NJLTq000mG000mO00000I+&on0002=Nkl<Zc-jSk
zKPW_T0LQ=2d+(N~7#Ivvl7UIFTq)Zb4JHFOGEh#gENr?AH<*>htVm)|EJm4Z7RBqX
zlkVQV_wM)peIyJCZK9L_Vi1IYAK~*$o}&!1?#|5W?kmkIMJNiW%aTf#1PIe(VY9h6
zHY#^5qVLFYbliWQynnemD!Z|#7~-GG+`LW?bI1abnWCzvmD0i1O%#v`dWk(nS?fcy
zEpo(wJB|M>+xsV9X^|!(;KS9!V&$#zT_Z=(W_o0}@kwoI!^eoY;98bj-}WXRLr@oj
z{MMG_UA6$hpbwpkv-9_Yh>mE1F{Fl+AWQfMaOARK#zZT$00000NkvXXu0mjf-%y6A

diff --git a/recipes/iht.recipe b/recipes/iht.recipe
deleted file mode 100644
index 70129527eb..0000000000
--- a/recipes/iht.recipe
+++ /dev/null
@@ -1,30 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class NYTimesGlobal(BasicNewsRecipe):
-    title = u'NY Times Global'
-    language = 'en'
-    __author__ = 'Krittika Goyal'
-    oldest_article = 1  # days
-    max_articles_per_feed = 25
-    use_embedded_content = False
-
-    no_stylesheets = True
-    auto_cleanup = True
-
-    feeds = [
-        ('NYTimes',
-         'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'),
-        ('NYTimes global',
-         'http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml'),
-        ('World',
-         'http://www.nytimes.com/services/xml/rss/nyt/World.xml'),
-        ('U.S.',
-         'http://www.nytimes.com/services/xml/rss/nyt/US.xml'),
-        ('Business',
-         'http://feeds.nytimes.com/nyt/rss/Business'),
-        ('Sports',
-         'http://www.nytimes.com/services/xml/rss/nyt/Sports.xml'),
-        ('Technology',
-         'http://feeds.nytimes.com/nyt/rss/Technology'),
-    ]
diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe
index 9a0d54215d..4b9fd17551 100644
--- a/recipes/nytfeeds.recipe
+++ b/recipes/nytfeeds.recipe
@@ -1,9 +1,8 @@
 #!/usr/bin/env python
 import json
 import re
-import time
-from datetime import datetime, timedelta
 
+from calibre.utils.iso8601 import parse_iso8601
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
@@ -66,7 +65,7 @@ def parse_byline(byl):
     yield '</i></b></div>'
 
 def iso_date(x):
-    dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
+    dt = parse_iso8601(x, as_utc=False)
     return dt.strftime('%b %d, %Y at %I:%M %p')
 
 def parse_header(h):
@@ -138,7 +137,7 @@ def parse_types(x):
     elif x.get('__typename', '') == 'Image':
         yield ''.join(parse_image(x))
     elif x.get('__typename', '') == 'ImageBlock':
-        yield ''.join(parse_image(x['media']))
+        yield ''.join(parse_types(x['media']))
     elif x.get('__typename', '') == 'GridBlock':
         yield ''.join(parse_img_grid(x))
 
@@ -265,6 +264,8 @@ class nytFeeds(BasicNewsRecipe):
         'https://rss.nytimes.com/services/xml/rss/nyt/tmagazine.xml',
         'https://rss.nytimes.com/services/xml/rss/nyt/books.xml',
         'https://www.nytimes.com/services/xml/rss/nyt/Travel.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/well.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml',
         'http://nytimes.com/timeswire/feeds/'
     ]
 
@@ -301,5 +302,6 @@ class nytFeeds(BasicNewsRecipe):
     def get_article_url(self, article):
         url = BasicNewsRecipe.get_article_url(self, article)
         # you can remove '|/espanol/' from code below to include spanish articles.
-        if not re.search(r'/video/|/live/|/athletic/|/espanol/', url):
+        if not re.search(r'/video/|/live/|/athletic/|/espanol/|/card/', url):
             return url
+        self.log('\tSkipped URL: ', url)  # no return after this, so a skipped URL yields None
diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index 36b6ad1a5e..306fbd460d 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -14,9 +14,7 @@ from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-is_web_edition = True
-oldest_web_edition_article = 7  # days
-use_wayback_machine = True
+use_wayback_machine = False
 
 
 # The sections to download when downloading the web edition, comment out
@@ -77,22 +75,28 @@ def new_tag(soup, name, attrs=()):
 
 
 class NewYorkTimes(BasicNewsRecipe):
-
-    if is_web_edition:
-        title = 'The New York Times (Web)'
-        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
-    else:
-        title = 'The New York Times'
-        description = 'Today\'s New York Times'
+    title = 'The New York Times (Web)'
+    description = (
+        'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
+        'Use advanced menu to make changes to fetch Todays Paper'
+    )
     encoding = 'utf-8'
     __author__ = 'Kovid Goyal'
-    language = 'en'
+    language = 'en_US'
     ignore_duplicate_articles = {'title', 'url'}
     no_stylesheets = True
-    compress_news_images = True
-    compress_news_images_auto_size = 5
-    conversion_options = {'flow_size': 0}
-    delay = 0 if use_wayback_machine else 1
+    is_web_edition = True
+    oldest_web_edition_article = 7  # days
+
+    extra_css = '''
+        .byl, .time { font-size:small; color:#202020; }
+        .cap { font-size:small; text-align:center; }
+        .cred { font-style:italic; font-size:small; }
+        em, blockquote { color: #202020; }
+        .sc { font-variant: small-caps; }
+        .lbl { font-size:small; color:#404040; }
+        img { display:block; margin:0 auto; }
+    '''
 
     @property
     def nyt_parser(self):
@@ -106,9 +110,13 @@ class NewYorkTimes(BasicNewsRecipe):
         if use_wayback_machine and not skip_wayback:
             from calibre import browser
             return self.nyt_parser.download_url(url, browser())
-        return self.browser.open_novisit(url).read()
+        return self.index_to_soup(url, raw=True)
 
     def preprocess_raw_html(self, raw_html, url):
+        if '/interactive/' in url:
+            # Interactive pages are not parsed; a placeholder note is returned instead.
+            return ('<html><body><p><em>This is an interactive article, which is supposed to be read in a browser.'
+                    '</em></p></body></html>')
         html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
         return html
 
@@ -121,9 +129,51 @@ class NewYorkTimes(BasicNewsRecipe):
                 tf.write(self.get_nyt_page(url))
             return tf.name
 
+    recipe_specific_options = {
+        'web': {
+            'short': 'Type in yes, if you want Todays Paper',
+            'default': 'Web Edition'
+        },
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 1, gives you articles from the past 24 hours\n(Works only for Web_Edition)',
+            'default': str(oldest_web_edition_article)
+        },
+        'date': {
+            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of NYT newspaper',
+            'long': 'For example, 2024/07/16'
+        },
+        'res': {
+            'short': 'For hi-res images, select a resolution from the following\noptions: popup, jumbo, mobileMasterAt3x, superJumbo',
+            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use articleInline.',
+        },
+        'comp': {
+            'short': 'Compress News Images?',
+            'long': 'enter yes',
+            'default': 'no'
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        # Apply user-supplied recipe options; values that are not plain strings are ignored.
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        c = self.recipe_specific_options.get('comp')
+        d = self.recipe_specific_options.get('days')
+        w = self.recipe_specific_options.get('web')
+        # 'yes' is matched case-insensitively for both the 'web' and 'comp' options.
+        if w and isinstance(w, str) and w.lower() == 'yes':
+            self.is_web_edition = False
+        if d and isinstance(d, str):
+            self.oldest_web_edition_article = float(d)
+        if c and isinstance(c, str) and c.lower() == 'yes':
+            self.compress_news_images = True
+
     def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
         # INDEX = 'file:///t/raw.html'
+        d = self.recipe_specific_options.get('date')
+        if d and isinstance(d, str):
+            INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
         return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))
 
     def read_nyt_metadata(self):
@@ -219,7 +269,7 @@ class NewYorkTimes(BasicNewsRecipe):
                         date = format_date(d)
                         today = datetime.date.today()
                         delta = today - d
-                        if delta.days > oldest_web_edition_article:
+                        if delta.days > self.oldest_web_edition_article:
                             self.log.debug('\tSkipping article', title, 'as it is too old')
                             continue
                     yield {'title': title, 'url': url, 'description': desc, 'date': date}
@@ -242,7 +292,7 @@ class NewYorkTimes(BasicNewsRecipe):
                         date = format_date(d)
                         today = datetime.date.today()
                         delta = today - d
-                        if delta.days > oldest_web_edition_article:
+                        if delta.days > self.oldest_web_edition_article:
                             self.log.debug('\tSkipping article', title, 'as it is too old')
                             continue
                     yield {'title': title, 'url': url, 'description': desc, 'date': date}
@@ -290,6 +340,34 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
-        if is_web_edition:
+        if self.is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
+
+    def get_browser(self, *args, **kwargs):  # browser with a Googlebot user agent and Google referer
+        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders += [
+            ('Referer', 'https://www.google.com/'),
+            ('X-Forwarded-For', '66.249.66.1')
+        ]
+        return br
+
+    def preprocess_html(self, soup):  # optionally swap image resolution, then flatten caption markup
+        w = self.recipe_specific_options.get('res')  # user-chosen resolution name, if any
+        if w and isinstance(w, str):
+            res = '-' + w  # replaces the trailing '-article...' part of the image URL
+            for img in soup.findAll('img', attrs={'src':True}):
+                if '-article' in img['src']:
+                    ext = img['src'].split('?')[0].split('.')[-1]  # file extension, query string ignored
+                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
+        for c in soup.findAll('div', attrs={'class':'cap'}):
+            for p in c.findAll(['p', 'div']):
+                p.name = 'span'  # p/div tags inside a caption are renamed to span
+        return soup
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not re.search(r'/video/|/athletic/|/card/', url):  # keep URLs without these path parts
+            return url
+        self.log('\tSkipping ', url)  # no return after this, so a skipped URL yields None
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 96d67567d4..2a859fdca3 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -14,8 +14,6 @@ from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-is_web_edition = False
-oldest_web_edition_article = 7  # days
 use_wayback_machine = False
 
 
@@ -77,18 +75,18 @@ def new_tag(soup, name, attrs=()):
 
 
 class NewYorkTimes(BasicNewsRecipe):
-
-    if is_web_edition:
-        title = 'The New York Times (Web)'
-        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
-    else:
-        title = 'The New York Times'
-        description = 'Today\'s New York Times'
+    title = 'The New York Times'
+    description = (
+        'New York Times. Todays Paper '
+        'Use advanced menu to make changes to fetch Web Edition'
+    )
     encoding = 'utf-8'
     __author__ = 'Kovid Goyal'
     language = 'en_US'
     ignore_duplicate_articles = {'title', 'url'}
     no_stylesheets = True
+    is_web_edition = False
+    oldest_web_edition_article = 7  # days
 
     extra_css = '''
         .byl, .time { font-size:small; color:#202020; }
@@ -132,8 +130,17 @@ class NewYorkTimes(BasicNewsRecipe):
             return tf.name
 
     recipe_specific_options = {
+        'web': {
+            'short': 'Type in yes, if you want Web Edition',
+            'default': 'Todays Paper'
+        },
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 1, gives you articles from the past 24 hours\n(Works only for Web_Edition)',
+            'default': str(oldest_web_edition_article)
+        },
         'date': {
-            'short': 'The date of the edition to download (YYYY/MM/DD format)',
+            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of NYT newspaper',
             'long': 'For example, 2024/07/16'
         },
         'res': {
@@ -150,6 +157,13 @@ class NewYorkTimes(BasicNewsRecipe):
     def __init__(self, *args, **kwargs):
         BasicNewsRecipe.__init__(self, *args, **kwargs)
         c = self.recipe_specific_options.get('comp')
+        d = self.recipe_specific_options.get('days')
+        w = self.recipe_specific_options.get('web')
+        # Match 'yes' case-insensitively, as the 'comp' check below does.
+        if w and isinstance(w, str) and w.lower() == 'yes':
+            self.is_web_edition = True
+        if d and isinstance(d, str):
+            self.oldest_web_edition_article = float(d)
         if c and isinstance(c, str):
             if c.lower() == 'yes':
                 self.compress_news_images = True
@@ -255,7 +269,7 @@ class NewYorkTimes(BasicNewsRecipe):
                         date = format_date(d)
                         today = datetime.date.today()
                         delta = today - d
-                        if delta.days > oldest_web_edition_article:
+                        if delta.days > self.oldest_web_edition_article:
                             self.log.debug('\tSkipping article', title, 'as it is too old')
                             continue
                     yield {'title': title, 'url': url, 'description': desc, 'date': date}
@@ -278,7 +292,7 @@ class NewYorkTimes(BasicNewsRecipe):
                         date = format_date(d)
                         today = datetime.date.today()
                         delta = today - d
-                        if delta.days > oldest_web_edition_article:
+                        if delta.days > self.oldest_web_edition_article:
                             self.log.debug('\tSkipping article', title, 'as it is too old')
                             continue
                     yield {'title': title, 'url': url, 'description': desc, 'date': date}
@@ -326,7 +340,7 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
-        if is_web_edition:
+        if self.is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
 
@@ -351,3 +365,8 @@ class NewYorkTimes(BasicNewsRecipe):
             for p in c.findAll(['p', 'div']):
                 p.name = 'span'
         return soup
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not re.search(r'/video/|/athletic/|/card/', url):  # same filter as the web-edition recipe
+            return url  # any other URL falls through and yields None

From 9e0506aaad9f8c4c8d60dbc618f5d08597c44bb8 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Wed, 25 Sep 2024 14:01:32 +0530
Subject: [PATCH 2/2] use accent char to identify...

..resolved internal links
---
 src/calibre/web/feeds/news.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 96f73faea1..ec1dfd2e75 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -1876,6 +1876,8 @@ class BasicNewsRecipe(Recipe):
                         if articles:
                             arelpath = sorted(articles, key=numeric_sort_key)[0]
                             a.set('href', item.relhref(arelpath))
+                            if a.text and len(a) == 0:
+                                a.text = a.text + '`'
                             if url not in seen:
                                 log.debug(f'Resolved internal URL: {url} -> {arelpath}')
                                 seen.add(url)