From 7980be7dd509e81f2e832c527f66cca10512cc1b Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 22 Sep 2024 18:26:05 +0530
Subject: [PATCH 1/3] Update nytimes.py

---
 recipes/nytfeeds.recipe                 |   4 +-
 recipes/nytimes_sub.recipe              |  63 ++++++-
 src/calibre/web/site_parsers/nytimes.py | 241 +++++++++++++++---------
 3 files changed, 217 insertions(+), 91 deletions(-)
diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe
index 3fed8877c2..9a0d54215d 100644
--- a/recipes/nytfeeds.recipe
+++ b/recipes/nytfeeds.recipe
@@ -26,7 +26,7 @@ def parse_image(i):
 
 def parse_img_grid(g):
     for grd in g.get('gridMedia', {}):
-        yield '<br/>'.join(parse_image(grd))
+        yield ''.join(parse_image(grd))
     if g.get('caption'):
         yield '<div class="cap">{}'.format(g['caption'])
         if g.get('credit'):
@@ -301,5 +301,5 @@ class nytFeeds(BasicNewsRecipe):
     def get_article_url(self, article):
         url = BasicNewsRecipe.get_article_url(self, article)
         # you can remove '|/espanol/' from code below to include spanish articles.
-        if not re.search(r'/video/|live|/athletic/|/espanol/', url):
+        if not re.search(r'/video/|/live/|/athletic/|/espanol/', url):
             return url
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 1486abe7a2..38f48ae632 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -16,7 +16,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 
 is_web_edition = False
 oldest_web_edition_article = 7  # days
-use_wayback_machine = True
+use_wayback_machine = False
 
 
 # The sections to download when downloading the web edition, comment out
@@ -89,10 +89,16 @@ class NewYorkTimes(BasicNewsRecipe):
     language = 'en_US'
     ignore_duplicate_articles = {'title', 'url'}
     no_stylesheets = True
-    compress_news_images = True
-    compress_news_images_auto_size = 5
-    conversion_options = {'flow_size': 0}
-    delay = 0 if use_wayback_machine else 1
+
+    extra_css = '''
+        .byl, .time { font-size:small; color:#202020; }
+        .cap { font-size:small; text-align:center; }
+        .cred { font-style:italic; font-size:small; }
+        em, blockquote { color: #202020; }
+        .sc { font-variant: small-caps; }
+        .lbl { font-size:small; color:#404040; }
+        img { display:block; margin:0 auto; }
+    '''
 
     @property
     def nyt_parser(self):
@@ -109,6 +115,10 @@ class NewYorkTimes(BasicNewsRecipe):
         return self.browser.open_novisit(url).read()
 
     def preprocess_raw_html(self, raw_html, url):
+        if '/interactive/' in url:
+            return '<html><body><p><em>'\
+                + 'This is an interactive article, which is supposed to be read in a browser.'\
+                    + '</em></p></body></html>'
         html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
         return html
 
@@ -125,9 +135,25 @@ class NewYorkTimes(BasicNewsRecipe):
         'date': {
             'short': 'The date of the edition to download (YYYY/MM/DD format)',
             'long': 'For example, 2024/07/16'
+        },
+        'res': {
+            'short': 'For hi-res images, select a resolution from the following\noptions: popup, jumbo, mobileMasterAt3x, superJumbo',
+            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use articleInline.',
+        },
+        'comp': {
+            'short': 'Compress News Images?',
+            'long': 'enter yes',
+            'default': 'no'
         }
     }
 
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        c = self.recipe_specific_options.get('comp')
+        if c and isinstance(c, str):
+            if c.lower() == 'yes':
+                self.compress_news_images = True
+
     def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
         # INDEX = 'file:///t/raw.html'
@@ -303,3 +329,30 @@ class NewYorkTimes(BasicNewsRecipe):
         if is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
+
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders += [
+            ('Referer', 'https://www.google.com/'),
+            ('X-Forwarded-For', '66.249.66.1')
+        ]
+        return br
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not '/video/' in url:
+            return url
+
+    def preprocess_html(self, soup):
+        w = self.recipe_specific_options.get('res')
+        if w and isinstance(w, str):
+            res = '-' + w
+            for img in soup.findAll('img', attrs={'src':True}):
+                if '-article' in img['src']:
+                    ext = img['src'].split('?')[0].split('.')[-1]
+                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
+        for c in soup.findAll('div', attrs={'class':'cap'}):
+            for p in c.findAll(['p', 'div']):
+                p.name = 'span'
+        return soup
diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py
index 919cabe61a..bb30659f69 100644
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -13,88 +13,171 @@ module_version = 5  # needed for live updates
 pprint
 
 
-def is_heading(tn):
-    return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block')
+def parse_image(i):
+    if i.get('crops'):
+        yield '<div><img src="{}">'.format(i['crops'][0]['renditions'][0]['url'])
+    elif i.get('spanImageCrops'):
+        yield '<div><img src="{}">'.format(i['spanImageCrops'][0]['renditions'][0]['url'])
+    if i.get('caption'):
+        yield '<div class="cap">' + ''.join(parse_types(i['caption']))
+        if i.get('credit'):
+            yield '<span class="cred"> ' + i['credit'] + '</span>'
+        yield '</div>'
+    yield '</div>'
 
+def parse_img_grid(g):
+    for grd in g.get('gridMedia', {}):
+        yield ''.join(parse_image(grd))
+    if g.get('caption'):
+        yield '<div class="cap">{}'.format(g['caption'])
+        if g.get('credit'):
+            yield '<span class="cred"> ' + g['credit'] + '</span>'
+        yield '</div>'
 
-def process_inline_text(lines, block):
-    text = ''
-    if 'text@stripHtml' in block:
-        text = escape(block['text@stripHtml'])
-    elif 'renderedRepresentation' in block:  # happens in byline blocks
-        text = block['renderedRepresentation']
-    elif 'text' in block:
-        text = block['text']
-    if text:
-        for fmt in block.get('formats', ()):
-            tn = fmt['__typename']
-            if tn == 'LinkFormat':
-                ab = fmt
-                text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text)
-            elif tn == 'BoldFormat':
-                text = '<b>' + text + '</b>'
-        lines.append(text)
+def parse_vid(v):
+    if v.get('promotionalMedia'):
+        if v.get('headline'):
+            if v.get('url'):
+                yield '<div><b><a href="{}">Video</a>: '.format(v['url'])\
+                    + v['headline'].get('default', '') + '</b></div>'
+            elif v['headline'].get('default'):
+                yield '<div><b>' + v['headline']['default'] + '</b></div>'
+        yield ''.join(parse_types(v['promotionalMedia']))
+        if v.get('promotionalSummary'):
+            yield '<div class="cap">' + v['promotionalSummary'] + '</div>'
 
+def parse_emb(e):  # EmbeddedInteractive: Datawrapper chart image, else promotional media
+    dw = re.search(r'datawrapper\.dwcdn\.net/(.{5})', e.get('html') or '')  # None if no chart id
+    if dw:
+        yield '<div><img src="https://datawrapper.dwcdn.net/{}/full.png"></div>'.format(dw.group(1))
+    elif e.get('promotionalMedia'):
+        if e.get('headline'):
+            yield '<div><b>' + e['headline']['default'] + '</b></div>'
+        yield ''.join(parse_types(e['promotionalMedia']))
+        if e.get('note'):
+            yield '<div class="cap">' + e['note'] + '</div>'
 
-def process_paragraph(lines, block, content_key='content'):
-    tn = block['__typename']
-    m = re.match(r'Heading([1-6])Block', tn)
-    if m is not None:
-        tag = 'h' + m.group(1)
-    else:
-        tag = 'p'
-    ta = block.get('textAlign') or 'LEFT'
-    style = f'text-align: {ta.lower()}'
-    lines.append(f'<{tag} style="{style}">')
-    for item in block[content_key]:
-        tn = item['__typename']
-        if tn in ('TextInline', 'Byline'):
-            process_inline_text(lines, item)
-    lines.append('</' + tag + '>')
+def parse_byline(byl):
+    for b in byl.get('bylines', {}):
+        yield '<div>' + b['renderedRepresentation'] + '</div>'
+    yield '<div><b><i>'
+    for rl in byl.get('role', {}):
+        if ''.join(parse_cnt(rl)).strip():
+            yield ''.join(parse_cnt(rl))
+    yield '</i></b></div>'
 
+def iso_date(x):
+    dt = parse_iso8601(x, as_utc=False)
+    return dt.strftime('%b %d, %Y at %I:%M %p')
 
-def process_timestamp(lines, block):
-    ts = block['timestamp']
-    dt = parse_iso8601(ts, as_utc=False)
-    lines.append('<p class="timestamp">' + escape(dt.strftime('%b %d, %Y')) + '</p>')
+def parse_header(h):
+    if h.get('label'):
+        yield '<div class="lbl">' + ''.join(parse_types(h['label'])) + '</div>'
+    if h.get('headline'):
+        yield ''.join(parse_types(h['headline']))
+    if h.get('summary'):
+        yield '<p><i>' +  ''.join(parse_types(h['summary'])) + '</i></p>'
+    if h.get('ledeMedia'):
+        yield ''.join(parse_types(h['ledeMedia']))
+    if h.get('byline'):
+        yield ''.join(parse_types(h['byline']))
+    if h.get('timestampBlock'):
+        yield ''.join(parse_types(h['timestampBlock']))
 
+def parse_fmt_type(fm):
+    for f in fm.get('formats', {}):
+        if f.get('__typename', '') == 'BoldFormat':
+            yield '<strong>'
+        if f.get('__typename', '') == 'ItalicFormat':
+            yield '<em>'
+        if f.get('__typename', '') == 'LinkFormat':
+            hrf = f['url']
+            yield '<a href="{}">'.format(hrf)
+    yield fm['text']
+    for f in reversed(fm.get('formats', {})):
+        if f.get('__typename', '') == 'BoldFormat':
+            yield '</strong>'
+        if f.get('__typename', '') == 'ItalicFormat':
+            yield '</em>'
+        if f.get('__typename', '') == 'LinkFormat':
+            yield '</a>'
 
-def process_header(lines, block):
-    label = block.get('label')
-    if label:
-        process_paragraph(lines, label)
-    headline = block.get('headline')
-    if headline:
-        process_paragraph(lines, headline)
-    summary = block.get('summary')
-    if summary:
-        process_paragraph(lines, summary)
-    lm = block.get('ledeMedia')
-    if lm and lm.get('__typename') == 'ImageBlock':
-        process_image_block(lines, lm)
-    byline = block.get('byline')
-    if byline:
-        process_paragraph(lines, byline, content_key='bylines')
-    timestamp = block.get('timestampBlock')
-    if timestamp:
-        process_timestamp(lines, timestamp)
+def parse_cnt(cnt):
+    if cnt.get('formats'):
+        yield ''.join(parse_fmt_type(cnt))
+    elif cnt.get('content'):
+        for cnt_ in cnt['content']:
+            yield from parse_types(cnt_)
+    elif cnt.get('text'):
+        yield cnt['text']
 
+def parse_types(x):
+    if 'Header' in x.get('__typename', ''):
+        yield '\n'.join(parse_header(x))
 
-def process_image_block(lines, block):
-    media = block['media']
-    caption = media.get('caption')
-    caption_lines = []
-    if caption:
-        process_inline_text(caption_lines, caption)
-    crops = media['crops']
-    renditions = crops[0]['renditions']
-    img = renditions[0]['url']
-    if 'web.archive.org' in img:
-        img = img.partition('/')[-1]
-        img = img[img.find('https://'):]
-    lines.append(f'<div style="text-align: center"><div style="text-align: center"><img src={quoteattr(img)}/></div><div style="font-size: smaller">')
-    lines.extend(caption_lines)
-    lines.append('</div></div>')
+    elif x.get('__typename', '') == 'Heading1Block':
+        yield '<h1>' + ''.join(parse_cnt(x)) + '</h1>'
+    elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block', 'Heading4Block'}:
+        yield '<h4>' + ''.join(parse_cnt(x)) + '</h4>'
+
+    elif x.get('__typename', '') == 'ParagraphBlock':
+        yield '<p>' + ''.join(parse_cnt(x)) + '</p>'
+
+    elif x.get('__typename', '') == 'BylineBlock':
+        yield '<div class="byl"><br/>' + ''.join(parse_byline(x)) + '</div>'
+    elif x.get('__typename', '') == 'LabelBlock':
+        yield '<div class="sc">' + ''.join(parse_cnt(x)) + '</div>'
+    elif x.get('__typename', '') == 'BlockquoteBlock':
+        yield '<blockquote>' + ''.join(parse_cnt(x)) + '</blockquote>'
+    elif x.get('__typename', '') == 'TimestampBlock':
+        yield '<div class="time">' + iso_date(x['timestamp']) + '</div>'
+    elif x.get('__typename', '') == 'LineBreakInline':
+        yield '<br/>'
+    elif x.get('__typename', '') == 'RuleBlock':
+        yield '<hr/>'
+
+    elif x.get('__typename', '') == 'Image':
+        yield ''.join(parse_image(x))
+    elif x.get('__typename', '') == 'ImageBlock':
+        yield ''.join(parse_image(x['media']))
+    elif x.get('__typename', '') == 'GridBlock':
+        yield ''.join(parse_img_grid(x))
+
+    elif x.get('__typename', '') == 'VideoBlock':
+        yield ''.join(parse_types(x['media']))
+    elif x.get('__typename', '') == 'Video':
+        yield ''.join(parse_vid(x))
+        
+    elif x.get('__typename', '') == 'InteractiveBlock':
+        yield ''.join(parse_types(x['media']))
+    elif x.get('__typename', '') == 'EmbeddedInteractive':
+        yield ''.join(parse_emb(x))
+
+    elif x.get('__typename', '') == 'ListBlock':
+        yield '<ul>' + ''.join(parse_cnt(x)) + '</ul>'
+    elif x.get('__typename', '') == 'ListItemBlock':
+        yield '<li>' + ''.join(parse_cnt(x)) + '</li>'
+
+    elif x.get('__typename', '') == 'CapsuleBlock':
+        if x['capsuleContent'].get('body'):
+            yield ''.join(parse_cnt(x['capsuleContent']['body']))
+    elif x.get('__typename', '') == 'Capsule':
+        yield ''.join(parse_cnt(x['body']))
+
+    elif x.get('__typename', '') in {
+        'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', 'SummaryBlock'
+    }:
+        yield ''.join(parse_cnt(x))
+
+    elif x.get('__typename'):
+        if ''.join(parse_cnt(x)).strip():
+            yield '<p><i>' + ''.join(parse_cnt(x)) + '</i></p>'
+
+def article_parse(data):
+    yield "<html><body>"
+    for d in data:
+        yield from parse_types(d)
+    yield "</body></html>"
 
 
 def json_to_html(raw):
@@ -105,18 +188,8 @@ def json_to_html(raw):
     except TypeError:
         data = data['initialState']
         return live_json_to_html(data)
-    article = next(iter(data.values()))
-    body = article['sprinkledBody']['content']
-    lines = []
-    for item in body:
-        tn = item['__typename']
-        if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'):
-            process_header(lines, item)
-        elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn):
-            process_paragraph(lines, item)
-        elif tn == 'ImageBlock':
-            process_image_block(lines, item)
-    return '<html><body>' + '\n'.join(lines) + '</body></html>'
+    content = data['article']['sprinkledBody']['content']
+    return '\n'.join(article_parse(content))
 
 
 def add_live_item(item, item_type, lines):

From b844da69d448e55351043c4a192e5fc179067812 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 22 Sep 2024 18:39:22 +0530
Subject: [PATCH 2/3] nytimes_sub: update icon and drop get_article_url override

---
 recipes/icons/nytimes_sub.png | Bin 301 -> 416 bytes
 recipes/nytimes_sub.recipe    |   5 -----
 2 files changed, 5 deletions(-)

diff --git a/recipes/icons/nytimes_sub.png b/recipes/icons/nytimes_sub.png
index 9ae9985ee4663dd2aa8177fbd2f0ff78a7cc9f07..2d170d68f4a5ce7fc46817242854f5fe5ab8d616 100644
GIT binary patch
literal 416
zcmeAS@N?(olHy`uVBq!ia0vp^3LwnE3?yBabR7dyHUT~%uI>dsLPA2qU?3tQ0%XI1
zgoFf;4HN`Q!Ud(Iq~Po;{Iezk%@HmM@(X78sMRBxc;&=Qz6~}$kCyLZt5&=_EqgZu
z10#c{i(`mI@6pLOC$%c@xZDiCdFKEB*DO0;3GC?)p3Lt3GI)ZYa`OM+e<zaSub9nT
zW8cK;`z0$uM1R+#^rn(^yIPm;c>2ReOyN+N_5#KrjW+vTf!qZR2fyu^@I7#eXR1u{
z@+mhK@R=<YTIu~T;<04>tTem4qWR8@hg(<<tesND_~C<CHG4e6_228{A7sqdyO5Aj
z$275dNrTKOmSY!W7`fJ+%w}K0>ubDll77S1Cht<_H@rOJ*<!H|H~wQf=M!5d&2gfb
zNmb{7A=9Kvw<UYdtLWP6PnoOy<ragw-4W5^lZh_-csH;8-WfdiE2Dkt{ZAtAuFb8#
k*e)sVRPbf`kJrB#4JQk)F!JQ(0R|O=r>mdKI;Vst02fl1tpET3

literal 301
zcmV+|0n+}7P)<h;3K|Lk000e1NJLTq000mG000mO00000I+&on0002=Nkl<Zc-jSk
zKPW_T0LQ=2d+(N~7#Ivvl7UIFTq)Zb4JHFOGEh#gENr?AH<*>htVm)|EJm4Z7RBqX
zlkVQV_wM)peIyJCZK9L_Vi1IYAK~*$o}&!1?#|5W?kmkIMJNiW%aTf#1PIe(VY9h6
zHY#^5qVLFYbliWQynnemD!Z|#7~-GG+`LW?bI1abnWCzvmD0i1O%#v`dWk(nS?fcy
zEpo(wJB|M>+xsV9X^|!(;KS9!V&$#zT_Z=(W_o0}@kwoI!^eoY;98bj-}WXRLr@oj
z{MMG_UA6$hpbwpkv-9_Yh>mE1F{Fl+AWQfMaOARK#zZT$00000NkvXXu0mjf-%y6A

diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 38f48ae632..60ec193031 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -339,11 +339,6 @@ class NewYorkTimes(BasicNewsRecipe):
         ]
         return br
 
-    def get_article_url(self, article):
-        url = BasicNewsRecipe.get_article_url(self, article)
-        if not '/video/' in url:
-            return url
-
     def preprocess_html(self, soup):
         w = self.recipe_specific_options.get('res')
         if w and isinstance(w, str):

From bfd6280c49acc158eeaf092aa6eccb2a14af6385 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 22 Sep 2024 18:52:49 +0530
Subject: [PATCH 3/3] nytimes_sub: fetch non-wayback pages via index_to_soup

---
 recipes/nytimes_sub.recipe | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 60ec193031..07a4c1bb57 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -112,7 +112,7 @@ class NewYorkTimes(BasicNewsRecipe):
         if use_wayback_machine and not skip_wayback:
             from calibre import browser
             return self.nyt_parser.download_url(url, browser())
-        return self.browser.open_novisit(url).read()
+        return self.index_to_soup(url)
 
     def preprocess_raw_html(self, raw_html, url):
         if '/interactive/' in url: