diff --git a/recipes/icons/nytimes_sub.png b/recipes/icons/nytimes_sub.png
index 9ae9985ee4..2d170d68f4 100644
Binary files a/recipes/icons/nytimes_sub.png and b/recipes/icons/nytimes_sub.png differ
diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe
index 3fed8877c2..9a0d54215d 100644
--- a/recipes/nytfeeds.recipe
+++ b/recipes/nytfeeds.recipe
@@ -26,7 +26,7 @@ def parse_image(i):
def parse_img_grid(g):
for grd in g.get('gridMedia', {}):
- yield '
'.join(parse_image(grd))
+ yield ''.join(parse_image(grd))
if g.get('caption'):
yield '
'\ + + 'This is an interactive article, which is supposed to be read in a browser.'\ + + '
' html = self.nyt_parser.extract_html(self.index_to_soup(raw_html)) return html @@ -125,9 +135,25 @@ class NewYorkTimes(BasicNewsRecipe): 'date': { 'short': 'The date of the edition to download (YYYY/MM/DD format)', 'long': 'For example, 2024/07/16' + }, + 'res': { + 'short': 'For hi-res images, select a resolution from the following\noptions: popup, jumbo, mobileMasterAt3x, superJumbo', + 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use articleInline.', + }, + 'comp': { + 'short': 'Compress News Images?', + 'long': 'enter yes', + 'default': 'no' } } + def __init__(self, *args, **kwargs): + BasicNewsRecipe.__init__(self, *args, **kwargs) + c = self.recipe_specific_options.get('comp') + if c and isinstance(c, str): + if c.lower() == 'yes': + self.compress_news_images = True + def read_todays_paper(self): INDEX = 'https://www.nytimes.com/section/todayspaper' # INDEX = 'file:///t/raw.html' @@ -303,3 +329,25 @@ class NewYorkTimes(BasicNewsRecipe): if is_web_edition: return self.parse_web_sections() return self.parse_todays_page() + + def get_browser(self, *args, **kwargs): + kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' + br = BasicNewsRecipe.get_browser(self, *args, **kwargs) + br.addheaders += [ + ('Referer', 'https://www.google.com/'), + ('X-Forwarded-For', '66.249.66.1') + ] + return br + + def preprocess_html(self, soup): + w = self.recipe_specific_options.get('res') + if w and isinstance(w, str): + res = '-' + w + for img in soup.findAll('img', attrs={'src':True}): + if '-article' in img['src']: + ext = img['src'].split('?')[0].split('.')[-1] + img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext + for c in soup.findAll('div', attrs={'class':'cap'}): + for p in c.findAll(['p', 'div']): + p.name = 'span' + return soup diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index 919cabe61a..bb30659f69 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -13,88 +13,171 @@ module_version = 5 # needed for live updates pprint -def is_heading(tn): - return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block') +def parse_image(i): + if i.get('crops'): + yield '' + ''.join(parse_types(h['summary'])) + '
' + if h.get('ledeMedia'): + yield ''.join(parse_types(h['ledeMedia'])) + if h.get('byline'): + yield ''.join(parse_types(h['byline'])) + if h.get('timestampBlock'): + yield ''.join(parse_types(h['timestampBlock'])) +def parse_fmt_type(fm): + for f in fm.get('formats', {}): + if f.get('__typename', '') == 'BoldFormat': + yield '' + if f.get('__typename', '') == 'ItalicFormat': + yield '' + if f.get('__typename', '') == 'LinkFormat': + hrf = f['url'] + yield ''.format(hrf) + yield fm['text'] + for f in reversed(fm.get('formats', {})): + if f.get('__typename', '') == 'BoldFormat': + yield '' + if f.get('__typename', '') == 'ItalicFormat': + yield '' + if f.get('__typename', '') == 'LinkFormat': + yield '' -def process_header(lines, block): - label = block.get('label') - if label: - process_paragraph(lines, label) - headline = block.get('headline') - if headline: - process_paragraph(lines, headline) - summary = block.get('summary') - if summary: - process_paragraph(lines, summary) - lm = block.get('ledeMedia') - if lm and lm.get('__typename') == 'ImageBlock': - process_image_block(lines, lm) - byline = block.get('byline') - if byline: - process_paragraph(lines, byline, content_key='bylines') - timestamp = block.get('timestampBlock') - if timestamp: - process_timestamp(lines, timestamp) +def parse_cnt(cnt): + if cnt.get('formats'): + yield ''.join(parse_fmt_type(cnt)) + elif cnt.get('content'): + for cnt_ in cnt['content']: + yield from parse_types(cnt_) + elif cnt.get('text'): + yield cnt['text'] +def parse_types(x): + if 'Header' in x.get('__typename', ''): + yield '\n'.join(parse_header(x)) -def process_image_block(lines, block): - media = block['media'] - caption = media.get('caption') - caption_lines = [] - if caption: - process_inline_text(caption_lines, caption) - crops = media['crops'] - renditions = crops[0]['renditions'] - img = renditions[0]['url'] - if 'web.archive.org' in img: - img = img.partition('/')[-1] - img = img[img.find('https://'):] - lines.append(f'' + ''.join(parse_cnt(x)) + '
' + + elif x.get('__typename', '') == 'BylineBlock': + yield '' + ''.join(parse_cnt(x)) + '' + elif x.get('__typename', '') == 'TimestampBlock': + yield '
' + ''.join(parse_cnt(x)) + '
' + +def article_parse(data): + yield "" + for d in data: + yield from parse_types(d) + yield "" def json_to_html(raw): @@ -105,18 +188,8 @@ def json_to_html(raw): except TypeError: data = data['initialState'] return live_json_to_html(data) - article = next(iter(data.values())) - body = article['sprinkledBody']['content'] - lines = [] - for item in body: - tn = item['__typename'] - if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'): - process_header(lines, item) - elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn): - process_paragraph(lines, item) - elif tn == 'ImageBlock': - process_image_block(lines, item) - return '' + '\n'.join(lines) + '' + content = data['article']['sprinkledBody']['content'] + return '\n'.join(article_parse(content)) def add_live_item(item, item_type, lines):