From 7980be7dd509e81f2e832c527f66cca10512cc1b Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 22 Sep 2024 18:26:05 +0530 Subject: [PATCH 1/3] Update nytimes.py --- recipes/nytfeeds.recipe | 4 +- recipes/nytimes_sub.recipe | 63 ++++++- src/calibre/web/site_parsers/nytimes.py | 241 +++++++++++++++--------- 3 files changed, 217 insertions(+), 91 deletions(-) diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe index 3fed8877c2..9a0d54215d 100644 --- a/recipes/nytfeeds.recipe +++ b/recipes/nytfeeds.recipe @@ -26,7 +26,7 @@ def parse_image(i): def parse_img_grid(g): for grd in g.get('gridMedia', {}): - yield '
'.join(parse_image(grd)) + yield ''.join(parse_image(grd)) if g.get('caption'): yield '
{}'.format(g['caption']) if g.get('credit'): @@ -301,5 +301,5 @@ class nytFeeds(BasicNewsRecipe): def get_article_url(self, article): url = BasicNewsRecipe.get_article_url(self, article) # you can remove '|/espanol/' from code below to include spanish articles. - if not re.search(r'/video/|live|/athletic/|/espanol/', url): + if not re.search(r'/video/|/live/|/athletic/|/espanol/', url): return url diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 1486abe7a2..38f48ae632 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -16,7 +16,7 @@ from calibre.web.feeds.news import BasicNewsRecipe is_web_edition = False oldest_web_edition_article = 7 # days -use_wayback_machine = True +use_wayback_machine = False # The sections to download when downloading the web edition, comment out @@ -89,10 +89,16 @@ class NewYorkTimes(BasicNewsRecipe): language = 'en_US' ignore_duplicate_articles = {'title', 'url'} no_stylesheets = True - compress_news_images = True - compress_news_images_auto_size = 5 - conversion_options = {'flow_size': 0} - delay = 0 if use_wayback_machine else 1 + + extra_css = ''' + .byl, .time { font-size:small; color:#202020; } + .cap { font-size:small; text-align:center; } + .cred { font-style:italic; font-size:small; } + em, blockquote { color: #202020; } + .sc { font-variant: small-caps; } + .lbl { font-size:small; color:#404040; } + img { display:block; margin:0 auto; } + ''' @property def nyt_parser(self): @@ -109,6 +115,10 @@ class NewYorkTimes(BasicNewsRecipe): return self.browser.open_novisit(url).read() def preprocess_raw_html(self, raw_html, url): + if '/interactive/' in url: + return '

'\ + + 'This is an interactive article, which is supposed to be read in a browser.'\ + + '

' html = self.nyt_parser.extract_html(self.index_to_soup(raw_html)) return html @@ -125,9 +135,25 @@ class NewYorkTimes(BasicNewsRecipe): 'date': { 'short': 'The date of the edition to download (YYYY/MM/DD format)', 'long': 'For example, 2024/07/16' + }, + 'res': { + 'short': 'For hi-res images, select a resolution from the following\noptions: popup, jumbo, mobileMasterAt3x, superJumbo', + 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use articleInline.', + }, + 'comp': { + 'short': 'Compress News Images?', + 'long': 'enter yes', + 'default': 'no' } } + def __init__(self, *args, **kwargs): + BasicNewsRecipe.__init__(self, *args, **kwargs) + c = self.recipe_specific_options.get('comp') + if c and isinstance(c, str): + if c.lower() == 'yes': + self.compress_news_images = True + def read_todays_paper(self): INDEX = 'https://www.nytimes.com/section/todayspaper' # INDEX = 'file:///t/raw.html' @@ -303,3 +329,30 @@ class NewYorkTimes(BasicNewsRecipe): if is_web_edition: return self.parse_web_sections() return self.parse_todays_page() + + def get_browser(self, *args, **kwargs): + kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' + br = BasicNewsRecipe.get_browser(self, *args, **kwargs) + br.addheaders += [ + ('Referer', 'https://www.google.com/'), + ('X-Forwarded-For', '66.249.66.1') + ] + return br + + def get_article_url(self, article): + url = BasicNewsRecipe.get_article_url(self, article) + if not '/video/' in url: + return url + + def preprocess_html(self, soup): + w = self.recipe_specific_options.get('res') + if w and isinstance(w, str): + res = '-' + w + for img in soup.findAll('img', attrs={'src':True}): + if '-article' in img['src']: + ext = img['src'].split('?')[0].split('.')[-1] + img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext + for c in soup.findAll('div', attrs={'class':'cap'}): + for p in c.findAll(['p', 'div']): + p.name = 'span' + return soup diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index 919cabe61a..bb30659f69 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -13,88 +13,171 @@ module_version = 5 # needed for live updates pprint -def is_heading(tn): - return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block') +def parse_image(i): + if i.get('crops'): + yield '
'.format(i['crops'][0]['renditions'][0]['url']) + elif i.get('spanImageCrops'): + yield '
'.format(i['spanImageCrops'][0]['renditions'][0]['url']) + if i.get('caption'): + yield '
' + ''.join(parse_types(i['caption'])) + if i.get('credit'): + yield ' ' + i['credit'] + '' + yield '
' + yield '
' +def parse_img_grid(g): + for grd in g.get('gridMedia', {}): + yield ''.join(parse_image(grd)) + if g.get('caption'): + yield '
{}'.format(g['caption']) + if g.get('credit'): + yield ' ' + g['credit'] + '' + yield '
' -def process_inline_text(lines, block): - text = '' - if 'text@stripHtml' in block: - text = escape(block['text@stripHtml']) - elif 'renderedRepresentation' in block: # happens in byline blocks - text = block['renderedRepresentation'] - elif 'text' in block: - text = block['text'] - if text: - for fmt in block.get('formats', ()): - tn = fmt['__typename'] - if tn == 'LinkFormat': - ab = fmt - text = '{}'.format(ab['url'], ab.get('title') or '', text) - elif tn == 'BoldFormat': - text = '' + text + '' - lines.append(text) +def parse_vid(v): + if v.get('promotionalMedia'): + if v.get('headline'): + if v.get('url'): + yield '
Video: '.format(v['url'])\ + + v['headline'].get('default', '') + '
' + elif v['headline'].get('default'): + yield '
' + v['headline']['default'] + '
' + yield ''.join(parse_types(v['promotionalMedia'])) + if v.get('promotionalSummary'): + yield '
' + v['promotionalSummary'] + '
' +def parse_emb(e): + if e.get('html') and 'datawrapper.dwcdn.net' in e.get('html', ''): + dw = re.search(r'datawrapper.dwcdn.net/(.{5})', e['html']).group(1) + yield '
'.format('https://datawrapper.dwcdn.net/' + dw + '/full.png') + '
' + elif e.get('promotionalMedia'): + if e.get('headline'): + yield '
' + e['headline']['default'] + '
' + yield ''.join(parse_types(e['promotionalMedia'])) + if e.get('note'): + yield '
' + e['note'] + '
' -def process_paragraph(lines, block, content_key='content'): - tn = block['__typename'] - m = re.match(r'Heading([1-6])Block', tn) - if m is not None: - tag = 'h' + m.group(1) - else: - tag = 'p' - ta = block.get('textAlign') or 'LEFT' - style = f'text-align: {ta.lower()}' - lines.append(f'<{tag} style="{style}">') - for item in block[content_key]: - tn = item['__typename'] - if tn in ('TextInline', 'Byline'): - process_inline_text(lines, item) - lines.append('') +def parse_byline(byl): + for b in byl.get('bylines', {}): + yield '
' + b['renderedRepresentation'] + '
' + yield '
' + for rl in byl.get('role', {}): + if ''.join(parse_cnt(rl)).strip(): + yield ''.join(parse_cnt(rl)) + yield '
' +def iso_date(x): + dt = parse_iso8601(x, as_utc=False) + return dt.strftime('%b %d, %Y at %I:%M %p') -def process_timestamp(lines, block): - ts = block['timestamp'] - dt = parse_iso8601(ts, as_utc=False) - lines.append('

' + escape(dt.strftime('%b %d, %Y')) + '

') +def parse_header(h): + if h.get('label'): + yield '
' + ''.join(parse_types(h['label'])) + '
' + if h.get('headline'): + yield ''.join(parse_types(h['headline'])) + if h.get('summary'): + yield '

' + ''.join(parse_types(h['summary'])) + '

' + if h.get('ledeMedia'): + yield ''.join(parse_types(h['ledeMedia'])) + if h.get('byline'): + yield ''.join(parse_types(h['byline'])) + if h.get('timestampBlock'): + yield ''.join(parse_types(h['timestampBlock'])) +def parse_fmt_type(fm): + for f in fm.get('formats', {}): + if f.get('__typename', '') == 'BoldFormat': + yield '' + if f.get('__typename', '') == 'ItalicFormat': + yield '' + if f.get('__typename', '') == 'LinkFormat': + hrf = f['url'] + yield ''.format(hrf) + yield fm['text'] + for f in reversed(fm.get('formats', {})): + if f.get('__typename', '') == 'BoldFormat': + yield '' + if f.get('__typename', '') == 'ItalicFormat': + yield '' + if f.get('__typename', '') == 'LinkFormat': + yield '' -def process_header(lines, block): - label = block.get('label') - if label: - process_paragraph(lines, label) - headline = block.get('headline') - if headline: - process_paragraph(lines, headline) - summary = block.get('summary') - if summary: - process_paragraph(lines, summary) - lm = block.get('ledeMedia') - if lm and lm.get('__typename') == 'ImageBlock': - process_image_block(lines, lm) - byline = block.get('byline') - if byline: - process_paragraph(lines, byline, content_key='bylines') - timestamp = block.get('timestampBlock') - if timestamp: - process_timestamp(lines, timestamp) +def parse_cnt(cnt): + if cnt.get('formats'): + yield ''.join(parse_fmt_type(cnt)) + elif cnt.get('content'): + for cnt_ in cnt['content']: + yield from parse_types(cnt_) + elif cnt.get('text'): + yield cnt['text'] +def parse_types(x): + if 'Header' in x.get('__typename', ''): + yield '\n'.join(parse_header(x)) -def process_image_block(lines, block): - media = block['media'] - caption = media.get('caption') - caption_lines = [] - if caption: - process_inline_text(caption_lines, caption) - crops = media['crops'] - renditions = crops[0]['renditions'] - img = renditions[0]['url'] - if 'web.archive.org' in img: - img = img.partition('/')[-1] - img = img[img.find('https://'):] - lines.append(f'
') - lines.extend(caption_lines) - lines.append('
') + elif x.get('__typename', '') == 'Heading1Block': + yield '

' + ''.join(parse_cnt(x)) + '

' + elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block', 'Heading4Block'}: + yield '

' + ''.join(parse_cnt(x)) + '

' + + elif x.get('__typename', '') == 'ParagraphBlock': + yield '

' + ''.join(parse_cnt(x)) + '

' + + elif x.get('__typename', '') == 'BylineBlock': + yield '

' + ''.join(parse_byline(x)) + '
' + elif x.get('__typename', '') == 'LabelBlock': + yield '
' + ''.join(parse_cnt(x)) + '
' + elif x.get('__typename', '') == 'BlockquoteBlock': + yield '
' + ''.join(parse_cnt(x)) + '
' + elif x.get('__typename', '') == 'TimestampBlock': + yield '
' + iso_date(x['timestamp']) + '
' + elif x.get('__typename', '') == 'LineBreakInline': + yield '
' + elif x.get('__typename', '') == 'RuleBlock': + yield '
' + + elif x.get('__typename', '') == 'Image': + yield ''.join(parse_image(x)) + elif x.get('__typename', '') == 'ImageBlock': + yield ''.join(parse_image(x['media'])) + elif x.get('__typename', '') == 'GridBlock': + yield ''.join(parse_img_grid(x)) + + elif x.get('__typename', '') == 'VideoBlock': + yield ''.join(parse_types(x['media'])) + elif x.get('__typename', '') == 'Video': + yield ''.join(parse_vid(x)) + + elif x.get('__typename', '') == 'InteractiveBlock': + yield ''.join(parse_types(x['media'])) + elif x.get('__typename', '') == 'EmbeddedInteractive': + yield ''.join(parse_emb(x)) + + elif x.get('__typename', '') == 'ListBlock': + yield '' + elif x.get('__typename', '') == 'ListItemBlock': + yield '
  • ' + ''.join(parse_cnt(x)) + '
  • ' + + elif x.get('__typename', '') == 'CapsuleBlock': + if x['capsuleContent'].get('body'): + yield ''.join(parse_cnt(x['capsuleContent']['body'])) + elif x.get('__typename', '') == 'Capsule': + yield ''.join(parse_cnt(x['body'])) + + elif x.get('__typename', '') in { + 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', 'SummaryBlock' + }: + yield ''.join(parse_cnt(x)) + + elif x.get('__typename'): + if ''.join(parse_cnt(x)).strip(): + yield '

    ' + ''.join(parse_cnt(x)) + '

    ' + +def article_parse(data): + yield "" + for d in data: + yield from parse_types(d) + yield "" def json_to_html(raw): @@ -105,18 +188,8 @@ def json_to_html(raw): except TypeError: data = data['initialState'] return live_json_to_html(data) - article = next(iter(data.values())) - body = article['sprinkledBody']['content'] - lines = [] - for item in body: - tn = item['__typename'] - if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'): - process_header(lines, item) - elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn): - process_paragraph(lines, item) - elif tn == 'ImageBlock': - process_image_block(lines, item) - return '' + '\n'.join(lines) + '' + content = data['article']['sprinkledBody']['content'] + return '\n'.join(article_parse(content)) def add_live_item(item, item_type, lines): From b844da69d448e55351043c4a192e5fc179067812 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 22 Sep 2024 18:39:22 +0530 Subject: [PATCH 2/3] ... --- recipes/icons/nytimes_sub.png | Bin 301 -> 416 bytes recipes/nytimes_sub.recipe | 5 ----- 2 files changed, 5 deletions(-) diff --git a/recipes/icons/nytimes_sub.png b/recipes/icons/nytimes_sub.png index 9ae9985ee4663dd2aa8177fbd2f0ff78a7cc9f07..2d170d68f4a5ce7fc46817242854f5fe5ab8d616 100644 GIT binary patch literal 416 zcmeAS@N?(olHy`uVBq!ia0vp^3LwnE3?yBabR7dyHUT~%uI>dsLPA2qU?3tQ0%XI1 zgoFf;4HN`Q!Ud(Iq~Po;{Iezk%@HmM@(X78sMRBxc;&=Qz6~}$kCyLZt5&=_EqgZu z10#c{i(`mI@6pLOC$%c@xZDiCdFKEB*DO0;3GC?)p3Lt3GI)ZYa`OM+e2ReOyN+N_5#KrjW+vTf!qZR2fyu^@I7#eXR1u{ z@+mhK@R=tTem4qWR8@hg(<ubDll77S1Cht<_H@rOJ*mdKI;Vst02fl1tpET3 literal 301 zcmV+|0n+}7P)htVm)|EJm4Z7RBqX zlkVQV_wM)peIyJCZK9L_Vi1IYAK~*$o}&!1?#|5W?kmkIMJNiW%aTf#1PIe(VY9h6 zHY#^5qVLFYbliWQynnemD!Z|#7~-GG+`LW?bI1abnWCzvmD0i1O%#v`dWk(nS?fcy zEpo(wJB|M>+xsV9X^|!(;KS9!V&$#zT_Z=(W_o0}@kwoI!^eoY;98bj-}WXRLr@oj z{MMG_UA6$hpbwpkv-9_Yh>mE1F{Fl+AWQfMaOARK#zZT$00000NkvXXu0mjf-%y6A diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 38f48ae632..60ec193031 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -339,11 +339,6 @@ class NewYorkTimes(BasicNewsRecipe): ] return br - def get_article_url(self, article): - url = BasicNewsRecipe.get_article_url(self, article) - if not '/video/' in url: - return url - def preprocess_html(self, soup): w = self.recipe_specific_options.get('res') if w and isinstance(w, str): From bfd6280c49acc158eeaf092aa6eccb2a14af6385 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 22 Sep 2024 18:52:49 +0530 Subject: [PATCH 3/3] ... --- recipes/nytimes_sub.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 60ec193031..07a4c1bb57 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -112,7 +112,7 @@ class NewYorkTimes(BasicNewsRecipe): if use_wayback_machine and not skip_wayback: from calibre import browser return self.nyt_parser.download_url(url, browser()) - return self.browser.open_novisit(url).read() + return self.index_to_soup(url) def preprocess_raw_html(self, raw_html, url): if '/interactive/' in url: