From 812cf96bc53bfc3edab76ffdd28fffe0b41a3f9a Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:05:02 +0530 Subject: [PATCH] ... --- recipes/nytfeeds.recipe | 32 ++++++++-------------- src/calibre/web/site_parsers/nytimes.py | 36 +++++++++---------------- 2 files changed, 24 insertions(+), 44 deletions(-) diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe index 0ac1ed5fcd..6214051f19 100644 --- a/recipes/nytfeeds.recipe +++ b/recipes/nytfeeds.recipe @@ -109,11 +109,12 @@ def parse_cnt(cnt): yield ''.join(parse_fmt_type(cnt)) else: for cnt_ in cnt[k]: - yield from parse_types(cnt_) + yield ''.join(parse_types(cnt_)) if isinstance(cnt[k], dict): - yield from parse_types(cnt[k]) - if cnt.get('text') and 'formats' not in cnt: - yield cnt['text'] + yield ''.join(parse_types(cnt[k])) + if cnt.get('text') and 'formats' not in cnt and 'content' not in cnt: + if isinstance(cnt['text'], str): + yield cnt['text'] def parse_types(x): typename = x.get('__typename', '') @@ -141,9 +142,6 @@ def parse_types(x): elif typename == 'RuleBlock': yield '
' - elif typename in {'ImageBlock', 'VideoBlock', 'InteractiveBlock'}: - yield "".join(parse_types(x['media'])) - elif typename == 'Image': yield "".join(parse_image(x)) @@ -161,23 +159,15 @@ def parse_types(x): elif typename == 'ListItemBlock': yield f'
  • {"".join(parse_cnt(x))}
  • ' - elif typename == 'CapsuleBlock': - if x['capsuleContent'].get('body'): - yield "".join(parse_cnt(x['capsuleContent']['body'])) - elif typename == 'Capsule': - yield "".join(parse_cnt(x['body'])) - - elif typename in { - 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', - 'SummaryBlock', 'VisualStackBlock' - }: + elif typename == 'TextInline': yield "".join(parse_cnt(x)) + elif typename in {'DetailBlock', 'TextRunKV'}: + yield f'

    {"".join(parse_cnt(x))}

    ' + elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}: - if x.get('media'): - yield "".join(parse_types(x['media'])) - elif "".join(parse_cnt(x)).strip(): - yield f'

    {"".join(parse_cnt(x))}

    ' + if "".join(parse_cnt(x)).strip(): + yield "".join(parse_cnt(x)) def article_parse(data): yield "" diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index b82f7a124b..eeefb3c51c 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr from calibre.utils.iso8601 import parse_iso8601 -module_version = 7 # needed for live updates +module_version = 8 # needed for live updates pprint @@ -111,11 +111,12 @@ def parse_cnt(cnt): yield ''.join(parse_fmt_type(cnt)) else: for cnt_ in cnt[k]: - yield from parse_types(cnt_) + yield ''.join(parse_types(cnt_)) if isinstance(cnt[k], dict): - yield from parse_types(cnt[k]) - if cnt.get('text') and 'formats' not in cnt: - yield cnt['text'] + yield ''.join(parse_types(cnt[k])) + if cnt.get('text') and 'formats' not in cnt and 'content' not in cnt: + if isinstance(cnt['text'], str): + yield cnt['text'] def parse_types(x): typename = x.get('__typename', '') @@ -143,9 +144,6 @@ def parse_types(x): elif typename == 'RuleBlock': yield '
    ' - elif typename in {'ImageBlock', 'VideoBlock', 'InteractiveBlock'}: - yield "".join(parse_types(x['media'])) - elif typename == 'Image': yield "".join(parse_image(x)) @@ -161,25 +159,17 @@ def parse_types(x): elif typename == 'ListBlock': yield f'' elif typename == 'ListItemBlock': - yield f'
  • {"".join(parse_cnt(x))}
  • ' + yield f'\n
  • {"".join(parse_cnt(x))}
  • ' - elif typename == 'CapsuleBlock': - if x['capsuleContent'].get('body'): - yield "".join(parse_cnt(x['capsuleContent']['body'])) - elif typename == 'Capsule': - yield "".join(parse_cnt(x['body'])) - - elif typename in { - 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', - 'SummaryBlock', 'VisualStackBlock' - }: + elif typename == 'TextInline': yield "".join(parse_cnt(x)) + elif typename in {'DetailBlock', 'TextRunKV'}: + yield f'

    {"".join(parse_cnt(x))}

    ' + elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}: - if x.get('media'): - yield "".join(parse_types(x['media'])) - elif "".join(parse_cnt(x)).strip(): - yield f'

    {"".join(parse_cnt(x))}

    ' + if "".join(parse_cnt(x)).strip(): + yield "".join(parse_cnt(x)) def article_parse(data): yield ""