From 2a15d7fa57af69091302ce4c80a82f5411d1d8d7 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:03:22 +0530 Subject: [PATCH 1/2] update science journal --- recipes/science_advances.recipe | 3 +-- recipes/science_journal.recipe | 3 +-- recipes/sciimmunol.recipe | 3 +-- recipes/scirobotics.recipe | 3 +-- recipes/scisignaling.recipe | 3 +-- recipes/scistm.recipe | 3 +-- 6 files changed, 6 insertions(+), 12 deletions(-) diff --git a/recipes/science_advances.recipe b/recipes/science_advances.recipe index 689a1132ae..cd2a6818f0 100644 --- a/recipes/science_advances.recipe +++ b/recipes/science_advances.recipe @@ -94,8 +94,7 @@ class scienceadv(BasicNewsRecipe): feeds = [] - div = soup.find('div', attrs={'class':'toc__body'}) - for sec in div.findAll('section', **classes('toc__section')): + for sec in soup.findAll('section', **classes('toc__section')): name = sec.find(**classes('sidebar-article-title--decorated')) section = self.tag_to_string(name).strip() self.log(section) diff --git a/recipes/science_journal.recipe b/recipes/science_journal.recipe index c3051af82c..0317146c76 100644 --- a/recipes/science_journal.recipe +++ b/recipes/science_journal.recipe @@ -92,8 +92,7 @@ class science(BasicNewsRecipe): feeds = [] - div = soup.find('div', attrs={'class':'toc__body'}) - for sec in div.findAll('section', **classes('toc__section')): + for sec in soup.findAll('section', **classes('toc__section')): name = sec.find(**classes('sidebar-article-title--decorated')) section = self.tag_to_string(name).strip() self.log(section) diff --git a/recipes/sciimmunol.recipe b/recipes/sciimmunol.recipe index 77b5802972..21d79fd2d8 100644 --- a/recipes/sciimmunol.recipe +++ b/recipes/sciimmunol.recipe @@ -94,8 +94,7 @@ class scienceadv(BasicNewsRecipe): feeds = [] - div = soup.find('div', attrs={'class':'toc__body'}) - for sec in div.findAll('section', **classes('toc__section')): + for sec in soup.findAll('section', **classes('toc__section')): name = sec.find(**classes('sidebar-article-title--decorated')) section = self.tag_to_string(name).strip() self.log(section) diff --git a/recipes/scirobotics.recipe b/recipes/scirobotics.recipe index 36e914d409..45a4df16da 100644 --- a/recipes/scirobotics.recipe +++ b/recipes/scirobotics.recipe @@ -94,8 +94,7 @@ class scienceadv(BasicNewsRecipe): feeds = [] - div = soup.find('div', attrs={'class':'toc__body'}) - for sec in div.findAll('section', **classes('toc__section')): + for sec in soup.findAll('section', **classes('toc__section')): name = sec.find(**classes('sidebar-article-title--decorated')) section = self.tag_to_string(name).strip() self.log(section) diff --git a/recipes/scisignaling.recipe b/recipes/scisignaling.recipe index ed8bcdf151..18769af889 100644 --- a/recipes/scisignaling.recipe +++ b/recipes/scisignaling.recipe @@ -93,8 +93,7 @@ class scienceadv(BasicNewsRecipe): feeds = [] - div = soup.find('div', attrs={'class':'toc__body'}) - for sec in div.findAll('section', **classes('toc__section')): + for sec in soup.findAll('section', **classes('toc__section')): name = sec.find(**classes('sidebar-article-title--decorated')) section = self.tag_to_string(name).strip() self.log(section) diff --git a/recipes/scistm.recipe b/recipes/scistm.recipe index 42b85c3719..101b60d7c4 100644 --- a/recipes/scistm.recipe +++ b/recipes/scistm.recipe @@ -94,8 +94,7 @@ class scienceadv(BasicNewsRecipe): feeds = [] - div = soup.find('div', attrs={'class':'toc__body'}) - for sec in div.findAll('section', **classes('toc__section')): + for sec in soup.findAll('section', **classes('toc__section')): name = sec.find(**classes('sidebar-article-title--decorated')) section = self.tag_to_string(name).strip() self.log(section) From 812cf96bc53bfc3edab76ffdd28fffe0b41a3f9a Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:05:02 +0530 Subject: [PATCH 2/2] ... --- recipes/nytfeeds.recipe | 32 ++++++++-------------- src/calibre/web/site_parsers/nytimes.py | 36 +++++++++---------------- 2 files changed, 24 insertions(+), 44 deletions(-) diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe index 0ac1ed5fcd..6214051f19 100644 --- a/recipes/nytfeeds.recipe +++ b/recipes/nytfeeds.recipe @@ -109,11 +109,12 @@ def parse_cnt(cnt): yield ''.join(parse_fmt_type(cnt)) else: for cnt_ in cnt[k]: - yield from parse_types(cnt_) + yield ''.join(parse_types(cnt_)) if isinstance(cnt[k], dict): - yield from parse_types(cnt[k]) - if cnt.get('text') and 'formats' not in cnt: - yield cnt['text'] + yield ''.join(parse_types(cnt[k])) + if cnt.get('text') and 'formats' not in cnt and 'content' not in cnt: + if isinstance(cnt['text'], str): + yield cnt['text'] def parse_types(x): typename = x.get('__typename', '') @@ -141,9 +142,6 @@ def parse_types(x): elif typename == 'RuleBlock': yield '
{"".join(parse_cnt(x))}
' + elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}: - if x.get('media'): - yield "".join(parse_types(x['media'])) - elif "".join(parse_cnt(x)).strip(): - yield f'{"".join(parse_cnt(x))}
' + if "".join(parse_cnt(x)).strip(): + yield "".join(parse_cnt(x)) def article_parse(data): yield "" diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index b82f7a124b..eeefb3c51c 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr from calibre.utils.iso8601 import parse_iso8601 -module_version = 7 # needed for live updates +module_version = 8 # needed for live updates pprint @@ -111,11 +111,12 @@ def parse_cnt(cnt): yield ''.join(parse_fmt_type(cnt)) else: for cnt_ in cnt[k]: - yield from parse_types(cnt_) + yield ''.join(parse_types(cnt_)) if isinstance(cnt[k], dict): - yield from parse_types(cnt[k]) - if cnt.get('text') and 'formats' not in cnt: - yield cnt['text'] + yield ''.join(parse_types(cnt[k])) + if cnt.get('text') and 'formats' not in cnt and 'content' not in cnt: + if isinstance(cnt['text'], str): + yield cnt['text'] def parse_types(x): typename = x.get('__typename', '') @@ -143,9 +144,6 @@ def parse_types(x): elif typename == 'RuleBlock': yield '{"".join(parse_cnt(x))}
' + elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}: - if x.get('media'): - yield "".join(parse_types(x['media'])) - elif "".join(parse_cnt(x)).strip(): - yield f'{"".join(parse_cnt(x))}
' + if "".join(parse_cnt(x)).strip(): + yield "".join(parse_cnt(x)) def article_parse(data): yield ""