From 3ebc50d03a24bdb2b15ea6f2462433b453024e31 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 16 Mar 2025 11:39:07 +0530 Subject: [PATCH 1/6] Update economist --- recipes/economist.recipe | 25 +++++++++++++++++++++---- recipes/economist_free.recipe | 25 +++++++++++++++++++++---- recipes/hindufeeds.recipe | 5 ++--- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/recipes/economist.recipe b/recipes/economist.recipe index 0648e9e228..a22fe21969 100644 --- a/recipes/economist.recipe +++ b/recipes/economist.recipe @@ -85,6 +85,14 @@ def load_article_from_json(raw, root): for node in data.get('text') or (): process_node(node, article) +def process_web_list(li_node): + li_html = '' + for li in li_node['items']: + if li.get('textHtml'): + li_html += f'
  • {li.get("textHtml")}
  • ' + else: + li_html += f'
  • {li.get("text", "")}
  • ' + return li_html def process_web_node(node): ntype = node.get('type', '') @@ -92,7 +100,7 @@ def process_web_node(node): if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' - elif ntype == 'PARAGRAPH': + elif ntype in ['PARAGRAPH', 'BOOK_INFO']: if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' @@ -109,9 +117,15 @@ def process_web_node(node): return f'
    {node.get("text", "")}
    ' elif ntype == 'DIVIDER': return '
    ' + elif ntype == 'INFOGRAPHIC': + if node.get('fallback'): + return process_web_node(node['fallback']) elif ntype == 'INFOBOX': for x in safe_dict(node, 'components'): return f'
    {process_web_node(x)}
    ' + elif ntype == 'UNORDERED_LIST': + if node.get('items'): + return process_web_list(node) elif ntype: print('** ', ntype) return '' @@ -120,7 +134,10 @@ def process_web_node(node): def load_article_from_web_json(raw): # open('/t/raw.json', 'w').write(raw) body = '' - data = json.loads(raw)['props']['pageProps']['content'] + try: + data = json.loads(raw)['props']['pageProps']['cp2Content'] + except Exception: + data = json.loads(raw)['props']['pageProps']['content'] body += f'
    {data.get("flyTitle", "")}
    ' body += f'

    {data["headline"]}

    ' if data.get('rubric') and data.get('rubric') is not None: @@ -182,7 +199,7 @@ def process_url(url): class Economist(BasicNewsRecipe): title = 'The Economist' - language = 'en' + language = 'en_GB' encoding = 'utf-8' masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png' @@ -274,7 +291,7 @@ class Economist(BasicNewsRecipe): def economist_test_article(self): return [('Articles', [{'title':'test', - 'url':'https://www.economist.com/the-americas/2024/04/14/elon-musk-is-feuding-with-brazils-powerful-supreme-court' + 'url':'https://www.economist.com/leaders/2025/03/13/americas-bullied-allies-need-to-toughen-up' }])] def economist_return_index(self, ans): diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe index 0648e9e228..a22fe21969 100644 --- a/recipes/economist_free.recipe +++ b/recipes/economist_free.recipe @@ -85,6 +85,14 @@ def load_article_from_json(raw, root): for node in data.get('text') or (): process_node(node, article) +def process_web_list(li_node): + li_html = '' + for li in li_node['items']: + if li.get('textHtml'): + li_html += f'
  • {li.get("textHtml")}
  • ' + else: + li_html += f'
  • {li.get("text", "")}
  • ' + return li_html def process_web_node(node): ntype = node.get('type', '') @@ -92,7 +100,7 @@ def process_web_node(node): if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' - elif ntype == 'PARAGRAPH': + elif ntype in ['PARAGRAPH', 'BOOK_INFO']: if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' @@ -109,9 +117,15 @@ def process_web_node(node): return f'
    {node.get("text", "")}
    ' elif ntype == 'DIVIDER': return '
    ' + elif ntype == 'INFOGRAPHIC': + if node.get('fallback'): + return process_web_node(node['fallback']) elif ntype == 'INFOBOX': for x in safe_dict(node, 'components'): return f'
    {process_web_node(x)}
    ' + elif ntype == 'UNORDERED_LIST': + if node.get('items'): + return process_web_list(node) elif ntype: print('** ', ntype) return '' @@ -120,7 +134,10 @@ def process_web_node(node): def load_article_from_web_json(raw): # open('/t/raw.json', 'w').write(raw) body = '' - data = json.loads(raw)['props']['pageProps']['content'] + try: + data = json.loads(raw)['props']['pageProps']['cp2Content'] + except Exception: + data = json.loads(raw)['props']['pageProps']['content'] body += f'
    {data.get("flyTitle", "")}
    ' body += f'

    {data["headline"]}

    ' if data.get('rubric') and data.get('rubric') is not None: @@ -182,7 +199,7 @@ def process_url(url): class Economist(BasicNewsRecipe): title = 'The Economist' - language = 'en' + language = 'en_GB' encoding = 'utf-8' masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png' @@ -274,7 +291,7 @@ class Economist(BasicNewsRecipe): def economist_test_article(self): return [('Articles', [{'title':'test', - 'url':'https://www.economist.com/the-americas/2024/04/14/elon-musk-is-feuding-with-brazils-powerful-supreme-court' + 'url':'https://www.economist.com/leaders/2025/03/13/americas-bullied-allies-need-to-toughen-up' }])] def economist_return_index(self, ans): diff --git a/recipes/hindufeeds.recipe b/recipes/hindufeeds.recipe index 73148c0ea9..1fc7e76cb0 100644 --- a/recipes/hindufeeds.recipe +++ b/recipes/hindufeeds.recipe @@ -21,7 +21,6 @@ class TheHindufeeds(BasicNewsRecipe): .author, .dateLine, .publish-time {font-size:small; font-weight:bold;} .subhead, .subhead_lead, .bold {font-weight:bold;} .update-publish-time, .publish-time-new {font-size:small; } - img {display:block; margin:0 auto;} .italic {font-style:italic; color:#202020;} ''' @@ -55,7 +54,7 @@ class TheHindufeeds(BasicNewsRecipe): def preprocess_html(self, soup): for cap in soup.findAll('p', attrs={'class': 'caption'}): - cap.name = 'figcaption' + cap.name = 'div' for img in soup.findAll('img', attrs={'data-original': True}): if img['data-original'].endswith('1x1_spacer.png'): source = img.findPrevious('source', srcset=True) @@ -91,7 +90,7 @@ class TheHindufeeds(BasicNewsRecipe): ('Business', 'https://www.thehindu.com/business/feeder/default.rss'), ('World', 'https://www.thehindu.com/news/international/feeder/default.rss'), # ('Sport', 'https://www.thehindu.com/sport/feeder/default.rss'), - ('Entertainment', 'https://www.thehindu.com/entertainment/feeder/default.rss'), + # ('Entertainment', 'https://www.thehindu.com/entertainment/feeder/default.rss'), # ('Crossword', 'https://crossword.thehindu.com/?utm_source=thehindu&utm_medium=mainmenufeeder/default.rss'), ('Science', 'https://www.thehindu.com/sci-tech/science/feeder/default.rss'), ('Life and Style', 'https://www.thehindu.com/life-and-style/feeder/default.rss'), From 26bb850d62c3a091ef95ad9d75366c171871473c Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 16 Mar 2025 11:40:07 +0530 Subject: [PATCH 2/6] ... --- recipes/1843.recipe | 20 ++++++++++++++++++-- recipes/economist_world_ahead.recipe | 25 ++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/recipes/1843.recipe b/recipes/1843.recipe index 75c7017f09..4b7901f5cd 100644 --- a/recipes/1843.recipe +++ b/recipes/1843.recipe @@ -11,13 +11,23 @@ from calibre.ebooks.BeautifulSoup import NavigableString, Tag from calibre.web.feeds.news import BasicNewsRecipe +def process_list(li_node): + li_html = '' + for li in li_node['items']: + if li.get('textHtml'): + li_html += f'
  • {li.get("textHtml")}
  • ' + else: + li_html += f'
  • {li.get("text", "")}
  • ' + return li_html + + def process_node(node): ntype = node.get('type', '') if ntype == 'CROSSHEAD': if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' - elif ntype == 'PARAGRAPH': + elif ntype in ['PARAGRAPH', 'BOOK_INFO']: if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' @@ -34,9 +44,15 @@ def process_node(node): return f'
    {node.get("text", "")}
    ' elif ntype == 'DIVIDER': return '
    ' + elif ntype == 'INFOGRAPHIC': + if node.get('fallback'): + return process_node(node['fallback']) elif ntype == 'INFOBOX': for x in safe_dict(node, 'components'): return f'
    {process_node(x)}
    ' + elif ntype == 'UNORDERED_LIST': + if node.get('items'): + return process_list(node) elif ntype: print('** ', ntype) return '' @@ -121,7 +137,7 @@ def process_url(url): class Econ1843(BasicNewsRecipe): title = 'Economist 1843' - language = 'en' + language = 'en_GB' encoding = 'utf-8' masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png' diff --git a/recipes/economist_world_ahead.recipe b/recipes/economist_world_ahead.recipe index 6d7e2336ee..3c9c39fb6e 100644 --- a/recipes/economist_world_ahead.recipe +++ b/recipes/economist_world_ahead.recipe @@ -12,13 +12,23 @@ from calibre.ebooks.BeautifulSoup import NavigableString, Tag from calibre.web.feeds.news import BasicNewsRecipe +def process_list(li_node): + li_html = '' + for li in li_node['items']: + if li.get('textHtml'): + li_html += f'
  • {li.get("textHtml")}
  • ' + else: + li_html += f'
  • {li.get("text", "")}
  • ' + return li_html + + def process_node(node): ntype = node.get('type', '') if ntype == 'CROSSHEAD': if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' - elif ntype == 'PARAGRAPH': + elif ntype in ['PARAGRAPH', 'BOOK_INFO']: if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' @@ -35,9 +45,15 @@ def process_node(node): return f'
    {node.get("text", "")}
    ' elif ntype == 'DIVIDER': return '
    ' + elif ntype == 'INFOGRAPHIC': + if node.get('fallback'): + return process_node(node['fallback']) elif ntype == 'INFOBOX': for x in safe_dict(node, 'components'): return f'
    {process_node(x)}
    ' + elif ntype == 'UNORDERED_LIST': + if node.get('items'): + return process_list(node) elif ntype: print('** ', ntype) return '' @@ -57,7 +73,10 @@ class JSONHasNoContent(ValueError): def load_article_from_json(raw): # open('/t/raw.json', 'w').write(raw) body = '' - data = json.loads(raw)['props']['pageProps']['cp2Content'] + try: + data = json.loads(raw)['props']['pageProps']['cp2Content'] + except Exception: + data = json.loads(raw)['props']['pageProps']['content'] body += f'
    {data.get("flyTitle", "")}
    ' body += f'

    {data["headline"]}

    ' body += f'
    {data.get("rubric", "")}
    ' @@ -118,7 +137,7 @@ def process_url(url): class EconomistWorld(BasicNewsRecipe): title = 'The Economist World Ahead' - language = 'en' + language = 'en_GB' encoding = 'utf-8' masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png' From cd56b74115adee613fc616390297d7ecb2330f37 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 16 Mar 2025 11:40:47 +0530 Subject: [PATCH 3/6] ... --- recipes/economist_news.recipe | 2 +- recipes/economist_search.recipe | 25 ++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/recipes/economist_news.recipe b/recipes/economist_news.recipe index e08f7ba91d..f2b50f57fa 100644 --- a/recipes/economist_news.recipe +++ b/recipes/economist_news.recipe @@ -121,7 +121,7 @@ def process_url(url): class EconomistNews(BasicNewsRecipe): title = 'The Economist News' - language = 'en' + language = 'en_GB' encoding = 'utf-8' masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png' diff --git a/recipes/economist_search.recipe b/recipes/economist_search.recipe index 5ac61ad57f..0a3725bfcc 100644 --- a/recipes/economist_search.recipe +++ b/recipes/economist_search.recipe @@ -12,13 +12,23 @@ from calibre.ebooks.BeautifulSoup import NavigableString, Tag from calibre.web.feeds.news import BasicNewsRecipe +def process_list(li_node): + li_html = '' + for li in li_node['items']: + if li.get('textHtml'): + li_html += f'
  • {li.get("textHtml")}
  • ' + else: + li_html += f'
  • {li.get("text", "")}
  • ' + return li_html + + def process_node(node): ntype = node.get('type', '') if ntype == 'CROSSHEAD': if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' - elif ntype == 'PARAGRAPH': + elif ntype in ['PARAGRAPH', 'BOOK_INFO']: if node.get('textHtml'): return f'

    {node.get("textHtml")}

    ' return f'

    {node.get("text", "")}

    ' @@ -35,9 +45,15 @@ def process_node(node): return f'
    {node.get("text", "")}
    ' elif ntype == 'DIVIDER': return '
    ' + elif ntype == 'INFOGRAPHIC': + if node.get('fallback'): + return process_node(node['fallback']) elif ntype == 'INFOBOX': for x in safe_dict(node, 'components'): return f'
    {process_node(x)}
    ' + elif ntype == 'UNORDERED_LIST': + if node.get('items'): + return process_list(node) elif ntype: print('** ', ntype) return '' @@ -57,7 +73,10 @@ class JSONHasNoContent(ValueError): def load_article_from_json(raw): # open('/t/raw.json', 'w').write(raw) body = '' - data = json.loads(raw)['props']['pageProps']['cp2Content'] + try: + data = json.loads(raw)['props']['pageProps']['cp2Content'] + except Exception: + data = json.loads(raw)['props']['pageProps']['content'] body += f'
    {data.get("flyTitle", "")}
    ' body += f'

    {data["headline"]}

    ' body += f'
    {data.get("rubric", "")}
    ' @@ -114,7 +133,7 @@ def process_url(url): class econ_search(BasicNewsRecipe): title = 'The Economist - Search' - language = 'en' + language = 'en_GB' encoding = 'utf-8' __author__ = 'unkn0wn' description = ( From a29f384098dcbf06d16fe32c332ec245c985321d Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 16 Mar 2025 11:41:16 +0530 Subject: [PATCH 4/6] Update spectator_magazine.recipe en_GB --- recipes/spectator_magazine.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/spectator_magazine.recipe b/recipes/spectator_magazine.recipe index 84b29306b8..54a131a8b6 100644 --- a/recipes/spectator_magazine.recipe +++ b/recipes/spectator_magazine.recipe @@ -13,7 +13,7 @@ class spectator(BasicNewsRecipe): title = 'Spectator Magazine' __author__ = 'unkn0wn' description = 'The Spectator was established in 1828, and is the best-written and most influential weekly in the English language.' - language = 'en' + language = 'en_GB' no_stylesheets = True remove_attributes = ['height', 'width', 'style'] ignore_duplicate_articles = {'url'} From f6e75918727bf1e11985abde81b381482cddf0f4 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 16 Mar 2025 12:08:42 +0530 Subject: [PATCH 5/6] ... --- recipes/1843.recipe | 10 ++++++++-- recipes/economist.recipe | 12 ++++++++++-- recipes/economist_free.recipe | 12 ++++++++++-- recipes/economist_search.recipe | 9 +++++++-- recipes/economist_world_ahead.recipe | 10 ++++++++-- 5 files changed, 43 insertions(+), 10 deletions(-) diff --git a/recipes/1843.recipe b/recipes/1843.recipe index 4b7901f5cd..931f4da6d9 100644 --- a/recipes/1843.recipe +++ b/recipes/1843.recipe @@ -21,6 +21,13 @@ def process_list(li_node): return li_html +def process_info_box(bx): + info = '' + for x in safe_dict(bx, 'components'): + info += f'
    {process_web_node(x)}
    ' + return info + + def process_node(node): ntype = node.get('type', '') if ntype == 'CROSSHEAD': @@ -48,8 +55,7 @@ def process_node(node): if node.get('fallback'): return process_node(node['fallback']) elif ntype == 'INFOBOX': - for x in safe_dict(node, 'components'): - return f'
    {process_node(x)}
    ' + return process_info_box(node) elif ntype == 'UNORDERED_LIST': if node.get('items'): return process_list(node) diff --git a/recipes/economist.recipe b/recipes/economist.recipe index a22fe21969..6cbed35d6d 100644 --- a/recipes/economist.recipe +++ b/recipes/economist.recipe @@ -85,6 +85,7 @@ def load_article_from_json(raw, root): for node in data.get('text') or (): process_node(node, article) + def process_web_list(li_node): li_html = '' for li in li_node['items']: @@ -94,6 +95,14 @@ def process_web_list(li_node): li_html += f'
  • {li.get("text", "")}
  • ' return li_html + +def process_info_box(bx): + info = '' + for x in safe_dict(bx, 'components'): + info += f'
    {process_web_node(x)}
    ' + return info + + def process_web_node(node): ntype = node.get('type', '') if ntype == 'CROSSHEAD': @@ -121,8 +130,7 @@ def process_web_node(node): if node.get('fallback'): return process_web_node(node['fallback']) elif ntype == 'INFOBOX': - for x in safe_dict(node, 'components'): - return f'
    {process_web_node(x)}
    ' + return process_info_box(node) elif ntype == 'UNORDERED_LIST': if node.get('items'): return process_web_list(node) diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe index a22fe21969..6cbed35d6d 100644 --- a/recipes/economist_free.recipe +++ b/recipes/economist_free.recipe @@ -85,6 +85,7 @@ def load_article_from_json(raw, root): for node in data.get('text') or (): process_node(node, article) + def process_web_list(li_node): li_html = '' for li in li_node['items']: @@ -94,6 +95,14 @@ def process_web_list(li_node): li_html += f'
  • {li.get("text", "")}
  • ' return li_html + +def process_info_box(bx): + info = '' + for x in safe_dict(bx, 'components'): + info += f'
    {process_web_node(x)}
    ' + return info + + def process_web_node(node): ntype = node.get('type', '') if ntype == 'CROSSHEAD': @@ -121,8 +130,7 @@ def process_web_node(node): if node.get('fallback'): return process_web_node(node['fallback']) elif ntype == 'INFOBOX': - for x in safe_dict(node, 'components'): - return f'
    {process_web_node(x)}
    ' + return process_info_box(node) elif ntype == 'UNORDERED_LIST': if node.get('items'): return process_web_list(node) diff --git a/recipes/economist_search.recipe b/recipes/economist_search.recipe index 0a3725bfcc..fd27399cce 100644 --- a/recipes/economist_search.recipe +++ b/recipes/economist_search.recipe @@ -22,6 +22,12 @@ def process_list(li_node): return li_html +def process_info_box(bx): + info = '' + for x in safe_dict(bx, 'components'): + info += f'
    {process_web_node(x)}
    ' + return info + def process_node(node): ntype = node.get('type', '') if ntype == 'CROSSHEAD': @@ -49,8 +55,7 @@ def process_node(node): if node.get('fallback'): return process_node(node['fallback']) elif ntype == 'INFOBOX': - for x in safe_dict(node, 'components'): - return f'
    {process_node(x)}
    ' + return process_info_box(node) elif ntype == 'UNORDERED_LIST': if node.get('items'): return process_list(node) diff --git a/recipes/economist_world_ahead.recipe b/recipes/economist_world_ahead.recipe index 3c9c39fb6e..0adab83a48 100644 --- a/recipes/economist_world_ahead.recipe +++ b/recipes/economist_world_ahead.recipe @@ -22,6 +22,13 @@ def process_list(li_node): return li_html +def process_info_box(bx): + info = '' + for x in safe_dict(bx, 'components'): + info += f'
    {process_web_node(x)}
    ' + return info + + def process_node(node): ntype = node.get('type', '') if ntype == 'CROSSHEAD': @@ -49,8 +56,7 @@ def process_node(node): if node.get('fallback'): return process_node(node['fallback']) elif ntype == 'INFOBOX': - for x in safe_dict(node, 'components'): - return f'
    {process_node(x)}
    ' + return process_info_box(node) elif ntype == 'UNORDERED_LIST': if node.get('items'): return process_list(node) From 9bafbfa7c156d1beb9eb7270ab6941958cbe7747 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 16 Mar 2025 12:09:46 +0530 Subject: [PATCH 6/6] ... --- recipes/1843.recipe | 2 +- recipes/economist_search.recipe | 2 +- recipes/economist_world_ahead.recipe | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes/1843.recipe b/recipes/1843.recipe index 931f4da6d9..c698f2c67c 100644 --- a/recipes/1843.recipe +++ b/recipes/1843.recipe @@ -24,7 +24,7 @@ def process_list(li_node): def process_info_box(bx): info = '' for x in safe_dict(bx, 'components'): - info += f'
    {process_web_node(x)}
    ' + info += f'
    {process_node(x)}
    ' return info diff --git a/recipes/economist_search.recipe b/recipes/economist_search.recipe index fd27399cce..f5ea674d54 100644 --- a/recipes/economist_search.recipe +++ b/recipes/economist_search.recipe @@ -25,7 +25,7 @@ def process_list(li_node): def process_info_box(bx): info = '' for x in safe_dict(bx, 'components'): - info += f'
    {process_web_node(x)}
    ' + info += f'
    {process_node(x)}
    ' return info def process_node(node): diff --git a/recipes/economist_world_ahead.recipe b/recipes/economist_world_ahead.recipe index 0adab83a48..85ee8f50ce 100644 --- a/recipes/economist_world_ahead.recipe +++ b/recipes/economist_world_ahead.recipe @@ -25,7 +25,7 @@ def process_list(li_node): def process_info_box(bx): info = '' for x in safe_dict(bx, 'components'): - info += f'
    {process_web_node(x)}
    ' + info += f'
    {process_node(x)}
    ' return info