Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-07-08 10:44:09 -04:00 · 2025-03-16 16:37:45 +05:30 · 2025-03-16 16:37:45 +05:30 · d422c4b7f4
commit d422c4b7f4
parent 712dadf69b 9bafbfa7c1
8 changed files with 151 additions and 31 deletions
--- a/recipes/1843.recipe
+++ b/recipes/1843.recipe
@ -11,13 +11,30 @@ from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe


+def process_list(li_node):
+    li_html = ''
+    for li in li_node['items']:
+        if li.get('textHtml'):
+            li_html += f'<li>{li.get("textHtml")}</li>'
+        else:
+            li_html +=  f'<li>{li.get("text", "")}</li>'
+    return li_html
+
+
+def process_info_box(bx):
+    info = ''
+    for x in safe_dict(bx, 'components'):
+        info += f'<blockquote>{process_node(x)}</blockquote>'
+    return info
+
+
 def process_node(node):
    ntype = node.get('type', '')
    if ntype == 'CROSSHEAD':
        if node.get('textHtml'):
            return f'<h4>{node.get("textHtml")}</h4>'
        return f'<h4>{node.get("text", "")}</h4>'
-    elif ntype == 'PARAGRAPH':
+    elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
        if node.get('textHtml'):
            return f'<p>{node.get("textHtml")}</p>'
        return f'<p>{node.get("text", "")}</p>'
@ -34,9 +51,14 @@ def process_node(node):
        return f'<blockquote>{node.get("text", "")}</blockquote>'
    elif ntype == 'DIVIDER':
        return '<hr>'
+    elif ntype == 'INFOGRAPHIC':
+        if node.get('fallback'):
+            return process_node(node['fallback'])
    elif ntype == 'INFOBOX':
-        for x in safe_dict(node, 'components'):
-            return f'<blockquote>{process_node(x)}</blockquote>'
+        return process_info_box(node)
+    elif ntype == 'UNORDERED_LIST':
+        if node.get('items'):
+            return process_list(node)
    elif ntype:
        print('** ', ntype)
        return ''
@ -121,7 +143,7 @@ def process_url(url):
 class Econ1843(BasicNewsRecipe):

    title = 'Economist 1843'
-    language = 'en'
+    language = 'en_GB'
    encoding = 'utf-8'
    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'

--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@ -86,13 +86,30 @@ def load_article_from_json(raw, root):
        process_node(node, article)


+def process_web_list(li_node):
+    li_html = ''
+    for li in li_node['items']:
+        if li.get('textHtml'):
+            li_html += f'<li>{li.get("textHtml")}</li>'
+        else:
+            li_html +=  f'<li>{li.get("text", "")}</li>'
+    return li_html
+
+
+def process_info_box(bx):
+    info = ''
+    for x in safe_dict(bx, 'components'):
+        info += f'<blockquote>{process_web_node(x)}</blockquote>'
+    return info
+
+
 def process_web_node(node):
    ntype = node.get('type', '')
    if ntype == 'CROSSHEAD':
        if node.get('textHtml'):
            return f'<h4>{node.get("textHtml")}</h4>'
        return f'<h4>{node.get("text", "")}</h4>'
-    elif ntype == 'PARAGRAPH':
+    elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
        if node.get('textHtml'):
            return f'<p>{node.get("textHtml")}</p>'
        return f'<p>{node.get("text", "")}</p>'
@ -109,9 +126,14 @@ def process_web_node(node):
        return f'<blockquote>{node.get("text", "")}</blockquote>'
    elif ntype == 'DIVIDER':
        return '<hr>'
+    elif ntype == 'INFOGRAPHIC':
+        if node.get('fallback'):
+            return process_web_node(node['fallback'])
    elif ntype == 'INFOBOX':
-        for x in safe_dict(node, 'components'):
-            return f'<blockquote>{process_web_node(x)}</blockquote>'
+        return process_info_box(node)
+    elif ntype == 'UNORDERED_LIST':
+        if node.get('items'):
+            return process_web_list(node)
    elif ntype:
        print('** ', ntype)
        return ''
@ -120,7 +142,10 @@ def process_web_node(node):
 def load_article_from_web_json(raw):
    # open('/t/raw.json', 'w').write(raw)
    body = ''
-    data = json.loads(raw)['props']['pageProps']['content']
+    try:
+        data = json.loads(raw)['props']['pageProps']['cp2Content']
+    except Exception:
+        data = json.loads(raw)['props']['pageProps']['content']
    body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>'
    body += f'<h1>{data["headline"]}</h1>'
    if data.get('rubric') and data.get('rubric') is not None:
@ -182,7 +207,7 @@ def process_url(url):

 class Economist(BasicNewsRecipe):
    title = 'The Economist'
-    language = 'en'
+    language = 'en_GB'
    encoding = 'utf-8'
    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'

@ -274,7 +299,7 @@ class Economist(BasicNewsRecipe):

    def economist_test_article(self):
        return [('Articles', [{'title':'test',
-            'url':'https://www.economist.com/the-americas/2024/04/14/elon-musk-is-feuding-with-brazils-powerful-supreme-court'
+            'url':'https://www.economist.com/leaders/2025/03/13/americas-bullied-allies-need-to-toughen-up'
        }])]

    def economist_return_index(self, ans):
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@ -86,13 +86,30 @@ def load_article_from_json(raw, root):
        process_node(node, article)


+def process_web_list(li_node):
+    li_html = ''
+    for li in li_node['items']:
+        if li.get('textHtml'):
+            li_html += f'<li>{li.get("textHtml")}</li>'
+        else:
+            li_html +=  f'<li>{li.get("text", "")}</li>'
+    return li_html
+
+
+def process_info_box(bx):
+    info = ''
+    for x in safe_dict(bx, 'components'):
+        info += f'<blockquote>{process_web_node(x)}</blockquote>'
+    return info
+
+
 def process_web_node(node):
    ntype = node.get('type', '')
    if ntype == 'CROSSHEAD':
        if node.get('textHtml'):
            return f'<h4>{node.get("textHtml")}</h4>'
        return f'<h4>{node.get("text", "")}</h4>'
-    elif ntype == 'PARAGRAPH':
+    elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
        if node.get('textHtml'):
            return f'<p>{node.get("textHtml")}</p>'
        return f'<p>{node.get("text", "")}</p>'
@ -109,9 +126,14 @@ def process_web_node(node):
        return f'<blockquote>{node.get("text", "")}</blockquote>'
    elif ntype == 'DIVIDER':
        return '<hr>'
+    elif ntype == 'INFOGRAPHIC':
+        if node.get('fallback'):
+            return process_web_node(node['fallback'])
    elif ntype == 'INFOBOX':
-        for x in safe_dict(node, 'components'):
-            return f'<blockquote>{process_web_node(x)}</blockquote>'
+        return process_info_box(node)
+    elif ntype == 'UNORDERED_LIST':
+        if node.get('items'):
+            return process_web_list(node)
    elif ntype:
        print('** ', ntype)
        return ''
@ -120,7 +142,10 @@ def process_web_node(node):
 def load_article_from_web_json(raw):
    # open('/t/raw.json', 'w').write(raw)
    body = ''
-    data = json.loads(raw)['props']['pageProps']['content']
+    try:
+        data = json.loads(raw)['props']['pageProps']['cp2Content']
+    except Exception:
+        data = json.loads(raw)['props']['pageProps']['content']
    body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>'
    body += f'<h1>{data["headline"]}</h1>'
    if data.get('rubric') and data.get('rubric') is not None:
@ -182,7 +207,7 @@ def process_url(url):

 class Economist(BasicNewsRecipe):
    title = 'The Economist'
-    language = 'en'
+    language = 'en_GB'
    encoding = 'utf-8'
    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'

@ -274,7 +299,7 @@ class Economist(BasicNewsRecipe):

    def economist_test_article(self):
        return [('Articles', [{'title':'test',
-            'url':'https://www.economist.com/the-americas/2024/04/14/elon-musk-is-feuding-with-brazils-powerful-supreme-court'
+            'url':'https://www.economist.com/leaders/2025/03/13/americas-bullied-allies-need-to-toughen-up'
        }])]

    def economist_return_index(self, ans):
--- a/recipes/economist_news.recipe
+++ b/recipes/economist_news.recipe
@ -121,7 +121,7 @@ def process_url(url):

 class EconomistNews(BasicNewsRecipe):
    title = 'The Economist News'
-    language = 'en'
+    language = 'en_GB'
    encoding = 'utf-8'
    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'

--- a/recipes/economist_search.recipe
+++ b/recipes/economist_search.recipe
@ -12,13 +12,29 @@ from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe


+def process_list(li_node):
+    li_html = ''
+    for li in li_node['items']:
+        if li.get('textHtml'):
+            li_html += f'<li>{li.get("textHtml")}</li>'
+        else:
+            li_html +=  f'<li>{li.get("text", "")}</li>'
+    return li_html
+
+
+def process_info_box(bx):
+    info = ''
+    for x in safe_dict(bx, 'components'):
+        info += f'<blockquote>{process_node(x)}</blockquote>'
+    return info
+
 def process_node(node):
    ntype = node.get('type', '')
    if ntype == 'CROSSHEAD':
        if node.get('textHtml'):
            return f'<h4>{node.get("textHtml")}</h4>'
        return f'<h4>{node.get("text", "")}</h4>'
-    elif ntype == 'PARAGRAPH':
+    elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
        if node.get('textHtml'):
            return f'<p>{node.get("textHtml")}</p>'
        return f'<p>{node.get("text", "")}</p>'
@ -35,9 +51,14 @@ def process_node(node):
        return f'<blockquote>{node.get("text", "")}</blockquote>'
    elif ntype == 'DIVIDER':
        return '<hr>'
+    elif ntype == 'INFOGRAPHIC':
+        if node.get('fallback'):
+            return process_node(node['fallback'])
    elif ntype == 'INFOBOX':
-        for x in safe_dict(node, 'components'):
-            return f'<blockquote>{process_node(x)}</blockquote>'
+        return process_info_box(node)
+    elif ntype == 'UNORDERED_LIST':
+        if node.get('items'):
+            return process_list(node)
    elif ntype:
        print('** ', ntype)
        return ''
@ -57,7 +78,10 @@ class JSONHasNoContent(ValueError):
 def load_article_from_json(raw):
    # open('/t/raw.json', 'w').write(raw)
    body = ''
-    data = json.loads(raw)['props']['pageProps']['cp2Content']
+    try:
+        data = json.loads(raw)['props']['pageProps']['cp2Content']
+    except Exception:
+        data = json.loads(raw)['props']['pageProps']['content']
    body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>'
    body += f'<h1>{data["headline"]}</h1>'
    body += f'<div style="font-style: italic; color:#202020;">{data.get("rubric", "")}</div>'
@ -114,7 +138,7 @@ def process_url(url):

 class econ_search(BasicNewsRecipe):
    title = 'The Economist - Search'
-    language = 'en'
+    language = 'en_GB'
    encoding = 'utf-8'
    __author__ = 'unkn0wn'
    description = (
--- a/recipes/economist_world_ahead.recipe
+++ b/recipes/economist_world_ahead.recipe
@ -12,13 +12,30 @@ from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe


+def process_list(li_node):
+    li_html = ''
+    for li in li_node['items']:
+        if li.get('textHtml'):
+            li_html += f'<li>{li.get("textHtml")}</li>'
+        else:
+            li_html +=  f'<li>{li.get("text", "")}</li>'
+    return li_html
+
+
+def process_info_box(bx):
+    info = ''
+    for x in safe_dict(bx, 'components'):
+        info += f'<blockquote>{process_node(x)}</blockquote>'
+    return info
+
+
 def process_node(node):
    ntype = node.get('type', '')
    if ntype == 'CROSSHEAD':
        if node.get('textHtml'):
            return f'<h4>{node.get("textHtml")}</h4>'
        return f'<h4>{node.get("text", "")}</h4>'
-    elif ntype == 'PARAGRAPH':
+    elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
        if node.get('textHtml'):
            return f'<p>{node.get("textHtml")}</p>'
        return f'<p>{node.get("text", "")}</p>'
@ -35,9 +52,14 @@ def process_node(node):
        return f'<blockquote>{node.get("text", "")}</blockquote>'
    elif ntype == 'DIVIDER':
        return '<hr>'
+    elif ntype == 'INFOGRAPHIC':
+        if node.get('fallback'):
+            return process_node(node['fallback'])
    elif ntype == 'INFOBOX':
-        for x in safe_dict(node, 'components'):
-            return f'<blockquote>{process_node(x)}</blockquote>'
+        return process_info_box(node)
+    elif ntype == 'UNORDERED_LIST':
+        if node.get('items'):
+            return process_list(node)
    elif ntype:
        print('** ', ntype)
        return ''
@ -57,7 +79,10 @@ class JSONHasNoContent(ValueError):
 def load_article_from_json(raw):
    # open('/t/raw.json', 'w').write(raw)
    body = ''
-    data = json.loads(raw)['props']['pageProps']['cp2Content']
+    try:
+        data = json.loads(raw)['props']['pageProps']['cp2Content']
+    except Exception:
+        data = json.loads(raw)['props']['pageProps']['content']
    body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>'
    body += f'<h1>{data["headline"]}</h1>'
    body += f'<div style="font-style: italic; color:#202020;">{data.get("rubric", "")}</div>'
@ -118,7 +143,7 @@ def process_url(url):

 class EconomistWorld(BasicNewsRecipe):
    title = 'The Economist World Ahead'
-    language = 'en'
+    language = 'en_GB'
    encoding = 'utf-8'
    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'

--- a/recipes/hindufeeds.recipe
+++ b/recipes/hindufeeds.recipe
@ -21,7 +21,6 @@ class TheHindufeeds(BasicNewsRecipe):
        .author, .dateLine, .publish-time {font-size:small; font-weight:bold;}
        .subhead, .subhead_lead, .bold {font-weight:bold;}
        .update-publish-time, .publish-time-new {font-size:small; }
-        img {display:block; margin:0 auto;}
        .italic {font-style:italic; color:#202020;}
    '''

@ -55,7 +54,7 @@ class TheHindufeeds(BasicNewsRecipe):

    def preprocess_html(self, soup):
        for cap in soup.findAll('p', attrs={'class': 'caption'}):
-            cap.name = 'figcaption'
+            cap.name = 'div'
        for img in soup.findAll('img', attrs={'data-original': True}):
            if img['data-original'].endswith('1x1_spacer.png'):
                source = img.findPrevious('source', srcset=True)
@ -91,7 +90,7 @@ class TheHindufeeds(BasicNewsRecipe):
        ('Business', 'https://www.thehindu.com/business/feeder/default.rss'),
        ('World', 'https://www.thehindu.com/news/international/feeder/default.rss'),
        # ('Sport', 'https://www.thehindu.com/sport/feeder/default.rss'),
-        ('Entertainment', 'https://www.thehindu.com/entertainment/feeder/default.rss'),
+        # ('Entertainment', 'https://www.thehindu.com/entertainment/feeder/default.rss'),
        # ('Crossword', 'https://crossword.thehindu.com/?utm_source=thehindu&utm_medium=mainmenufeeder/default.rss'),
        ('Science', 'https://www.thehindu.com/sci-tech/science/feeder/default.rss'),
        ('Life and Style', 'https://www.thehindu.com/life-and-style/feeder/default.rss'),
--- a/recipes/spectator_magazine.recipe
+++ b/recipes/spectator_magazine.recipe
@ -13,7 +13,7 @@ class spectator(BasicNewsRecipe):
    title = 'Spectator Magazine'
    __author__ = 'unkn0wn'
    description = 'The Spectator was established in 1828, and is the best-written and most influential weekly in the English language.'
-    language = 'en'
+    language = 'en_GB'
    no_stylesheets = True
    remove_attributes = ['height', 'width', 'style']
    ignore_duplicate_articles = {'url'}