Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-07-09 03:04:10 -04:00 · 2024-03-10 19:12:57 +05:30 · 2024-03-10 19:12:57 +05:30 · 8cd60c1a96
commit 8cd60c1a96
parent f6e7d13bee 01b16cd9f8
5 changed files with 251 additions and 143 deletions
--- a/recipes/bloomberg-business-week.recipe
+++ b/recipes/bloomberg-business-week.recipe
@ -30,7 +30,7 @@ def get_contents(x):
        return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
    elif otype == 'media':
        if x['subType'] == 'photo':
-            return '<div><div class="img"><img src="{}"></div><div class="cap">{}<div>{}</div></div></div>'.format(
+            return '<div><div class="img"><img src="{}"></div><div class="cap">{} <span>{}</span></div></div>'.format(
                x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
        elif x['subType'] == 'chart':
            if x['data'] and x['data']['chart']:
@ -78,7 +78,7 @@ class Bloomberg(BasicNewsRecipe):
    extra_css = '''
        .auth {font-size:small; font-weight:bold;}
        .time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;}
-        .subhead {font-style:italic; color:#404040;}
+        .subhead, .cap span {font-style:italic; color:#404040;}
        em, .col {color:#202020;}
        .cat {font-size:small; color:gray;}
        .news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;}
--- a/recipes/bloomberg.recipe
+++ b/recipes/bloomberg.recipe
@ -31,7 +31,7 @@ def get_contents(x):
        return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
    elif otype == 'media':
        if x['subType'] == 'photo':
-            return '<div><div class="img"><img src="{}"></div><div class="cap">{}<div>{}</div></div></div>'.format(
+            return '<div><div class="img"><img src="{}"></div><div class="cap">{} <span>{}</span></div></div>'.format(
                x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
        elif x['subType'] == 'chart':
            if x['data'] and x['data']['chart']:
@ -77,7 +77,7 @@ class Bloomberg(BasicNewsRecipe):
    extra_css = '''
        .auth {font-size:small; font-weight:bold;}
        .time, .chart {font-size:small;}
-        .subhead {font-style:italic; color:#404040;}
+        .subhead, .cap span {font-style:italic; color:#404040;}
        em, .col {color:#202020;}
        .cat {font-size:small; color:gray;}
        .news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@ -19,10 +19,7 @@ def classes(classes):
 def extract_json(raw):
    s = raw.find("window['__natgeo__']")
    script = raw[s:raw.find('</script>', s)]
-    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
+    return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
    if 'article' in data:
        return data['article']
    return data['prismarticle']
 def parse_contributors(grp):
@ -35,63 +32,99 @@ def parse_contributors(grp):
 def parse_lead_image(media):
    if 'image' in media:
        yield '<p>'
        if 'dsc' in media['image']:
-            yield '<p><div><img src="{}" alt="{}"></div>'.format(
+            yield '<div><img src="{}" alt="{}"></div>'.format(
                escape(media['image']['src'], True), escape(media['image']['dsc'], True))
        else:
-            yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True))
+            yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
-        if 'caption' in media:
+        if 'caption' in media and 'credit' in media:
            yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
        elif 'caption' in media:
            yield '<div class="cap">' + media['caption'] + '</div>'
        if 'credit' in media:
            yield '<div class="cred">' + media['credit'] + '</div>'
        yield '</p>'
-def parse_body(item):
+def parse_inline(inl):
-    c = item['cntnt']
+    if inl.get('content', {}).get('name', '') == 'Image':
-    if item.get('type') == 'inline':
+        props = inl['content']['props']
-        if c.get('cmsType') == 'listicle':
+        yield '<p>'
-            if 'title' in c:
+        if 'image' in props:
-                yield '<h3>' + escape(c['title']) + '</h3>'
+            yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
-            yield c['text']
+        if 'caption' in props:
-        elif c.get('cmsType') == 'image':
+            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-            for line in parse_lead_image(c):
+                    props['caption']['text'], ' ' + props['caption']['credit']
-                yield line
+                )
-        elif c.get('cmsType') == 'imagegroup':
+        yield '</p>'
-            for imgs in c['images']:
+    if inl.get('content', {}).get('name', '') == 'ImageGroup':
-                for line in parse_lead_image(imgs):
+        if 'images' in inl['content']['props']:
-                    yield line
+            for imgs in inl['content']['props']['images']:
-        elif c.get('cmsType') == 'pullquote':
+                yield '<p>'
-            if 'quote' in c:
+                if 'src' in imgs:
-                yield '<blockquote>' + c['quote'] + '</blockquote>'
+                    yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
-        elif c.get('cmsType') == 'editorsNote':
+                if 'caption' in imgs:
-            if 'note' in c:
+                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                yield '<blockquote>' + c['note'] + '</blockquote>'
+                    imgs['caption']['text'], ' ' + imgs['caption']['credit']
-    else:
+                )
-        if c['mrkup'].strip().startswith('<'):
+                yield '</p>'
-            yield c['mrkup']
+
-        else:
+
-            yield '<{tag}>{markup}</{tag}>'.format(
+def parse_cont(content):
-                tag=item['type'], markup=c['mrkup'])
+    for cont in content.get('content', {}):
        if isinstance(cont, dict):
            yield from parse_body(cont)
        if isinstance(cont, str):
            yield cont
 def parse_body(x):
    if isinstance(x, dict):
        if 'type' in x:
            tag = x['type']
            if tag == 'inline':
                yield ''.join(parse_inline(x))
            elif 'attrs' in x and 'href' in x.get('attrs', {}):
                yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
                for yld in parse_cont(x):
                    yield yld
                yield '</' + tag + '>'
            else:
                yield '<' + tag + '>'
                for yld in parse_cont(x):
                    yield yld
                yield '</' + tag + '>'
    elif isinstance(x, list):
        for y in x:
            if isinstance(y, dict):
                yield from parse_body(y)
 def parse_article(edg):
    sc = edg['schma']
-    yield '<h3 class="sub">' + escape(edg['sctn']) + '</h3>'
+    yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
    yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div><br>'
+    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
    yield '<p>'
    for line in parse_contributors(edg['cntrbGrp']):
        yield line
    ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
    yield '<div class="time">Published: ' + escape(ts) + '</div>'
    if 'readTime' in edg:
-        yield '<div class="time">' + escape(edg['readTime']) + '</div><br>'
+        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
    yield '</p>'
    if edg.get('ldMda', {}).get('cmsType') == 'image':
        for line in parse_lead_image(edg['ldMda']):
            yield line
-    for item in edg['bdy']:
+    for main in edg['prismData']['mainComponents']:
-        for line in parse_body(item):
+        if main['name'] == 'Body':
-            yield line
+            for item in main['props']['body']:
                if isinstance(item, dict):
                    if item.get('type', '') == 'inline':
                        for inl in parse_inline(item):
                            yield inl
                elif isinstance(item, list):
                    for line in item:
                        yield ''.join(parse_body(line))
 def article_parse(data):
@ -131,11 +164,12 @@ class NatGeo(BasicNewsRecipe):
    ignore_duplicate_articles = {'url'}
    extra_css = '''
-        .sub, blockquote { color:#404040; }
+        blockquote { color:#404040; }
        .byline, i { font-style:italic; color:#202020; }
-        .cap {text-align:center; font-size:small; }
+        .cap { font-size:small; }
-        .cred {text-align:center; font-size:small; color:#404040; }
+        img {display:block; margin:0 auto;}
-        .auth, .time { font-size:small; color:#5c5c5c; }
+        .cred { font-style:italic; font-size:small; color:#404040; }
        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
    '''
    def get_cover_url(self):
@ -186,9 +220,11 @@ class NatGeo(BasicNewsRecipe):
        return '\n'.join(article_parse(data))
    def preprocess_html(self, soup):
        for h2 in soup.findAll('h2'):
            h2.name = 'h4'
        for img in soup.findAll('img', src=True):
            # for high res images use '?w=2000&h=2000'
-            img['src'] = img['src'] + '?w=1000&h=1000'
+            img['src'] = img['src'] + '?w=600&h=600'
        return soup
    def populate_article_metadata(self, article, soup, first):
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@ -18,10 +18,7 @@ def classes(classes):
 def extract_json(raw):
    s = raw.find("window['__natgeo__']")
    script = raw[s:raw.find('</script>', s)]
-    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
+    return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
    if 'article' in data:
        return data['article']
    return data['prismarticle']
 def parse_contributors(grp):
@ -34,63 +31,99 @@ def parse_contributors(grp):
 def parse_lead_image(media):
    if 'image' in media:
        yield '<p>'
        if 'dsc' in media['image']:
-            yield '<p><div><img src="{}" alt="{}"></div>'.format(
+            yield '<div><img src="{}" alt="{}"></div>'.format(
                escape(media['image']['src'], True), escape(media['image']['dsc'], True))
        else:
-            yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True))
+            yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
-        if 'caption' in media:
+        if 'caption' in media and 'credit' in media:
            yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
        elif 'caption' in media:
            yield '<div class="cap">' + media['caption'] + '</div>'
        if 'credit' in media:
            yield '<div class="cred">' + media['credit'] + '</div>'
        yield '</p>'
-def parse_body(item):
+def parse_inline(inl):
-    c = item['cntnt']
+    if inl.get('content', {}).get('name', '') == 'Image':
-    if item.get('type') == 'inline':
+        props = inl['content']['props']
-        if c.get('cmsType') == 'listicle':
+        yield '<p>'
-            if 'title' in c:
+        if 'image' in props:
-                yield '<h3>' + escape(c['title']) + '</h3>'
+            yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
-            yield c['text']
+        if 'caption' in props:
-        elif c.get('cmsType') == 'image':
+            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-            for line in parse_lead_image(c):
+                    props['caption']['text'], ' ' + props['caption']['credit']
-                yield line
+                )
-        elif c.get('cmsType') == 'imagegroup':
+        yield '</p>'
-            for imgs in c['images']:
+    if inl.get('content', {}).get('name', '') == 'ImageGroup':
-                for line in parse_lead_image(imgs): 
+        if 'images' in inl['content']['props']:
-                    yield line
+            for imgs in inl['content']['props']['images']:
-        elif c.get('cmsType') == 'pullquote':
+                yield '<p>'
-            if 'quote' in c:
+                if 'src' in imgs:
-                yield '<blockquote>' + c['quote'] + '</blockquote>'
+                    yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
-        elif c.get('cmsType') == 'editorsNote':
+                if 'caption' in imgs:
-            if 'note' in c:
+                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                yield '<blockquote>' + c['note'] + '</blockquote>'
+                    imgs['caption']['text'], ' ' + imgs['caption']['credit']
-    else:
+                )
-        if c['mrkup'].strip().startswith('<'):
+                yield '</p>'
-            yield c['mrkup']
+
-        else:
+
-            yield '<{tag}>{markup}</{tag}>'.format(
+def parse_cont(content):
-                tag=item['type'], markup=c['mrkup'])
+    for cont in content.get('content', {}):
        if isinstance(cont, dict):
            yield from parse_body(cont)
        if isinstance(cont, str):
            yield cont
 def parse_body(x):
    if isinstance(x, dict):
        if 'type' in x:
            tag = x['type']
            if tag == 'inline':
                yield ''.join(parse_inline(x))
            elif 'attrs' in x and 'href' in x.get('attrs', {}):
                yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
                for yld in parse_cont(x):
                    yield yld
                yield '</' + tag + '>'
            else:
                yield '<' + tag + '>'
                for yld in parse_cont(x):
                    yield yld
                yield '</' + tag + '>'
    elif isinstance(x, list):
        for y in x:
            if isinstance(y, dict):
                yield from parse_body(y)
 def parse_article(edg):
    sc = edg['schma']
-    yield '<h3 class="sub">' + escape(edg['sctn']) + '</h3>'
+    yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
    yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div><br>'
+    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
    yield '<p>'
    for line in parse_contributors(edg['cntrbGrp']):
        yield line
    ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
    yield '<div class="time">Published: ' + escape(ts) + '</div>'
    if 'readTime' in edg:
-        yield '<div class="time">' + escape(edg['readTime']) + '</div><br>'
+        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
    yield '</p>'
    if edg.get('ldMda', {}).get('cmsType') == 'image':
        for line in parse_lead_image(edg['ldMda']):
            yield line
-    for item in edg['bdy']:
+    for main in edg['prismData']['mainComponents']:
-        for line in parse_body(item):
+        if main['name'] == 'Body':
-            yield line
+            for item in main['props']['body']:
                if isinstance(item, dict):
                    if item.get('type', '') == 'inline':
                        for inl in parse_inline(item):
                            yield inl
                elif isinstance(item, list):
                    for line in item:
                        yield ''.join(parse_body(line))
 def article_parse(data):
@ -120,7 +153,7 @@ class NatGeo(BasicNewsRecipe):
    encoding = 'utf8'
    publisher = 'nationalgeographic.com'
    category = 'science, nat geo'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal, unkn0wn'
    description = 'Inspiring people to care about the planet since 1888'
    timefmt = ' [%a, %d %b, %Y]'
    no_stylesheets = True
@ -131,11 +164,12 @@ class NatGeo(BasicNewsRecipe):
    resolve_internal_links = True
    extra_css = '''
-        .sub, blockquote { color:#404040; }
+        blockquote { color:#404040; }
        .byline, i { font-style:italic; color:#202020; }
-        .cap {text-align:center; font-size:small; }
+        .cap { font-size:small; }
-        .cred {text-align:center; font-size:small; color:#404040; }
+        img {display:block; margin:0 auto;}
-        .auth, .time { font-size:small; color:#5c5c5c; }
+        .cred { font-style:italic; font-size:small; color:#404040; }
        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
    '''
    def get_cover_url(self):
@ -161,9 +195,11 @@ class NatGeo(BasicNewsRecipe):
        return '\n'.join(article_parse(data))
    def preprocess_html(self, soup):
        for h2 in soup.findAll('h2'):
            h2.name = 'h4'
        for img in soup.findAll('img', src=True):
            # for high res images use '?w=2000&h=2000'
-            img['src'] = img['src'] + '?w=1000&h=1000'
+            img['src'] = img['src'] + '?w=600&h=600'
        return soup
    def populate_article_metadata(self, article, soup, first):
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@ -23,10 +23,7 @@ def classes(classes):
 def extract_json(raw):
    s = raw.find("window['__natgeo__']")
    script = raw[s:raw.find('</script>', s)]
-    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
+    return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
    if 'article' in data:
        return data['article']
    return data['prismarticle']
 def parse_contributors(grp):
@ -39,63 +36,99 @@ def parse_contributors(grp):
 def parse_lead_image(media):
    if 'image' in media:
        yield '<p>'
        if 'dsc' in media['image']:
-            yield '<p><div><img src="{}" alt="{}"></div>'.format(
+            yield '<div><img src="{}" alt="{}"></div>'.format(
                escape(media['image']['src'], True), escape(media['image']['dsc'], True))
        else:
-            yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True))
+            yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
-        if 'caption' in media:
+        if 'caption' in media and 'credit' in media:
            yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
        elif 'caption' in media:
            yield '<div class="cap">' + media['caption'] + '</div>'
        if 'credit' in media:
            yield '<div class="cred">' + media['credit'] + '</div>'
        yield '</p>'
-def parse_body(item):
+def parse_inline(inl):
-    c = item['cntnt']
+    if inl.get('content', {}).get('name', '') == 'Image':
-    if item.get('type') == 'inline':
+        props = inl['content']['props']
-        if c.get('cmsType') == 'listicle':
+        yield '<p>'
-            if 'title' in c:
+        if 'image' in props:
-                yield '<h3>' + escape(c['title']) + '</h3>'
+            yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
-            yield c['text']
+        if 'caption' in props:
-        elif c.get('cmsType') == 'image':
+            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-            for line in parse_lead_image(c):
+                    props['caption']['text'], ' ' + props['caption']['credit']
-                yield line
+                )
-        elif c.get('cmsType') == 'imagegroup':
+        yield '</p>'
-            for imgs in c['images']:
+    if inl.get('content', {}).get('name', '') == 'ImageGroup':
-                for line in parse_lead_image(imgs):
+        if 'images' in inl['content']['props']:
-                    yield line
+            for imgs in inl['content']['props']['images']:
-        elif c.get('cmsType') == 'pullquote':
+                yield '<p>'
-            if 'quote' in c:
+                if 'src' in imgs:
-                yield '<blockquote>' + c['quote'] + '</blockquote>'
+                    yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
-        elif c.get('cmsType') == 'editorsNote':
+                if 'caption' in imgs:
-            if 'note' in c:
+                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                yield '<blockquote>' + c['note'] + '</blockquote>'
+                    imgs['caption']['text'], ' ' + imgs['caption']['credit']
-    else:
+                )
-        if c['mrkup'].strip().startswith('<'):
+                yield '</p>'
-            yield c['mrkup']
+
-        else:
+
-            yield '<{tag}>{markup}</{tag}>'.format(
+def parse_cont(content):
-                tag=item['type'], markup=c['mrkup'])
+    for cont in content.get('content', {}):
        if isinstance(cont, dict):
            yield from parse_body(cont)
        if isinstance(cont, str):
            yield cont
 def parse_body(x):
    if isinstance(x, dict):
        if 'type' in x:
            tag = x['type']
            if tag == 'inline':
                yield ''.join(parse_inline(x))
            elif 'attrs' in x and 'href' in x.get('attrs', {}):
                yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
                for yld in parse_cont(x):
                    yield yld
                yield '</' + tag + '>'
            else:
                yield '<' + tag + '>'
                for yld in parse_cont(x):
                    yield yld
                yield '</' + tag + '>'
    elif isinstance(x, list):
        for y in x:
            if isinstance(y, dict):
                yield from parse_body(y)
 def parse_article(edg):
    sc = edg['schma']
-    yield '<h3 class="sub">' + escape(edg['sctn']) + '</h3>'
+    yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
    yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div><br>'
+    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
    yield '<p>'
    for line in parse_contributors(edg['cntrbGrp']):
        yield line
    ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
    yield '<div class="time">Published: ' + escape(ts) + '</div>'
    if 'readTime' in edg:
-        yield '<div class="time">' + escape(edg['readTime']) + '</div><br>'
+        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
    yield '</p>'
    if edg.get('ldMda', {}).get('cmsType') == 'image':
        for line in parse_lead_image(edg['ldMda']):
            yield line
-    for item in edg['bdy']:
+    for main in edg['prismData']['mainComponents']:
-        for line in parse_body(item):
+        if main['name'] == 'Body':
-            yield line
+            for item in main['props']['body']:
                if isinstance(item, dict):
                    if item.get('type', '') == 'inline':
                        for inl in parse_inline(item):
                            yield inl
                elif isinstance(item, list):
                    for line in item:
                        yield ''.join(parse_body(line))
 def article_parse(data):
@ -134,11 +167,12 @@ class NatGeo(BasicNewsRecipe):
    resolve_internal_links = True
    extra_css = '''
-        .sub, blockquote { color:#404040; }
+        blockquote { color:#404040; }
        .byline, i { font-style:italic; color:#202020; }
-        .cap {text-align:center; font-size:small; }
+        .cap { font-size:small; }
-        .cred {text-align:center; font-size:small; color:#404040; }
+        img {display:block; margin:0 auto;}
-        .auth, .time { font-size:small; color:#5c5c5c; }
+        .cred { font-style:italic; font-size:small; color:#404040; }
        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
    '''
    def parse_index(self):
@ -183,9 +217,11 @@ class NatGeo(BasicNewsRecipe):
        return '\n'.join(article_parse(data))
    def preprocess_html(self, soup):
        for h2 in soup.findAll('h2'):
            h2.name = 'h4'
        for img in soup.findAll('img', src=True):
            # for high res images use '?w=2000&h=2000'
-            img['src'] = img['src'] + '?w=1200&h=1200'
+            img['src'] = img['src'] + '?w=600&h=600'
        return soup
    def populate_article_metadata(self, article, soup, first):