Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-08 02:34:06 -04:00)

Commit cfba2cbdd5: Merge branch 'master' of https://github.com/unkn0w7n/calibre

recipes/economist_search.recipe (new file, 291 lines)
@@ -0,0 +1,291 @@
#!/usr/bin/env python
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>

import json

from html5_parser import parse
from lxml import etree

from calibre.ebooks.BeautifulSoup import NavigableString, Tag, BeautifulSoup
from calibre.scraper.simple import read_url
from calibre.web.feeds.news import BasicNewsRecipe

def E(parent, name, text='', **attrs):
    ans = parent.makeelement(name, **attrs)
    ans.text = text
    parent.append(ans)
    return ans


def safe_dict(data, *names):
    ans = data
    for x in names:
        ans = ans.get(x) or {}
    return ans

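An aside on the safe_dict helper above: it walks nested dictionaries and collapses any missing level to an empty dict, so a trailing .get() on the result never raises. A minimal sketch with a made-up payload (not the real Economist JSON):

payload = {'image': {'main': {'url': {'canonical': 'https://example.com/lead.jpg'}}}}  # hypothetical
safe_dict(payload, 'image', 'main', 'url')                      # {'canonical': 'https://example.com/lead.jpg'}
safe_dict(payload, 'image', 'missing', 'url')                   # {} -- no KeyError at any level
safe_dict(payload, 'image', 'missing', 'url').get('canonical')  # None, which callers test for truthiness
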
class JSONHasNoContent(ValueError):
    pass


def load_article_from_json(raw, root):
    # open('/t/raw.json', 'w').write(raw)
    try:
        data = json.loads(raw)['props']['pageProps']['content']
    except KeyError as e:
        raise JSONHasNoContent(e)
    if isinstance(data, list):
        data = data[0]
    body = root.xpath('//body')[0]
    for child in tuple(body):
        body.remove(child)
    article = E(body, 'article')
    E(article, 'div', data['subheadline'], id='subhead')
    E(article, 'h1', data['headline'])
    E(article, 'p', data['description'], id='desc')
    if data['dateline'] is None:
        E(article, 'p', (data['datePublishedString'] or ''), id='date')
    else:
        E(article, 'p', (data['datePublishedString'] or '') + ' | ' + (data['dateline']), id='date')
    main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
    if main_image_url:
        div = E(article, 'div')
        try:
            E(div, 'img', src=main_image_url)
        except Exception:
            pass
    E(article, 'section', id='body')

def cleanup_html_article(root):
    main = root.xpath('//main')[0]
    body = root.xpath('//body')[0]
    for child in tuple(body):
        body.remove(child)
    body.append(main)
    main.set('id', '')
    main.tag = 'article'
    for x in root.xpath('//*[@style]'):
        x.set('style', '')
    for x in root.xpath('//button'):
        x.getparent().remove(x)


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


def process_url(url):
    if url.startswith('/'):
        url = 'https://www.economist.com' + url
    return url

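An aside on the classes() helper above: it just builds the attrs-matcher dict that BasicNewsRecipe's keep_only_tags/remove_tags machinery accepts, matching any tag whose class attribute shares at least one name with the query set. A quick check, using arbitrary example class names:

matcher = classes('teaser related-items')
pred = matcher['attrs']['class']
pred('teaser box')   # truthy: frozenset({'teaser', 'box'}) intersects the query set
pred('story')        # falsy: empty intersection
pred(None)           # falsy: tags without a class attribute are skipped
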
class econ_search(BasicNewsRecipe):

    title = 'The Economist - Search'
    language = 'en'
    encoding = 'utf-8'
    __author__ = "Kovid Goyal"
    description = (
        'Use the Advanced section of the recipe to search.'
    )

    remove_attributes = ['style', 'height', 'width']
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    extra_css = '''
        em, blockquote { color:#202020; }
        img {display:block; margin:0 auto;}
        .sub { font-size:small; }
        #subhead { color: #404040; font-size:small; font-weight:bold; }
        #desc { font-style: italic; color:#202020; }
        #date { color: gray; font-size:small; }
    '''

    resolve_internal_links = True
    remove_tags = [
        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer', 'svg']),
        dict(attrs={'aria-label': "Article Teaser"}),
        dict(attrs={'id':'player'}),
        dict(attrs={
                'class': [
                    'dblClkTrk', 'ec-article-info', 'share_inline_header',
                    'related-items', 'main-content-container', 'ec-topic-widget',
                    'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label',
                    'blog-post__foot-note', 'blog-post__sharebar', 'blog-post__bottom-panel',
                    'newsletter-form','share-links-header','teaser--wrapped', 'latest-updates-panel__container',
                    'latest-updates-panel__article-link','blog-post__section'
                ]
            }
        ),
        dict(attrs={
            'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
        dict(attrs={'id':lambda x: x and 'gpt-ad-slot' in x}),
        classes(
            'share-links-header teaser--wrapped latest-updates-panel__container'
            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
        )
    ]
    keep_only_tags = [dict(name='article', id=lambda x: not x)]
    no_stylesheets = True
    remove_attributes = ['data-reactid', 'width', 'height']
    # economist.com has started throttling after about 60% of the total has
    # downloaded with connection reset by peer (104) errors.
    delay = 1

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        if self.output_profile.short_name.startswith('kindle'):
            # Reduce image sizes to get file size below amazon's email
            # sending threshold
            self.web2disk_options.compress_news_images = True
            self.web2disk_options.compress_news_images_auto_size = 5
            self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold')

    def get_browser(self, *args, **kwargs):
        # Needed to bypass cloudflare
        kwargs['user_agent'] = 'common_words/based'
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8')]
        return br

    def preprocess_raw_html(self, raw, url):
        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
        root = parse(raw)
        if '/interactive/' in url:
            return '<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>' \
                + 'This article is supposed to be read in a browser' \
                + '</em></article></body></html>'
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        if script:
            try:
                load_article_from_json(script[0].text, root)
            except JSONHasNoContent:
                cleanup_html_article(root)
        for div in root.xpath('//div[@class="lazy-image"]'):
            noscript = list(div.iter('noscript'))
            if noscript and noscript[0].text:
                img = list(parse(noscript[0].text).iter('img'))
                if img:
                    p = noscript[0].getparent()
                    idx = p.index(noscript[0])
                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
                    p.remove(noscript[0])
        for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
            x.getparent().remove(x)
        raw = etree.tostring(root, encoding='unicode')

        raw_ar = read_url([], 'https://archive.is/latest/' + url)
        archive = BeautifulSoup(str(raw_ar))
        art = archive.find('article')
        if art:
            bdy = art.findAll('section')
            if len(bdy) != 0:
                content = bdy[-1]
            else:
                content = archive.find('div', attrs={'itemprop':'text'})
            soup = BeautifulSoup(raw)
            article = soup.find('section', attrs={'id':'body'})
            if not article:
                article = soup.find('div', attrs={'itemprop':'text'})
                if not article:
                    article = soup.find(attrs={'itemprop':'blogPost'})
            if article and content:
                self.log('**fetching archive content')
                article.append(content)

                div = soup.findAll(attrs={'style': lambda x: x and x.startswith(
                    ('color:rgb(13, 13, 13);', 'color: rgb(18, 18, 18);')
                )})
                for p in div:
                    p.name = 'p'
                return str(soup)
            return raw
        return raw

    def preprocess_html(self, soup):
        for img in soup.findAll('img', attrs={'old-src':True}):
            img['src'] = img['old-src']
        for a in soup.findAll('a', href=True):
            a['href'] = 'http' + a['href'].split('http')[-1]
        for fig in soup.findAll('figure'):
            fig['class'] = 'sub'
        for sty in soup.findAll(attrs={'style':True}):
            del sty['style']
        return soup

    recipe_specific_options = {
        'q': {
            'short': 'Text Search',
            'default': 'schools brief'
        },
        's': {
            'short': 'Sort by (date/relevance)',
            'long': 'you can sort by date or relevance',
            'default': 'relevance'
        },
        'p': {
            'short': 'number of pages',
            'long': 'number of pages of search results you want',
            'default': '2'
        }
    }

    def parse_index(self):
        url = 'https://www.economist.com/search?q={query}&sort={sort}&page={page}'
        search = self.recipe_specific_options.get('q')
        sort_type = self.recipe_specific_options.get('s')
        page = self.recipe_specific_options.get('p')
        self.title = 'The Economist - ' + search
        ans = []
        for num in range(1, int(page) + 1):
            ans.extend(self.economist_parse_index(url.format(query=search.replace(' ', '+'), sort=sort_type, page=num)))
        return [('Articles', ans)]

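As a worked example of what parse_index() fetches (assuming the option values arrive as the plain default strings shown in recipe_specific_options above):

url = 'https://www.economist.com/search?q={query}&sort={sort}&page={page}'
for num in range(1, 3):  # p = '2' pages
    print(url.format(query='schools brief'.replace(' ', '+'), sort='relevance', page=num))
# https://www.economist.com/search?q=schools+brief&sort=relevance&page=1
# https://www.economist.com/search?q=schools+brief&sort=relevance&page=2

economist_parse_index() below then scrapes each results page for a title, url and description per hit.
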
    def economist_parse_index(self, url):
        self.log('Page ', url.rsplit('=', 1)[-1])
        soup = self.index_to_soup(url)
        results = soup.find('ol', id='search-results')
        if not results:
            self.log('\tPage ', url.rsplit('=', 1)[-1], ' not found')
            return
        for a in results.findAll('a', attrs={'class':'_search-result'}):
            url = a['href']
            title = self.tag_to_string(a.find(attrs={'class':'_headline'}))
            desc = self.tag_to_string(a.find(attrs={'class':'_snippet'}))
            self.log('\t', title, '\n\t', desc, '\n\t\t', url)
            yield {'title': title, 'url': url, 'description': desc}

    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        for img in soup.findAll('img', srcset=True):
            del img['srcset']
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = new_tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, new_tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup

    def canonicalize_internal_url(self, url, is_link=True):
        if url.endswith('/print'):
            url = url.rpartition('/')[0]
        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)

@@ -26,7 +26,7 @@ class scienceadv(BasicNewsRecipe):
     browser_type = 'qt'
 
     extra_css = '''
-        .news-article__figure__caption {font-size:small; text-align:center;}
+        .news-article__figure__caption, .figc {font-size:small;}
         .core-self-citation, .meta-panel__left-content, .news-article__hero__top-meta {font-size:small;}
         .contributors, .news-article__hero__bottom-meta, #bibliography, #elettersSection {font-size:small;}
         img {display:block; margin:0 auto;}
@@ -51,6 +51,11 @@ class scienceadv(BasicNewsRecipe):
             'short': 'Enter the Issue Number you want to download\n(Vol/Issue format)',
             'long': 'For example, 385/6710',
             'default': 'current'
         },
+        'res': {
+            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
+            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
+            'default': '600'
+        }
     }
 
@@ -58,6 +63,15 @@ class scienceadv(BasicNewsRecipe):
         for p in soup.findAll(attrs={'role':'paragraph'}):
             p.name = 'p'
             p.attrs = {}
+        for img in soup.findAll('img', attrs={'src':True}):
+            if img['src'].endswith('.jpg'):
+                res = '/cdn-cgi/image/width=600'
+                w = self.recipe_specific_options.get('res')
+                if w and isinstance(w, str):
+                    res = '/cdn-cgi/image/width=' + w
+                img['src'] = absurl(res + img['src'])
+        for figc in soup.findAll('figcaption'):
+            figc['class'] = 'figc'
         return soup
 
     def parse_index(self):
@@ -75,7 +89,7 @@ class scienceadv(BasicNewsRecipe):
         self.description = self.tag_to_string(det).strip()
         cov = soup.find(**classes('cover-image__image'))
         if cov:
-            self.cover_url = absurl(cov.img['src'])
+            self.cover_url = absurl('/cdn-cgi/image/width=800' + cov.img['src'])
 
         feeds = []

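A note on the preprocess_html hunk above: the added loop routes every .jpg through the site's /cdn-cgi/image resizing path at the width chosen via the 'res' option. A rough sketch of the rewrite, assuming absurl() (defined elsewhere in these recipes) simply prefixes the site origin, and using a made-up image path:

def absurl(path):                              # stand-in for the recipes' helper, assumed behaviour
    return 'https://www.science.org' + path

src = '/do/10.1126/science.example/card.jpg'   # hypothetical relative image path
res = '/cdn-cgi/image/width=' + '800'          # user-chosen 'res'; the hunk defaults to 600
absurl(res + src)
# 'https://www.science.org/cdn-cgi/image/width=800/do/10.1126/science.example/card.jpg'
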
@@ -24,7 +24,7 @@ class science(BasicNewsRecipe):
     browser_type = 'qt'
 
     extra_css = '''
-        .news-article__figure__caption {font-size:small; text-align:center;}
+        .news-article__figure__caption, .figc {font-size:small;}
        .core-self-citation, .meta-panel__left-content, .news-article__hero__top-meta {font-size:small;}
         .contributors, .news-article__hero__bottom-meta, #bibliography, #elettersSection {font-size:small;}
         img {display:block; margin:0 auto;}
@@ -49,6 +49,11 @@ class science(BasicNewsRecipe):
             'short': 'Enter the Issue Number you want to download\n(Vol/Issue format)',
             'long': 'For example, 385/6710',
             'default': 'current'
         },
+        'res': {
+            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
+            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
+            'default': '600'
+        }
     }
 
@@ -56,6 +61,15 @@ class science(BasicNewsRecipe):
         for p in soup.findAll(attrs={'role':'paragraph'}):
             p.name = 'p'
             p.attrs = {}
+        for img in soup.findAll('img', attrs={'src':True}):
+            if img['src'].endswith('.jpg'):
+                res = '/cdn-cgi/image/width=600'
+                w = self.recipe_specific_options.get('res')
+                if w and isinstance(w, str):
+                    res = '/cdn-cgi/image/width=' + w
+                img['src'] = absurl(res + img['src'])
+        for figc in soup.findAll('figcaption'):
+            figc['class'] = 'figc'
         return soup
 
     def parse_index(self):
@@ -73,7 +87,7 @@ class science(BasicNewsRecipe):
         self.description = self.tag_to_string(det).strip()
         cov = soup.find(**classes('cover-image__image'))
         if cov:
-            self.cover_url = absurl(cov.img['src'])
+            self.cover_url = absurl('/cdn-cgi/image/width=800' + cov.img['src'])
 
         feeds = []

@@ -25,7 +25,7 @@ class scienceadv(BasicNewsRecipe):
     browser_type = 'qt'
 
     extra_css = '''
-        .news-article__figure__caption {font-size:small; text-align:center;}
+        .news-article__figure__caption, .figc {font-size:small;}
         .core-self-citation, .meta-panel__left-content, .news-article__hero__top-meta {font-size:small;}
         .contributors, .news-article__hero__bottom-meta, #bibliography, #elettersSection {font-size:small;}
         img {display:block; margin:0 auto;}
@@ -50,6 +50,11 @@ class scienceadv(BasicNewsRecipe):
             'short': 'Enter the Issue Number you want to download\n(Vol/Issue format)',
             'long': 'For example, 385/6710',
             'default': 'current'
         },
+        'res': {
+            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
+            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
+            'default': '600'
+        }
     }
 
@@ -57,6 +62,15 @@ class scienceadv(BasicNewsRecipe):
         for p in soup.findAll(attrs={'role':'paragraph'}):
             p.name = 'p'
             p.attrs = {}
+        for img in soup.findAll('img', attrs={'src':True}):
+            if img['src'].endswith('.jpg'):
+                res = '/cdn-cgi/image/width=600'
+                w = self.recipe_specific_options.get('res')
+                if w and isinstance(w, str):
+                    res = '/cdn-cgi/image/width=' + w
+                img['src'] = absurl(res + img['src'])
+        for figc in soup.findAll('figcaption'):
+            figc['class'] = 'figc'
         return soup
 
     def parse_index(self):
@@ -74,7 +88,7 @@ class scienceadv(BasicNewsRecipe):
         self.description = self.tag_to_string(det).strip()
         cov = soup.find(**classes('cover-image__image'))
         if cov:
-            self.cover_url = absurl(cov.img['src'])
+            self.cover_url = absurl('/cdn-cgi/image/width=800' + cov.img['src'])
 
         feeds = []
