commit 838a056ad2
Author: Kovid Goyal
Date:   2024-05-17 12:06:44 +05:30
GPG key ID: 06BC317B515ACE7C (no known key found for this signature in the database)
2 changed files with 42 additions and 31 deletions

File 1 of 2: London Review of Books recipe (class LondonReviewOfBooksPayed)

@@ -26,18 +26,24 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
     no_stylesheets = True
     delay = 1
     encoding = 'utf-8'
-    INDEX = 'https://www.lrb.co.uk'
+    INDEX = 'https://www.lrb.co.uk/the-paper/'
     publication_type = 'magazine'
-    needs_subscription = True
+    needs_subscription = 'optional'
     requires_version = (3, 0, 0)
+    masthead_url = 'https://www.mylrb.co.uk/out/lrb-2014/img/logo-2x.png'
+    extra_css = '''
+        .article-word-count, #article-tag-holder { font-size:small; color:#202020; }
+        .embedded-image-caption { font-size:small; text-align:center; }
+        blockquote, em { color:#202020; }
+    '''
+    resolve_internal_links = True
     keep_only_tags = [
-        classes('article-header--title paperArticle-reviewsHeader article-content letters-content'),
+        dict(attrs={'id':['article-tag-holder', 'article-heading-holder']}),
+        classes('article-copy article-word-count'),
     ]
-    remove_tags = [
-        classes('social-button article-mask lrb-readmorelink article-send-letter article-share'),
-    ]
-    remove_attributes = ['width', 'height']
+    remove_attributes = ['style', 'width', 'height']

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
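
Note: the keep_only_tags change swaps calibre's classes() helper for a plain BeautifulSoup attribute filter. A list value for an attribute matches an element whose id equals any entry, so one dict keeps both holder divs. A standalone sketch (not part of the recipe):

    from bs4 import BeautifulSoup

    html = ('<div id="article-tag-holder">tags</div>'
            '<div id="article-heading-holder">heading</div>'
            '<div id="sidebar">dropped</div>')
    soup = BeautifulSoup(html, 'html.parser')
    # a list-valued attribute filter matches any of the listed ids
    kept = soup.find_all(attrs={'id': ['article-tag-holder', 'article-heading-holder']})
    print([d['id'] for d in kept])  # ['article-tag-holder', 'article-heading-holder']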
@@ -52,22 +58,23 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
         return br

     def preprocess_html(self, soup):
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
+        for cap in soup.findAll(**classes('embedded-image-caption')):
+            for p in cap.findAll('p'):
+                p.name = 'div'
         for img in soup.findAll('img', attrs={'data-srcset': True}):
-            for x in img['data-srcset'].split():
-                if '/' in x:
-                    img['src'] = x
+            img['src'] = 'https://www.lrb.co.uk/storage/400_filter/images/' + img['data-appsrc'].split('/images/')[-1]
         return soup

     def parse_index(self):
+        articles = []
         soup = self.index_to_soup(self.INDEX)
-        container = soup.find(attrs={'class': 'issue-grid'})
-        img = container.find('img')
-        self.cover_url = img['data-srcset'].split()[-2]
-        h3 = container.find('h3')
-        self.timefmt = ' [{}]'.format(self.tag_to_string(h3))
-        a = img.findParent('a')
-        soup = self.index_to_soup(absolutize(a['href']))
+        container = soup.find('div', attrs={'class': 'article-issue-cover-image'})
+        if container:
+            self.cover_url = 'https://www.lrb.co.uk/storage/800_filter/images/' + container.img['data-appsrc'].split('/images/')[-1]
+        edition = self.tag_to_string(soup.find('h1', attrs={'class': 'toc-title'}))
+        self.timefmt = ' [{}]'.format(edition)
+        self.log('Downloading: ', edition)
         grid = soup.find(attrs={'class': 'toc-grid-items'})
         articles = []
         for a in grid.findAll(**classes('toc-item')):
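
Note: preprocess_html no longer scans data-srcset for a usable URL; it rebuilds the image src from data-appsrc, and parse_index now does the same for the cover. A sketch with a hypothetical data-appsrc value (the 400_filter/800_filter path segments appear to select fixed-width renditions on LRB's image server; that is an inference from the recipe, not a documented API):

    data_appsrc = '/storage/images/2024/may/cover-photo.jpg'  # hypothetical value
    article_src = 'https://www.lrb.co.uk/storage/400_filter/images/' + data_appsrc.split('/images/')[-1]
    cover_src = 'https://www.lrb.co.uk/storage/800_filter/images/' + data_appsrc.split('/images/')[-1]
    print(article_src)  # https://www.lrb.co.uk/storage/400_filter/images/2024/may/cover-photo.jpg
    print(cover_src)    # https://www.lrb.co.uk/storage/800_filter/images/2024/may/cover-photo.jpg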
@@ -77,5 +84,4 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
             title = '{}: {}'.format(self.tag_to_string(h3), self.tag_to_string(h4))
             self.log(title, url)
             articles.append({'title': title, 'url': url})
         return [('Articles', articles)]
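
Note: for context, calibre expects parse_index to return a list of (section, articles) tuples, where each article is a dict with at least 'title' and 'url' keys. A minimal shape sketch with made-up values:

    def parse_index_shape():
        articles = [
            {'title': 'Reviewer: Some Book', 'url': 'https://www.lrb.co.uk/the-paper/v00/n00/example'},
        ]
        return [('Articles', articles)]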

File 2 of 2: Wall Street Journal recipe (class WSJ)

@@ -85,6 +85,11 @@ class WSJ(BasicNewsRecipe):
             div = col.findParent('div')
             if div:
                 div.extract()
+        time = soup.find('time')
+        if time:
+            p = time.findParent('div')
+            if p:
+                p.name = 'p'
         return soup

     if not past_edition:
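
Note: the added block demotes the <div> wrapping the article's <time> element to a <p>. Assigning to Tag.name in BeautifulSoup renames the element in place without touching its children; a standalone sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div><time>May 17, 2024</time></div>', 'html.parser')
    t = soup.find('time')
    if t:
        parent = t.findParent('div')
        if parent:
            parent.name = 'p'  # rename the wrapper, children are preserved
    print(soup)  # <p><time>May 17, 2024</time></p>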
@@ -117,30 +122,26 @@ class WSJ(BasicNewsRecipe):
     def parse_index(self):
         index = 'https://bartender.mobile.dowjones.io'
         catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        edit = []
-        for itm in catalog['items']:
-            if itm['type'] == 'ITP':
-                edit.append(itm['key'][3:])
+        edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
         self.log('**Past Editions available :', ', '.join(edit))
         for itm in catalog['items']:
             if past_edition:
                 if itm['key'] == 'ITP' + past_edition:
                     key = itm['key']
                     manifest = itm['manifest']
-                    dt = datetime.fromisoformat(itm['date'][:-1]) + timedelta(seconds=time.timezone)
-                    dt = dt.strftime('%b %d, %Y')
-                    self.log('Downloading Past Edition ', dt)
-                    self.timefmt = ' [' + dt + ']'
+                    date = itm['date']
                     break
             elif itm['type'] == 'ITP':
                 key = itm['key']
                 manifest = itm['manifest']
-                dt = datetime.fromisoformat(itm['date'][:-1]) + timedelta(seconds=time.timezone)
-                dt = dt.strftime('%b %d, %Y')
-                self.log('Downloading ', dt)
-                self.timefmt = ' [' + dt + ']'
                 break
+        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
+        dt = dt.strftime('%b %d, %Y')
+        self.log('Downloading ', dt)
+        self.timefmt = ' [' + dt + ']'

         feeds = []
         manif = json.loads(self.index_to_soup(index + manifest, raw=True))
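
Note: two refactors in this hunk. The past-editions listing collapses into a comprehension, with [1:] dropping the first (current) issue, and the duplicated date formatting is hoisted out of both branches. date[:-1] strips the trailing 'Z', which datetime.fromisoformat() rejects before Python 3.11. A sketch against a hypothetical catalog payload:

    import time
    from datetime import datetime, timedelta

    catalog = {'items': [  # hypothetical payload
        {'type': 'ITP', 'key': 'ITP20240517', 'date': '2024-05-17T09:00:00Z'},
        {'type': 'ITP', 'key': 'ITP20240516', 'date': '2024-05-16T09:00:00Z'},
    ]}
    # keys look like 'ITP<edition>'; [3:] strips the prefix, [1:] drops the current issue
    edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
    print(edit)  # ['20240516']

    date = catalog['items'][0]['date']
    dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
    print(dt.strftime('%b %d, %Y'))  # e.g. May 17, 2024, shifted by the local UTC offset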
@@ -179,6 +180,10 @@ class WSJ(BasicNewsRecipe):
         h1 = soup.find('h1')
         if h1:
             h1['title'] = url['content']
+        h2 = soup.find('h2')
+        if h2:
+            h2['id'] = 'subhed'
+            h2.name = 'p'
         return soup.prettify()

     def populate_article_metadata(self, article, soup, first):
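
Note: the new h2 handling demotes the dek to a paragraph while tagging it with id="subhed", presumably so downstream styling or metadata code can still locate it (the intent is inferred, not stated in the commit). A standalone sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<h1>Headline</h1><h2>The dek line</h2>', 'html.parser')
    h2 = soup.find('h2')
    if h2:
        h2['id'] = 'subhed'  # keep the dek findable by id after the rename
        h2.name = 'p'
    print(soup)  # <h1>Headline</h1><p id="subhed">The dek line</p>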