Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-08-30 23:00:21 -04:00 · 2024-05-17 12:06:44 +05:30 · 2024-05-17 12:06:44 +05:30 · 838a056ad2
commit 838a056ad2
parent 07ed435fd2 6c876fd00a
2 changed files with 42 additions and 31 deletions
--- a/recipes/lrb.recipe
+++ b/recipes/lrb.recipe
@@ -26,18 +26,24 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
    no_stylesheets = True
    delay = 1
    encoding = 'utf-8'
-    INDEX = 'https://www.lrb.co.uk'
+    INDEX = 'https://www.lrb.co.uk/the-paper/'
    publication_type = 'magazine'
-    needs_subscription = True
+    needs_subscription = 'optional'
    requires_version = (3, 0, 0)
+    masthead_url = 'https://www.mylrb.co.uk/out/lrb-2014/img/logo-2x.png'
+    extra_css = '''
+        .article-word-count, #article-tag-holder { font-size:small; color:#202020; }
+        .embedded-image-caption { font-size:small; text-align:center; }
+        blockquote, em { color:#202020; }
+    '''
+    resolve_internal_links = True

    keep_only_tags = [
-        classes('article-header--title paperArticle-reviewsHeader article-content letters-content'),
+        dict(attrs={'id':['article-tag-holder', 'article-heading-holder']}),
+        classes('article-copy article-word-count'),
    ]
-    remove_tags = [
-        classes('social-button article-mask lrb-readmorelink article-send-letter article-share'),
-    ]
-    remove_attributes = ['width', 'height']
+
+    remove_attributes = ['style', 'width', 'height']

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
@@ -52,22 +58,23 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
        return br

    def preprocess_html(self, soup):
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
+        for cap in soup.findAll(**classes('embedded-image-caption')):
+            for p in cap.findAll('p'):
+                p.name = 'div'
        for img in soup.findAll('img', attrs={'data-srcset': True}):
-            for x in img['data-srcset'].split():
-                if '/' in x:
-                    img['src'] = x
+            img['src'] = 'https://www.lrb.co.uk/storage/400_filter/images/' + img['data-appsrc'].split('/images/')[-1]
        return soup

    def parse_index(self):
-        articles = []
        soup = self.index_to_soup(self.INDEX)
-        container = soup.find(attrs={'class': 'issue-grid'})
-        img = container.find('img')
-        self.cover_url = img['data-srcset'].split()[-2]
-        h3 = container.find('h3')
-        self.timefmt = ' [{}]'.format(self.tag_to_string(h3))
-        a = img.findParent('a')
-        soup = self.index_to_soup(absolutize(a['href']))
+        container = soup.find('div', attrs={'class': 'article-issue-cover-image'})
+        if container:
+            self.cover_url = 'https://www.lrb.co.uk/storage/800_filter/images/' + container.img['data-appsrc'].split('/images/')[-1]
+        edition = self.tag_to_string(soup.find('h1', attrs={'class': 'toc-title'}))
+        self.timefmt = ' [{}]'.format(edition)
+        self.log('Downloading: ', edition)
        grid = soup.find(attrs={'class': 'toc-grid-items'})
        articles = []
        for a in grid.findAll(**classes('toc-item')):
@@ -77,5 +84,4 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
            title = '{}: {}'.format(self.tag_to_string(h3), self.tag_to_string(h4))
            self.log(title, url)
            articles.append({'title': title, 'url': url})
-
        return [('Articles', articles)]
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -85,6 +85,11 @@ class WSJ(BasicNewsRecipe):
            div = col.findParent('div')
            if div:
                div.extract()
+        time = soup.find('time')
+        if time:
+            p = time.findParent('div')
+            if p:
+                p.name = 'p'
        return soup

    if not past_edition:
@@ -117,30 +122,26 @@ class WSJ(BasicNewsRecipe):
    def parse_index(self):
        index = 'https://bartender.mobile.dowjones.io'
        catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        edit = []
-        for itm in catalog['items']:
-            if itm['type'] == 'ITP':
-                edit.append(itm['key'][3:])
+        edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
        self.log('**Past Editions available :', ', '.join(edit))
        for itm in catalog['items']:
            if past_edition:
                if itm['key'] == 'ITP' + past_edition:
                    key = itm['key']
                    manifest = itm['manifest']
-                    dt = datetime.fromisoformat(itm['date'][:-1]) + timedelta(seconds=time.timezone)
-                    dt = dt.strftime('%b %d, %Y')
-                    self.log('Downloading Past Edition ', dt)
-                    self.timefmt = ' [' + dt + ']'
+                    date = itm['date']
                    break   
            elif itm['type'] == 'ITP':
                key = itm['key']
                manifest = itm['manifest']
-                dt = datetime.fromisoformat(itm['date'][:-1]) + timedelta(seconds=time.timezone)
-                dt = dt.strftime('%b %d, %Y')
-                self.log('Downloading ', dt)
-                self.timefmt = ' [' + dt + ']'
+                date = itm['date']
                break

+        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
+        dt = dt.strftime('%b %d, %Y')
+        self.log('Downloading ', dt)
+        self.timefmt = ' [' + dt + ']'
+
        feeds = []

        manif = json.loads(self.index_to_soup(index + manifest, raw=True))
@@ -179,6 +180,10 @@ class WSJ(BasicNewsRecipe):
                h1 = soup.find('h1')
                if h1:
                    h1['title'] = url['content']
+            h2 = soup.find('h2')
+            if h2:
+                h2['id'] = 'subhed'
+                h2.name = 'p'
            return soup.prettify()

    def populate_article_metadata(self, article, soup, first):