Times Literary Supplement by unkn0wn

Merge branch 'master' of https://github.com/unkn0w7n/calibre
2025-08-11 09:13:57 -04:00 · 2024-06-26 19:00:37 +05:30 · 2024-06-26 19:00:37 +05:30 · 26e67c96ed
commit 26e67c96ed
parent e88c683aad a9b085ef5e
4 changed files with 135 additions and 15 deletions
--- a/recipes/icons/tls_mag.png
+++ b/recipes/icons/tls_mag.png
--- a/recipes/tls_mag.recipe
+++ b/recipes/tls_mag.recipe
@ -0,0 +1,115 @@
+import json, re
+from calibre import browser
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+
+def re_html(y):
+    """Return the plain text of the HTML fragment *y*.
+
+    Trailing whitespace is stripped from *y* before it is parsed with the
+    "html.parser" backend; the parsed document's ``.text`` is returned.
+    """
+    return BeautifulSoup(y.rstrip(), "html.parser").text
+
+def get_cont(x):
+    """Turn one contents-page item into an article entry.
+
+    *x* is indexed with 'url', 'headline', 'standfirst' and
+    ['byline']['text']. When the byline text is non-empty it is prepended to
+    the standfirst as "By <byline> | ". Headline and description are passed
+    through re_html(), a progress line is printed, and a dict with the keys
+    'title', 'description' and 'url' is returned.
+    """
+    url = x['url']
+    raw_title = x['headline']
+    raw_desc = x['standfirst']
+    byline = x['byline']['text']
+    if byline:
+        raw_desc = 'By ' + byline + ' | ' + raw_desc
+    # Parse each field once and reuse the result for both the progress
+    # output and the returned entry.
+    title = re_html(raw_title)
+    desc = re_html(raw_desc)
+    print('              ', title, '\n\t', desc, '\n\t', url)
+    return {'title': title, 'description': desc, 'url': url}
+
+def get_id(url):
+    """Return the id that follows "?p=" in the response metadata for *url*.
+
+    The page is opened with a fresh calibre browser and the string form of
+    the response's info() is searched for "?p=<id>>".
+    NOTE(review): presumably this matches a shortlink header such as
+    "<https://.../?p=12345>" -- confirm against a live response.
+
+    Raises ValueError when no such reference is present.
+    """
+    rq = browser().open(url)
+    # Raw string: '\?' and '\S' are regex escapes, not Python string escapes.
+    match = re.search(r'\?p=(\S+)>', str(rq.info()))
+    if match is None:
+        raise ValueError('No "?p=<id>" reference found in the response for ' + url)
+    return match.group(1)
+
+
+class tls(BasicNewsRecipe):
+    # Recipe for the current issue of the Times Literary Supplement. The
+    # contents page and each article are fetched as JSON from the site's
+    # wp-json/tls/v2 endpoints and converted to HTML in preprocess_raw_html().
+    title = 'Times Literary Supplement'
+    __author__ = 'unkn0wn'
+    description = (
+        'TLS, world’s leading journal for literature and ideas. Every week, we publish book reviews, book extracts, '
+        'essays and poems from leading writers from around the world. We cover far more than just literature, featuring '
+        'major articles on subjects from anthropology to zoology, philosophy to politics, comedy to psychology. Each week, '
+        'we also review the latest in fiction, film, opera, theatre, dance, radio and television.'
+    )
+    encoding = 'utf-8'
+    language = 'en_GB'
+    masthead_url = 'https://www.the-tls.co.uk/wp-content/uploads/sites/7/2019/11/Smaller-Logo.jpg'
+
+    extra_css = '''
+        .label { font-size:small; color:#404040; }
+        .figc { font-size:small; text-align:center; }
+        .desc { font-style:italic; color:#202020; }
+        .auth { font-size:small; }
+        em, blockquote { color:#202020; }
+        .det { font-size:small; color:#202020; }
+    '''
+
+    def parse_index(self):
+        """Build the list of (section title, articles) feeds for the current issue.
+
+        Also sets cover_url, timefmt and description from the contents-page JSON.
+        """
+        issue_page = 'https://www.the-tls.co.uk/issues/current-issue/'
+        api_url = 'https://www.the-tls.co.uk/wp-json/tls/v2/contents-page/' + get_id(issue_page)
+        data = json.loads(self.index_to_soup(api_url, raw=True))
+
+        self.cover_url = data['featuredimage']['full_image'] + '?w600'
+        dateline = data['issuedateline']
+        self.timefmt = ' [' + dateline['issuedate'] + ']'
+        self.description = 'Issue ' + dateline['issuenumber']
+
+        # The featured article always forms the first feed.
+        editor_note = 'A note from the Editor'
+        self.log(editor_note)
+        feeds = [(editor_note, [get_cont(data['featuredarticle'])])]
+
+        sections = data['contents']
+        for key in sections:
+            entry = sections[key]
+            section = re_html(entry['articleheader']['title'])
+            self.log(section)
+            articles = [get_cont(item) for item in entry['articleslist']]
+            # Sections without any articles are left out of the index.
+            if articles:
+                feeds.append((section, articles))
+        return feeds
+
+    def print_version(self, url):
+        """Map an article URL to its single-article JSON endpoint."""
+        endpoint = 'https://www.the-tls.co.uk/wp-json/tls/v2/single-article/'
+        return endpoint + get_id(url)
+
+    def preprocess_raw_html(self, raw, *a):
+        """Convert the single-article JSON in *raw* into a prettified HTML page.
+
+        The page is assembled in this order: label, headline, standfirst,
+        byline, lead image, book details, article content.
+        """
+        data = json.loads(raw)
+        prim = data['articleIntroPrimary']
+        title = '<h1>' + prim['headline'] + '</h1>\n'
+        desc = '<p class="desc">' + prim['standfirst'] + '</p>\n'
+
+        # Label: the article type, followed by the category when one is given.
+        category = prim['label']['category']['text']
+        label_text = prim['label']['articletype']
+        if category:
+            label_text = label_text + ' | ' + category
+        label = '<div class="label">{}</div>\n'.format(label_text)
+
+        auth = ''
+        byline = prim['byline']['text']
+        if byline:
+            auth = '<p class="auth"><a href="{}">'.format(prim['byline']['link']) + byline + '</a></p>\n'
+
+        # Book details: only string values are rendered; 'imageurl' becomes an
+        # image, every other non-empty string becomes a detail line.
+        book_parts = []
+        for book in data['bookdetails'] or ():
+            book_parts.append('<br>')
+            for key, val in book.items():
+                if not isinstance(val, str):
+                    continue
+                if key == 'imageurl':
+                    book_parts.append('<img src="{}">'.format(val))
+                elif val:
+                    book_parts.append('<div class="det">' + val + '</div>\n')
+            book_parts.append('<br>')
+        bks = ''.join(book_parts)
+
+        lede = ''
+        lead = data['leadimage']
+        if 'full_image' in lead and lead['full_image']:
+            caption = lead['imagecaption'] + ' <i>' + lead['imagecredit'] + '</i>'
+            lede = '<br><img src="{}"><div class="figc">{}</div>'.format(
+                lead['full_image'] + '?w600', caption
+            )
+
+        body = data['content']
+
+        html = ''.join((
+            '<html><body><div>',
+            label, title, desc, auth, lede, bks, body,
+            '</div></body></html>',
+        ))
+        return BeautifulSoup(html).prettify()
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@ -13,12 +13,16 @@ past_edition = None

 def media_bucket(x):
    if x.get('type', '') == 'image':
-        return '<img src="{}"><div class="figc">{}</div>\n'.format(
+        if x.get('subtype', '') == 'graphic':
+            return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
                x['manifest-url'], x['caption'] + '<i> ' + x['credit'] + '</i>'
            )
+        return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
+            x['manifest-url'].split('?')[0] + '?width=600', x['caption'] + '<i> ' + x['credit'] + '</i>'
+        )
    if x.get('type', '') == 'video':
-        return '<a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
-            x['share_link'], x['thumbnail_url'], x['caption'] + '<i> ' + x['credit'] + '</i>'
+        return '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
+            x['share_link'], x['thumbnail_url'].split('?')[0] + '?width=600', x['caption'] + '<i> ' + x['credit'] + '</i>'
        )
    return

@ -90,7 +94,6 @@ class WSJ(BasicNewsRecipe):
                m_itm = soup.findAll('panel', attrs={'class':'media-item'})
                if i_lst and m_itm:
                    for x, y in list(zip_longest(m_itm, i_lst)):
-                        x.name = 'p'
                        x.insert_after(BeautifulSoup(y, 'html.parser'))
        return soup

@ -141,9 +144,9 @@ class WSJ(BasicNewsRecipe):
                break

        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
-        dt = dt.strftime('%b %d, %Y')
-        self.log('Downloading ', dt)
-        self.timefmt = ' [' + dt + ']'
+        dt_ = dt.strftime('%b %d, %Y')
+        self.log('Downloading ', dt_)
+        self.timefmt = ' [' + dt_ + ']'

        feeds = []

@ -153,7 +156,7 @@ class WSJ(BasicNewsRecipe):
                if '-pages_' in k:
                    section = k.split('-pages_')[0].replace('_', ' ')
                    if 'MAGAZINE' in section:
-                        if not datetime.now().strftime("%d") == 1:
+                        if not dt.strftime('%d') == 1:
                            continue
                        self.log('Loading Magazine section')
                    self.log(section)
--- a/recipes/wsj_mag.recipe
+++ b/recipes/wsj_mag.recipe
@ -9,16 +9,19 @@ from calibre.web.feeds.news import BasicNewsRecipe, classes

 def media_bucket(x):
    if x.get('type', '') == 'image':
-        return '<img src="{}"><div class="figc">{}</div>\n'.format(
+        if x.get('subtype', '') == 'graphic':
+            return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
                x['manifest-url'], x['caption'] + '<i> ' + x['credit'] + '</i>'
            )
+        return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
+            x['manifest-url'].split('?')[0] + '?width=600', x['caption'] + '<i> ' + x['credit'] + '</i>'
+        )
    if x.get('type', '') == 'video':
-        return '<a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
-            x['share_link'], x['thumbnail_url'], x['caption'] + '<i> ' + x['credit'] + '</i>'
+        return '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
+            x['share_link'], x['thumbnail_url'].split('?')[0] + '?width=600', x['caption'] + '<i> ' + x['credit'] + '</i>'
        )
    return

-
 class WSJ(BasicNewsRecipe):
    title = 'WSJ. Magazine'
    __author__ = 'unkn0wn'
@ -87,7 +90,6 @@ class WSJ(BasicNewsRecipe):
                m_itm = soup.findAll('panel', attrs={'class':'media-item'})
                if i_lst and m_itm:
                    for x, y in list(zip_longest(m_itm, i_lst)):
-                        x.name = 'p'
                        x.insert_after(BeautifulSoup(y, 'html.parser'))
        return soup