Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-08-11 09:13:57 -04:00 · 2024-09-28 11:08:12 +05:30 · 2024-09-28 11:08:12 +05:30 · c4dfd28e8a
commit c4dfd28e8a
parent f0e42c3bf3 1ab50ec0ea
1 changed files with 37 additions and 54 deletions
--- a/recipes/india_today.recipe
+++ b/recipes/india_today.recipe
@ -1,23 +1,9 @@
 #!/usr/bin/env python
-# vim:fileencoding=utf-8
-
-from calibre.ebooks.BeautifulSoup import Tag
+import re
+import json
 from calibre.web.feeds.news import BasicNewsRecipe


-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(attrs={
-        'class': lambda x: x and frozenset(x.split()).intersection(q)})
-
-
-def new_tag(soup, name, attrs=()):
-    impl = getattr(soup, 'new_tag', None)
-    if impl is not None:
-        return impl(name, attrs=dict(attrs))
-    return Tag(soup, name, attrs=attrs or None)
-
-
 class IndiaToday(BasicNewsRecipe):
    title = u'India Today Magazine'
    language = 'en_IN'
@ -33,21 +19,13 @@ class IndiaToday(BasicNewsRecipe):
    masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png'

    extra_css = '''
-        #sub-d {font-style:italic; color:#202020;}
-        .story__byline {font-size:small; text-align:left;}
-        .body_caption, .mos__alt .caption, .caption-drupal-entity {font-size:small; text-align:center;}
-        blockquote{color:#404040;}
+        #sub-h {font-style:italic; color:#202020;}
+        .body_caption, #imgcap, .mos__alt .caption, .caption-drupal-entity, .calibre-nuked-tag-figcaption {font-size:small; text-align:center;}
+        #author, .authors__container {font-size:small;}
+        blockquote {color:#404040;}
    '''

-    remove_tags = [
-            classes('checkout__section sharing align-center-button amp-izooto-sub ads__container inline-story-add amp-ad readmore__box'),
-            dict(name=(('amp-web-push-widget', 'amp-ad'))),
-            dict(attrs={'id':'tab-link-wrapper-plugin'}),
-            dict(name='div', attrs={'amp-access':'NOT granted'})
-        ]
-
-    def preprocess_raw_html(self, raw_html, url):
-        return raw_html.replace('â€”', '--')
+    remove_tags = [dict(attrs={'id': ['tab-link-wrapper-plugin']})]  # key must be the string 'id', not the builtin id()

    recipe_specific_options = {
        'date': {
@ -105,32 +83,37 @@ class IndiaToday(BasicNewsRecipe):
        return sorted(sections.items(), key=sort_key)

    def preprocess_html(self, soup):
-        if soup.find('div', attrs={'amp-access':'granted'}) is not None:
-            keep_only_tags = [
-                classes('strytitle strykicker story__byline srtymos'),
-                dict(name='div', attrs={'amp-access':'granted'}),
-            ]
-        else:
-            keep_only_tags = [
-                classes('strytitle strykicker story__byline srtymos'),
-                dict(name='div', attrs={'class':'description'}),
-            ]
-        body = new_tag(soup, 'body')
-        for spec in keep_only_tags:
-            for tag in soup.find('body').findAll(**spec):
-                body.insert(len(body.contents), tag)
-        soup.find('body').replaceWith(body)
-
-        for img in soup.findAll('amp-img'):
-            if not img.find('img'):
-                img.name = 'img'
-        h2 = soup.find('h2')
-        if h2:
-            h2.name = 'p'
-            h2['id'] = 'sub-d'
        for quo in soup.findAll(attrs={'class':'quotes'}):
            quo.name = 'blockquote'
        return soup

-    def print_version(self, url):
-        return url.replace('.in/','.in/amp/')
+    def preprocess_raw_html(self, raw, *a):
+        """Rebuild the article HTML from the page's embedded __NEXT_DATA__ JSON.
+
+        Returns ``raw`` unchanged when the JSON payload cannot be located,
+        instead of failing with AttributeError on a ``None`` match.
+        """
+        m = re.search('id="__NEXT_DATA__" type="application/json">', raw)
+        if m is None:
+            return raw
+        # The pattern ends at the tag's closing '>', so the JSON starts at m.end().
+        data = json.JSONDecoder().raw_decode(raw[m.end():])[0]
+        data = data['props']['pageProps']['initialState']['server']['page_data']
+        title, body = data['title'], '<div>' + data['description'] + '</div>'
+        # Optional fields: missing, None and empty values all render as nothing.
+        slug = desc = image = imagecap = ''
+        if data.get('slug'):
+            slug = '<div>' + data['slug'] + '</div>\n'
+        if data.get('description_short'):
+            desc = '<p id="sub-h">' + data['description_short'] + '</p>\n'
+        author = ', '.join(names['title'] for names in data.get('author') or ())
+        city = data.get('city') or ''
+        date = data.get('datetime_updated') or ''
+        if data.get('image_main'):
+            image = '<br/><img src="{}">'.format(data['image_main'])
+            if data.get('image_caption'):
+                imagecap = '<div id="imgcap">' + data['image_caption'] + '</div>'
+
+        return ('<html><body>' + slug + '<h1>' + title + '</h1>\n' + desc
+                + '<div id="author">' + author + '<span> ' + city + ' UPDATED: ' + date
+                + '</span></div>\n' + image + imagecap + body + '</body></html>')