From 1ab50ec0eaf62c4567f4f7999863f5c3fcfe9ce7 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 28 Sep 2024 10:59:44 +0530 Subject: [PATCH] Update india_today.recipe --- recipes/india_today.recipe | 91 ++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 54 deletions(-) diff --git a/recipes/india_today.recipe b/recipes/india_today.recipe index c8e648e93b..23ec834516 100644 --- a/recipes/india_today.recipe +++ b/recipes/india_today.recipe @@ -1,23 +1,9 @@ #!/usr/bin/env python -# vim:fileencoding=utf-8 - -from calibre.ebooks.BeautifulSoup import Tag +import re +import json from calibre.web.feeds.news import BasicNewsRecipe -def classes(classes): - q = frozenset(classes.split(' ')) - return dict(attrs={ - 'class': lambda x: x and frozenset(x.split()).intersection(q)}) - - -def new_tag(soup, name, attrs=()): - impl = getattr(soup, 'new_tag', None) - if impl is not None: - return impl(name, attrs=dict(attrs)) - return Tag(soup, name, attrs=attrs or None) - - class IndiaToday(BasicNewsRecipe): title = u'India Today Magazine' language = 'en_IN' @@ -33,21 +19,13 @@ class IndiaToday(BasicNewsRecipe): masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png' extra_css = ''' - #sub-d {font-style:italic; color:#202020;} - .story__byline {font-size:small; text-align:left;} - .body_caption, .mos__alt .caption, .caption-drupal-entity {font-size:small; text-align:center;} - blockquote{color:#404040;} + #sub-h {font-style:italic; color:#202020;} + .body_caption, #imgcap, .mos__alt .caption, .caption-drupal-entity, .calibre-nuked-tag-figcaption {font-size:small; text-align:center;} + #author, .authors__container {font-size:small;} + blockquote {color:#404040;} ''' - remove_tags = [ - classes('checkout__section sharing align-center-button amp-izooto-sub ads__container inline-story-add amp-ad readmore__box'), - dict(name=(('amp-web-push-widget', 'amp-ad'))), - dict(attrs={'id':'tab-link-wrapper-plugin'}), - dict(name='div', attrs={'amp-access':'NOT granted'}) - ] - - def preprocess_raw_html(self, raw_html, url): - return raw_html.replace('—', '--') + remove_tags = [dict(attrs={id:['tab-link-wrapper-plugin']})] recipe_specific_options = { 'date': { @@ -105,32 +83,37 @@ class IndiaToday(BasicNewsRecipe): return sorted(sections.items(), key=sort_key) def preprocess_html(self, soup): - if soup.find('div', attrs={'amp-access':'granted'}) is not None: - keep_only_tags = [ - classes('strytitle strykicker story__byline srtymos'), - dict(name='div', attrs={'amp-access':'granted'}), - ] - else: - keep_only_tags = [ - classes('strytitle strykicker story__byline srtymos'), - dict(name='div', attrs={'class':'description'}), - ] - body = new_tag(soup, 'body') - for spec in keep_only_tags: - for tag in soup.find('body').findAll(**spec): - body.insert(len(body.contents), tag) - soup.find('body').replaceWith(body) - - for img in soup.findAll('amp-img'): - if not img.find('img'): - img.name = 'img' - h2 = soup.find('h2') - if h2: - h2.name = 'p' - h2['id'] = 'sub-d' for quo in soup.findAll(attrs={'class':'quotes'}): quo.name = 'blockquote' return soup - def print_version(self, url): - return url.replace('.in/','.in/amp/') + def preprocess_raw_html(self, raw, *a): + m = re.search('id="__NEXT_DATA__" type="application/json">', raw) + raw = raw[m.start():] + raw = raw.split('>', 1)[1] + data = json.JSONDecoder().raw_decode(raw)[0] + data = data['props']['pageProps']['initialState']['server']['page_data'] + title = data['title'] + body = '
' + data['description'] + '
' + + slug = desc = image = author = date = imagecap = city = '' + + if 'slug' in data: + slug = '
' + data['slug'] + '
\n' + if 'description_short' in data: + desc = '

' + data['description_short'] + '

\n' + if data.get('author'): + author = ''.join([names['title'] for names in data['author']]) + if 'city' in data: + city = data['city'] + if 'datetime_updated' in data: + date = data['datetime_updated'] + if 'image_main' in data: + image = '
'.format(data['image_main']) + if 'image_caption' in data: + imagecap = '
' + data['image_caption'] + '
' + + html = '' + slug + '

' + title + '

\n' + desc + '
'\ + + author + ' ' + city + ' UPDATED: ' + date + '
\n' + image + imagecap + body\ + + '' + return html