Update The Week

This commit is contained in:
Kovid Goyal 2022-04-04 15:14:51 +05:30
parent 7944a8a022
commit 7f79f21f1d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 32 additions and 14 deletions

View File

@@ -63,6 +63,11 @@ class FE_India(BasicNewsRecipe):
         ('Money','https://www.financialexpress.com/money/feed'),
     ]
 
+    def get_cover_url(self):
+        soup = self.index_to_soup('https://www.magzter.com/IN/The-Indian-Express-Ltd./Financial-Express-Mumbai/Business/')
+        for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')):
+            return citem['content']
+
     def preprocess_html(self, soup, *a):
         for img in soup.findAll(attrs={'data-src': True}):
             img['src'] = img['data-src']

View File

@@ -49,6 +49,11 @@ class HindustanTimes(BasicNewsRecipe):
         # ('Budget',''https://www.hindustantimes.com/feeds/rss/budget/rssfeed.xml')
     ]
 
+    def get_cover_url(self):
+        soup = self.index_to_soup('https://www.magzter.com/IN/HT-Digital-Streams-Ltd./Hindustan-Times-Delhi/Newspaper/')
+        for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')):
+            return citem['content']
+
     def preprocess_html(self, soup):
         for img in soup.findAll('img', attrs={'data-src': True}):
             img['src'] = img['data-src']

View File

@@ -35,6 +35,13 @@ class IndiaToday(BasicNewsRecipe):
         ('Sports','https://www.indiatoday.in/rss/1206518'),
     ]
 
+    extra_css = '[itemprop^="description"] {font-size: small; font-style: italic;}'
+
+    def get_cover_url(self):
+        soup = self.index_to_soup('https://www.magzter.com/IN/India-Today-Group/India-Today/News/')
+        for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')):
+            return citem['content']
+
     def preprocess_raw_html(self, raw_html, url):
         from calibre.ebooks.BeautifulSoup import BeautifulSoup
         soup = BeautifulSoup(raw_html)

View File

@@ -2,7 +2,6 @@
 # vim:fileencoding=utf-8
 # License: GPLv3 Copyright: 2021, Kovid Goyal <kovid at kovidgoyal.net>
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -20,6 +19,7 @@ class TheWeek(BasicNewsRecipe):
     no_stylesheets = True
     use_embedded_content = True
     ignore_duplicate_articles = {'url'}
+    remove_attributes = ['style', 'align', 'border', 'hspace']
 
     feeds = [
         ('Cover Story', 'https://www.theweek.in/theweek/cover.rss'),
@@ -34,23 +34,24 @@ class TheWeek(BasicNewsRecipe):
     ]
 
     def get_cover_url(self):
-        soup = self.index_to_soup('https://www.theweek.in/theweek.html')
-        for img in soup.findAll('img', attrs={'data-src-web': lambda x: x and '/cover-magazine' in x}):
-            src = img['data-src-web']
-            try:
-                idx = src.rfind('.image.')
-            except Exception:
-                pass
-            else:
-                if idx > -1:
-                    src = src[:idx]
-            return 'https://img.theweek.in' + src
+        soup = self.index_to_soup(
+            'https://www.magzter.com/IN/Malayala_Manorama/THE_WEEK/Business/'
+        )
+        for citem in soup.findAll(
+            'meta', content=lambda s: s and s.endswith('view/3.jpg')
+        ):
+            return citem['content']
 
     def preprocess_html(self, soup):
         a = soup.find('a')
-        a.name = 'div'
+        if a:
+            a.name = 'div'
         h2 = soup.find('h2')
-        h2.string = fix_title(h2.string)
+        if h2:
+            h2.string = fix_title(h2.string)
+        for p in soup.findAll('p'):
+            if p.string == '\xa0':
+                p.decompose()
         return soup
 
     def populate_article_metadata(self, article, soup, first):