mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Gets today's front-page image displayed on the Hindu ePaper website. Also adds some styles to the lead image and caption.
129 lines
4.8 KiB
Plaintext
from __future__ import with_statement
|
|
__license__ = 'GPL 3'
|
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
import string
|
|
|
|
|
|
def classes(classes):
    '''Build a BeautifulSoup attrs-matcher for a space-separated class list.

    Returns a dict suitable for splicing into soup queries: its ``attrs``
    entry matches any tag whose ``class`` attribute shares at least one
    name with the given list.
    '''
    wanted = frozenset(classes.split(' '))

    def matches(value):
        # value is the tag's class string (or None when absent)
        return value and frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': matches})
|
|
|
|
|
|
class TheHindu(BasicNewsRecipe):
    '''Recipe for the daily edition of The Hindu (today's paper).

    Scrapes the section index from thehindu.com/todays-paper/ and the
    cover image from the companion ePaper site.
    '''
    title = u'The Hindu'
    language = 'en_IN'
    epaper_url = 'https://epaper.thehindu.com'

    oldest_article = 1
    __author__ = 'Kovid Goyal'
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_attributes = ['style']
    extra_css = '.lead-img-cont { text-align: center; } ' \
        '.lead-img-caption { font-size: small; font-style: italic; }'

    ignore_duplicate_articles = {'title', 'url'}
    keep_only_tags = [
        dict(name='h1', attrs={'class': ['title', 'special-article-heading']}),
        classes('author-nm lead-img-cont mobile-author-cont photo-collage intro'),
        dict(id=lambda x: x and x.startswith('content-body-')),
    ]

    def get_browser(self):
        # The ePaper server rejects the cover-image request unless a
        # Referer from its own site is present
        br = BasicNewsRecipe.get_browser(self)
        br.addheaders += [('Referer', self.epaper_url)]  # needed for fetching cover
        return br

    def get_cover_url(self):
        # The endpoint returns a quoted string with escaped slashes;
        # un-escape them and strip the surrounding quote characters
        url = self.index_to_soup(self.epaper_url + '/Login/DefaultImage', raw=True)
        return url.replace(br'\\', b'/').decode('utf-8')[1:-1]

    def preprocess_html(self, soup):
        # Promote the first <source> srcset candidate to the lead image's
        # src, then drop all <source> tags so only one URL remains.
        # Best-effort: any missing element or malformed srcset is ignored.
        img = soup.find('img', attrs={'class': 'lead-img'})
        try:
            for i, source in enumerate(tuple(img.parent.findAll('source', srcset=True))):
                if i == 0:
                    img['src'] = source['srcset'].split()[0]
                source.extract()
        except Exception:
            pass
        # Remove duplicate intro
        for h in soup.findAll('h2', attrs={'class': 'intro'})[1:]:
            h.extract()
        return soup

    def populate_article_metadata(self, article, soup, first):
        # Use the page's meta description as the article summary, unless
        # it is the generic "Todays paper" boilerplate
        try:
            desc = soup.find('meta', attrs={'name': 'description'}).get('content')
            if not desc.startswith('Todays paper'):
                desc += '...' if len(desc) >= 199 else ''  # indicate truncation
                article.text_summary = article.summary = desc
        except AttributeError:
            return

    def articles_from_soup(self, soup):
        # Collect {title, url} entries from a section page's archive lists
        ans = []
        div = soup.find('section', attrs={'id': 'section_'})
        if div is None:
            return ans
        for ul in div.findAll('ul', attrs={'class': 'archive-list'}):
            for x in ul.findAll(['a']):
                title = self.tag_to_string(x)
                url = x.get('href', False)
                if not url or not title:
                    continue
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                ans.append({
                    'title': title,
                    'url': url,
                    'description': '',
                    'date': ''})
        return ans

    def parse_index(self):
        # Build the feed list: one feed per accepted section in the
        # today's-paper navigation bar
        soup = self.index_to_soup('http://www.thehindu.com/todays-paper/')
        nav_div = soup.find(id='subnav-tpbar-latest')
        section_list = []

        # Finding all the section titles that are acceptable
        for x in nav_div.findAll(['a']):
            if self.is_accepted_entry(x):
                section_list.append(
                    (string.capwords(self.tag_to_string(x)), x['href']))
        feeds = []

        # For each section title, fetch the article urls
        for section in section_list:
            section_title = section[0]
            section_url = section[1]
            self.log('Found section:', section_title, section_url)
            soup = self.index_to_soup(section_url)
            articles = self.articles_from_soup(soup)
            if articles:
                feeds.append((section_title, articles))

        return feeds

    def is_accepted_entry(self, entry):
        # Those sections in the top nav bar that we will omit
        # (regional editions and supplements)
        omit_list = (
            'tp-tamilnadu', 'tp-karnataka', 'tp-kerala', 'tp-andhrapradesh',
            'tp-telangana', 'tp-newdelhi', 'tp-mumbai', 'tp-otherstates',
            'tp-in-school', 'tp-metroplus', 'tp-youngworld', 'tp-fridayreview',
            'tp-downtown', 'tp-bookreview', 'tp-others')
        # str.endswith accepts a tuple: true when the href (minus its
        # trailing slash) matches none of the omitted section slugs
        return not entry['href'][0:-1].endswith(omit_list)
|