mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-11 09:13:57 -04:00
Merge branch 'master' of https://github.com/unkn0w7n/calibre
This commit is contained in:
commit
15096708d6
@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=utf-8
|
||||
from datetime import datetime, timedelta
|
||||
# from datetime import datetime, timedelta
|
||||
|
||||
from calibre.utils.date import parse_date
|
||||
# from calibre.utils.date import parse_date
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||
|
||||
|
||||
@ -11,7 +11,6 @@ class IndianExpress(BasicNewsRecipe):
|
||||
language = 'en_IN'
|
||||
__author__ = 'unkn0wn'
|
||||
oldest_article = 1.15 # days
|
||||
max_articles_per_feed = 25
|
||||
encoding = 'utf-8'
|
||||
masthead_url = 'https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg'
|
||||
no_stylesheets = True
|
||||
@ -49,84 +48,118 @@ class IndianExpress(BasicNewsRecipe):
|
||||
)
|
||||
]
|
||||
|
||||
def parse_index(self):
|
||||
recipe_specific_options = {
|
||||
'days': {
|
||||
'short': 'Oldest article to download from this news source. In days ',
|
||||
'long': 'For example, 0.5, gives you articles from the past 12 hours',
|
||||
'default': str(oldest_article),
|
||||
},
|
||||
'res': {
|
||||
'short': 'For hi-res images, select a resolution from the\nfollowing options: 400, 800, 1200, 1600',
|
||||
'long': 'This is useful for non e-ink devices.',
|
||||
'default': '600',
|
||||
},
|
||||
}
|
||||
|
||||
section_list = [
|
||||
('Daily Briefing', 'https://indianexpress.com/section/live-news/'),
|
||||
('Front Page', 'https://indianexpress.com/print/front-page/'),
|
||||
('India', 'https://indianexpress.com/section/india/'),
|
||||
# ('Express Network', 'https://indianexpress.com/print/express-network/'),
|
||||
('Delhi Confidential', 'https://indianexpress.com/section/delhi-confidential/'),
|
||||
('Editorials', 'https://indianexpress.com/section/opinion/editorials/'),
|
||||
('Columns', 'https://indianexpress.com/section/opinion/columns/'),
|
||||
('UPSC-CSE Key', 'https://indianexpress.com/section/upsc-current-affairs/'),
|
||||
('Explained', 'https://indianexpress.com/section/explained/'),
|
||||
('Business', 'https://indianexpress.com/section/business/'),
|
||||
# ('Political Pulse', 'https://indianexpress.com/section/political-pulse/'),
|
||||
('Sunday Eye', 'https://indianexpress.com/section/express-sunday-eye/'),
|
||||
('World', 'https://indianexpress.com/section/world/'),
|
||||
# ('Education', 'https://indianexpress.com/section/education/'),
|
||||
# ('Gadgets', 'https://indianexpress.com/section/technology/gadgets/'),
|
||||
('Tech Review', 'https://indianexpress.com/section/technology/tech-reviews/'),
|
||||
# ('Techhook', 'https://indianexpress.com/section/technology/techook/'),
|
||||
# ('Laptops', 'https://indianexpress.com/section/technology/laptops/'),
|
||||
# ('Mobiles & Tabs', 'https://indianexpress.com/section/technology/mobile-tabs/'),
|
||||
('Science', 'https://indianexpress.com/section/technology/science/'),
|
||||
('Movie Review', 'https://indianexpress.com/section/entertainment/movie-review/'),
|
||||
]
|
||||
def __init__(self, *args, **kwargs):
|
||||
BasicNewsRecipe.__init__(self, *args, **kwargs)
|
||||
d = self.recipe_specific_options.get('days')
|
||||
if d and isinstance(d, str):
|
||||
self.oldest_article = float(d)
|
||||
|
||||
feeds = []
|
||||
feeds = [
|
||||
'https://indianexpress.com/section/opinion/feed',
|
||||
'https://indianexpress.com/section/delhi-confidential/feed',
|
||||
'https://indianexpress.com/section/india/feed',
|
||||
'https://indianexpress.com/section/political-pulse/feed',
|
||||
'https://indianexpress.com/section/explained/feed',
|
||||
'https://indianexpress.com/section/business/feed/',
|
||||
'https://indianexpress.com/section/upsc-current-affairs/feed',
|
||||
'https://indianexpress.com/section/express-sunday-eye/feed',
|
||||
'http://indianexpress.com/section/world/feed',
|
||||
'https://indianexpress.com/section/technology/feed',
|
||||
'https://indianexpress.com/section/entertainment/feed',
|
||||
'https://indianexpress.com/feed',
|
||||
]
|
||||
|
||||
# For each section title, fetch the article urls
|
||||
for section in section_list:
|
||||
section_title = section[0]
|
||||
section_url = section[1]
|
||||
self.log(section_title, section_url)
|
||||
soup = self.index_to_soup(section_url)
|
||||
if '/world/' in section_url or '/explained/' in section_url:
|
||||
articles = self.articles_from_page(soup)
|
||||
else:
|
||||
articles = self.articles_from_soup(soup)
|
||||
if articles:
|
||||
feeds.append((section_title, articles))
|
||||
return feeds
|
||||
# def parse_index(self):
|
||||
|
||||
def articles_from_page(self, soup):
|
||||
ans = []
|
||||
for div in soup.findAll(attrs={'class': ['northeast-topbox', 'explained-section-grid']}):
|
||||
for a in div.findAll('a', href=True):
|
||||
if not a.find('img') and '/section/' not in a['href']:
|
||||
url = a['href']
|
||||
title = self.tag_to_string(a)
|
||||
self.log('\t', title, '\n\t\t', url)
|
||||
ans.append({'title': title, 'url': url, 'description': ''})
|
||||
return ans
|
||||
# section_list = [
|
||||
# ('Daily Briefing', 'https://indianexpress.com/section/live-news/'),
|
||||
# ('Front Page', 'https://indianexpress.com/print/front-page/'),
|
||||
# ('India', 'https://indianexpress.com/section/india/'),
|
||||
# # ('Express Network', 'https://indianexpress.com/print/express-network/'),
|
||||
# ('Delhi Confidential', 'https://indianexpress.com/section/delhi-confidential/'),
|
||||
# ('Editorials', 'https://indianexpress.com/section/opinion/editorials/'),
|
||||
# ('Columns', 'https://indianexpress.com/section/opinion/columns/'),
|
||||
# ('UPSC-CSE Key', 'https://indianexpress.com/section/upsc-current-affairs/'),
|
||||
# ('Explained', 'https://indianexpress.com/section/explained/'),
|
||||
# ('Business', 'https://indianexpress.com/section/business/'),
|
||||
# # ('Political Pulse', 'https://indianexpress.com/section/political-pulse/'),
|
||||
# ('Sunday Eye', 'https://indianexpress.com/section/express-sunday-eye/'),
|
||||
# ('World', 'https://indianexpress.com/section/world/'),
|
||||
# # ('Education', 'https://indianexpress.com/section/education/'),
|
||||
# # ('Gadgets', 'https://indianexpress.com/section/technology/gadgets/'),
|
||||
# ('Tech Review', 'https://indianexpress.com/section/technology/tech-reviews/'),
|
||||
# # ('Techhook', 'https://indianexpress.com/section/technology/techook/'),
|
||||
# # ('Laptops', 'https://indianexpress.com/section/technology/laptops/'),
|
||||
# # ('Mobiles & Tabs', 'https://indianexpress.com/section/technology/mobile-tabs/'),
|
||||
# ('Science', 'https://indianexpress.com/section/technology/science/'),
|
||||
# ('Movie Review', 'https://indianexpress.com/section/entertainment/movie-review/'),
|
||||
# ]
|
||||
|
||||
def articles_from_soup(self, soup):
|
||||
ans = []
|
||||
div = soup.find('div', attrs={'class': ['nation', 'o-opin', 'myie-nation', 'opinion-more-wrapper']})
|
||||
for art in div.findAll(
|
||||
attrs={'class': ['articles', 'o-opin-article', 'myie-articles']}
|
||||
):
|
||||
for a in art.findAll('a', href=True):
|
||||
if not a.find('img') and not any(
|
||||
x in a['href'] for x in ['/profile/', '/agency/', '/section/']
|
||||
):
|
||||
url = a['href']
|
||||
title = self.tag_to_string(a)
|
||||
desc = ''
|
||||
if p := (art.find('p') or art.find(attrs={'class': 'opinion-news-para'})):
|
||||
desc = self.tag_to_string(p)
|
||||
if da := art.find(
|
||||
attrs={'class': ['date', 'o-opin-date', 'opinion-date', 'my-time']}
|
||||
):
|
||||
date = parse_date(self.tag_to_string(da)).replace(tzinfo=None)
|
||||
today = datetime.now()
|
||||
if (today - date) > timedelta(self.oldest_article):
|
||||
continue
|
||||
self.log('\t', title, '\n\t', desc, '\n\t\t', url)
|
||||
ans.append({'title': title, 'url': url, 'description': desc})
|
||||
return ans
|
||||
# feeds = []
|
||||
|
||||
# # For each section title, fetch the article urls
|
||||
# for section in section_list:
|
||||
# section_title = section[0]
|
||||
# section_url = section[1]
|
||||
# self.log(section_title, section_url)
|
||||
# soup = self.index_to_soup(section_url)
|
||||
# if '/world/' in section_url or '/explained/' in section_url:
|
||||
# articles = self.articles_from_page(soup)
|
||||
# else:
|
||||
# articles = self.articles_from_soup(soup)
|
||||
# if articles:
|
||||
# feeds.append((section_title, articles))
|
||||
# return feeds
|
||||
|
||||
# def articles_from_page(self, soup):
|
||||
# ans = []
|
||||
# for div in soup.findAll(attrs={'class': ['northeast-topbox', 'explained-section-grid']}):
|
||||
# for a in div.findAll('a', href=True):
|
||||
# if not a.find('img') and '/section/' not in a['href']:
|
||||
# url = a['href']
|
||||
# title = self.tag_to_string(a)
|
||||
# self.log('\t', title, '\n\t\t', url)
|
||||
# ans.append({'title': title, 'url': url, 'description': ''})
|
||||
# return ans
|
||||
|
||||
# def articles_from_soup(self, soup):
|
||||
# ans = []
|
||||
# div = soup.find('div', attrs={'class': ['nation', 'o-opin', 'myie-nation', 'opinion-more-wrapper']})
|
||||
# for art in div.findAll(
|
||||
# attrs={'class': ['articles', 'o-opin-article', 'myie-articles']}
|
||||
# ):
|
||||
# for a in art.findAll('a', href=True):
|
||||
# if not a.find('img') and not any(
|
||||
# x in a['href'] for x in ['/profile/', '/agency/', '/section/']
|
||||
# ):
|
||||
# url = a['href']
|
||||
# title = self.tag_to_string(a)
|
||||
# desc = ''
|
||||
# if p := (art.find('p') or art.find(attrs={'class': 'opinion-news-para'})):
|
||||
# desc = self.tag_to_string(p)
|
||||
# if da := art.find(
|
||||
# attrs={'class': ['date', 'o-opin-date', 'opinion-date', 'my-time']}
|
||||
# ):
|
||||
# date = parse_date(self.tag_to_string(da)).replace(tzinfo=None)
|
||||
# today = datetime.now()
|
||||
# if (today - date) > timedelta(self.oldest_article):
|
||||
# continue
|
||||
# self.log('\t', title, '\n\t', desc, '\n\t\t', url)
|
||||
# ans.append({'title': title, 'url': url, 'description': desc})
|
||||
# return ans
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(
|
||||
@ -136,7 +169,11 @@ class IndianExpress(BasicNewsRecipe):
|
||||
return citem['content'].replace('300', '600')
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
if h2 := (soup.find(attrs={'itemprop': 'description'}) or soup.find(**classes('synopsis'))):
|
||||
width = '600'
|
||||
w = self.recipe_specific_options.get('res')
|
||||
if w and isinstance(w, str):
|
||||
width = w
|
||||
if h2 := (soup.find(attrs={'itemprop': 'description'}) or soup.find(**classes('synopsis top-description'))):
|
||||
h2.name = 'p'
|
||||
h2['id'] = 'sub-d'
|
||||
for span in soup.findAll(
|
||||
@ -144,12 +181,12 @@ class IndianExpress(BasicNewsRecipe):
|
||||
):
|
||||
span['id'] = 'img-cap'
|
||||
for img in soup.findAll('img', attrs={'data-src': True}):
|
||||
img['src'] = img['data-src']
|
||||
if span := soup.find('span', content=True, attrs={'itemprop': 'dateModified'}):
|
||||
date = parse_date(span['content']).replace(tzinfo=None)
|
||||
today = datetime.now()
|
||||
if (today - date) > timedelta(self.oldest_article):
|
||||
self.abort_article('Skipping old article')
|
||||
img['src'] = img['data-src'].split('?')[0] + '?w=' + width
|
||||
# if span := soup.find('span', content=True, attrs={'itemprop': 'dateModified'}):
|
||||
# date = parse_date(span['content']).replace(tzinfo=None)
|
||||
# today = datetime.now()
|
||||
# if (today - date) > timedelta(self.oldest_article):
|
||||
# self.abort_article('Skipping old article')
|
||||
for img in soup.findAll('img', attrs={'src': True}):
|
||||
img['src'] = img['src'].split('?')[0] + '?w=600'
|
||||
img['src'] = img['src'].split('?')[0] + '?w=' + width
|
||||
return soup
|
||||
|
@ -1,30 +1,31 @@
|
||||
#!/usr/bin/env python
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||
|
||||
|
||||
class PIB(BasicNewsRecipe):
|
||||
title = u'Press Information Bureau'
|
||||
title = 'Press Information Bureau'
|
||||
language = 'en_IN'
|
||||
__author__ = 'unkn0wn'
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
remove_attributes = ['style','height','width']
|
||||
remove_attributes = ['style', 'height', 'width']
|
||||
ignore_duplicate_articles = {'url'}
|
||||
description = ('The Press Information Bureau (PIB) is the nodal agency of the Government of India'
|
||||
' to disseminate information to the print and electronic media on government policies,'
|
||||
' programmes, initiatives and achievements. Best downloaded at the end of the day!')
|
||||
description = (
|
||||
'The Press Information Bureau (PIB) is the nodal agency of the Government of India'
|
||||
' to disseminate information to the print and electronic media on government policies,'
|
||||
' programmes, initiatives and achievements. Best downloaded at the end of the day!'
|
||||
)
|
||||
extra_css = '''
|
||||
#ltrSubtitle{color:#404040;}
|
||||
blockquote{color:#404040;}
|
||||
.ReleaseDateSubHeaddateTime{font-style:italic; font-size:small;}
|
||||
'''
|
||||
masthead_url = 'https://tse3.mm.bing.net/th?id=OIP.4QE8KPl1dZ3_BoR3X92aqgHaIH'
|
||||
|
||||
keep_only_tags = [
|
||||
classes('innner-page-main-about-us-content-right-part')
|
||||
]
|
||||
remove_tags = [
|
||||
classes('ReleaseLang log_oo')
|
||||
]
|
||||
masthead_url = 'https://tse3.mm.bing.net/th?id=OIP.4QE8KPl1dZ3_BoR3X92aqgHaIH'
|
||||
cover_url = 'https://static.pib.gov.in/WriteReadData/specificdocs/photo/2024/jun/ph2024624343601.jpg'
|
||||
|
||||
keep_only_tags = [classes('innner-page-main-about-us-content-right-part')]
|
||||
remove_tags = [classes('ReleaseLang log_oo')]
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup('https://pib.gov.in/Allrel.aspx')
|
||||
@ -37,7 +38,7 @@ class PIB(BasicNewsRecipe):
|
||||
for a in div.findAll('a', href=True):
|
||||
url = a['href']
|
||||
if url.startswith('/'):
|
||||
url = 'https://pib.gov.in' + url
|
||||
url = 'https://pib.gov.in' + url
|
||||
title = self.tag_to_string(a)
|
||||
self.log('\t', title, '\n\t\t', url)
|
||||
articles.append({'title': title, 'url': url})
|
||||
|
Loading…
x
Reference in New Issue
Block a user