mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Merge branch 'master' of https://github.com/unkn0w7n/calibre
This commit is contained in:
commit
3ebd85de99
@ -6,7 +6,6 @@ from datetime import date
|
|||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||||
|
|
||||||
is_saturday = date.today().weekday() == 5
|
|
||||||
|
|
||||||
class LiveMint(BasicNewsRecipe):
|
class LiveMint(BasicNewsRecipe):
|
||||||
title = 'Live Mint'
|
title = 'Live Mint'
|
||||||
@ -25,13 +24,12 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
'days': {
|
'days': {
|
||||||
'short': 'Oldest article to download from this news source. In days ',
|
'short': 'Oldest article to download from this news source. In days ',
|
||||||
'long': 'For example, 0.5, gives you articles from the past 12 hours',
|
'long': 'For example, 0.5, gives you articles from the past 12 hours',
|
||||||
'default': str(oldest_article)
|
'default': str(oldest_article),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
resolve_internal_links = True
|
resolve_internal_links = True
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
BasicNewsRecipe.__init__(self, *args, **kwargs)
|
BasicNewsRecipe.__init__(self, *args, **kwargs)
|
||||||
d = self.recipe_specific_options.get('days')
|
d = self.recipe_specific_options.get('days')
|
||||||
@ -42,55 +40,14 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
today = date.today().strftime('%d/%m/%Y')
|
today = date.today().strftime('%d/%m/%Y')
|
||||||
today = today.replace('/', '%2F')
|
today = today.replace('/', '%2F')
|
||||||
raw = self.index_to_soup(
|
raw = self.index_to_soup(
|
||||||
'https://epaper.livemint.com/Home/GetAllpages?editionid=1&editiondate=' + today, raw=True
|
'https://epaper.livemint.com/Home/GetAllpages?editionid=1&editiondate=' + today,
|
||||||
|
raw=True
|
||||||
)
|
)
|
||||||
for cov in json.loads(raw):
|
for cov in json.loads(raw):
|
||||||
if cov['NewsProPageTitle'].lower().startswith(('front', 'cover')):
|
if cov['NewsProPageTitle'].lower().startswith(('front', 'cover')):
|
||||||
return cov['HighResolution']
|
return cov['HighResolution']
|
||||||
|
|
||||||
if is_saturday:
|
extra_css = """
|
||||||
title = 'Mint Lounge'
|
|
||||||
masthead_url = 'https://lifestyle.livemint.com/mintlounge/static-images/lounge-logo.svg'
|
|
||||||
|
|
||||||
oldest_article = 6.5 # days
|
|
||||||
|
|
||||||
extra_css = '''
|
|
||||||
#story-summary-0 {font-style:italic; color:#202020;}
|
|
||||||
.innerBanner, .storyImgSec {text-align:center; font-size:small;}
|
|
||||||
.author {font-size:small;}
|
|
||||||
'''
|
|
||||||
|
|
||||||
keep_only_tags = [
|
|
||||||
classes('storyPageHeading storyContent innerBanner author')
|
|
||||||
]
|
|
||||||
remove_tags = [
|
|
||||||
dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
|
|
||||||
classes('hidden-article-url sidebarAdv similarStoriesClass moreFromSecClass linkStories publishDetail'),
|
|
||||||
dict(attrs={'id':['hidden-article-id-0', 'hidden-article-type-0']})
|
|
||||||
]
|
|
||||||
|
|
||||||
feeds = [
|
|
||||||
('Lounge News', 'https://lifestyle.livemint.com/rss/news'),
|
|
||||||
('Food', 'https://lifestyle.livemint.com/rss/food'),
|
|
||||||
('Fashion', 'https://lifestyle.livemint.com/rss/fashion'),
|
|
||||||
('How to Lounge', 'https://lifestyle.livemint.com/rss/how-to-lounge'),
|
|
||||||
('Smart Living', 'https://lifestyle.livemint.com/rss/smart-living'),
|
|
||||||
('Health', 'https://lifestyle.livemint.com/rss/health'),
|
|
||||||
('Relationships', 'https://lifestyle.livemint.com//rss/relationships')
|
|
||||||
]
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
if h2 := soup.find('h2'):
|
|
||||||
h2.name = 'p'
|
|
||||||
for also in soup.findAll('h2'):
|
|
||||||
if self.tag_to_string(also).strip().startswith('Also'):
|
|
||||||
also.extract()
|
|
||||||
for img in soup.findAll('img', attrs={'data-img': True}):
|
|
||||||
img['src'] = img['data-img']
|
|
||||||
return soup
|
|
||||||
else:
|
|
||||||
|
|
||||||
extra_css = '''
|
|
||||||
img {margin:0 auto;}
|
img {margin:0 auto;}
|
||||||
.psTopLogoItem img, .ecologoStory { width:100; }
|
.psTopLogoItem img, .ecologoStory { width:100; }
|
||||||
#img-cap {font-size:small; text-align:center;}
|
#img-cap {font-size:small; text-align:center;}
|
||||||
@ -99,24 +56,38 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
}
|
}
|
||||||
em, blockquote {color:#202020;}
|
em, blockquote {color:#202020;}
|
||||||
.moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag, .auth {font-size:small;}
|
.moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag, .auth {font-size:small;}
|
||||||
'''
|
"""
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='article', attrs={'id':lambda x: x and x.startswith(('article_', 'box_'))}),
|
dict(
|
||||||
|
name='article',
|
||||||
|
attrs={'id': lambda x: x and x.startswith(('article_', 'box_'))},
|
||||||
|
),
|
||||||
dict(attrs={'class': lambda x: x and x.startswith('storyPage_storyBox__')}),
|
dict(attrs={'class': lambda x: x and x.startswith('storyPage_storyBox__')}),
|
||||||
classes('contentSec')
|
classes('contentSec'),
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
|
dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
|
||||||
dict(attrs={'class':lambda x: x and x.startswith(
|
dict(
|
||||||
('storyPage_alsoRead__', 'storyPage_firstPublishDate__', 'storyPage_bcrumb__')
|
attrs={
|
||||||
)}),
|
'class': lambda x: x
|
||||||
dict(attrs={'id':['faqSection', 'seoText', 'ellipsisId']}),
|
and x.startswith(
|
||||||
|
(
|
||||||
|
'storyPage_alsoRead__',
|
||||||
|
'storyPage_firstPublishDate__',
|
||||||
|
'storyPage_bcrumb__',
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
),
|
||||||
|
dict(attrs={'id': ['faqSection', 'seoText', 'ellipsisId', 'gift_redeemed_box ']}),
|
||||||
classes(
|
classes(
|
||||||
'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider ninSec'
|
'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider ninSec'
|
||||||
' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget'
|
' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget'
|
||||||
' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn trade'
|
' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn'
|
||||||
)
|
' double_gift_box trade'
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
@ -142,30 +113,40 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
def preprocess_raw_html(self, raw, *a):
|
def preprocess_raw_html(self, raw, *a):
|
||||||
# remove empty p tags
|
# remove empty p tags
|
||||||
raw = re.sub(
|
raw = re.sub(
|
||||||
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub(
|
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', r'\g<2>', re.sub(
|
||||||
r'(<p>\s* \s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+> \s*<\/p>)', '', raw
|
r'(<p>\s* \s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+> \s*<\/p>)', '', raw
|
||||||
)
|
),
|
||||||
)
|
)
|
||||||
if '<script>var wsjFlag=true;</script>' in raw:
|
if '<script>var wsjFlag=true;</script>' in raw:
|
||||||
m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
|
m = re.search(
|
||||||
|
r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw
|
||||||
|
)
|
||||||
raw1 = raw[m.start() :]
|
raw1 = raw[m.start() :]
|
||||||
raw1 = raw1.split('>', 1)[1].strip()
|
raw1 = raw1.split('>', 1)[1].strip()
|
||||||
data = json.JSONDecoder().raw_decode(raw1)[0]
|
data = json.JSONDecoder().raw_decode(raw1)[0]
|
||||||
value = data['hasPart']['value']
|
value = data['hasPart']['value']
|
||||||
body = data['articleBody'] + '</p> <p>'\
|
body = (
|
||||||
|
data['articleBody']
|
||||||
|
+ '</p> <p>'
|
||||||
+ re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)
|
+ re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)
|
||||||
|
)
|
||||||
body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
|
body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
|
||||||
raw2 = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
|
raw2 = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
|
||||||
return raw2
|
return raw2
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for h2 in soup.findAll('h2'):
|
auth = soup.find(
|
||||||
h2.name = 'h4'
|
attrs={
|
||||||
auth = soup.find(attrs={'class':lambda x: x and x.startswith(('storyPage_authorInfo__', 'storyPage_authorSocial__'))})
|
'class': lambda x: x
|
||||||
|
and x.startswith(('storyPage_authorInfo__', 'storyPage_authorSocial__'))
|
||||||
|
}
|
||||||
|
)
|
||||||
if auth:
|
if auth:
|
||||||
auth['class'] = 'auth'
|
auth['class'] = 'auth'
|
||||||
summ = soup.find(attrs={'class':lambda x: x and x.startswith('storyPage_summary__')})
|
summ = soup.find(
|
||||||
|
attrs={'class': lambda x: x and x.startswith('storyPage_summary__')}
|
||||||
|
)
|
||||||
if summ:
|
if summ:
|
||||||
summ['class'] = 'summary'
|
summ['class'] = 'summary'
|
||||||
for strong in soup.findAll('strong'):
|
for strong in soup.findAll('strong'):
|
||||||
@ -177,7 +158,9 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
nos.name = 'span'
|
nos.name = 'span'
|
||||||
for span in soup.findAll('figcaption'):
|
for span in soup.findAll('figcaption'):
|
||||||
span['id'] = 'img-cap'
|
span['id'] = 'img-cap'
|
||||||
for auth in soup.findAll('span', attrs={'class':lambda x: x and 'articleInfo' in x.split()}):
|
for auth in soup.findAll(
|
||||||
|
'span', attrs={'class': lambda x: x and 'articleInfo' in x.split()}
|
||||||
|
):
|
||||||
auth.name = 'div'
|
auth.name = 'div'
|
||||||
for img in soup.findAll('img', attrs={'data-src': True}):
|
for img in soup.findAll('img', attrs={'data-src': True}):
|
||||||
img['src'] = img['data-src']
|
img['src'] = img['data-src']
|
||||||
|
@ -76,7 +76,7 @@ class Reuters(BasicNewsRecipe):
|
|||||||
'technology',
|
'technology',
|
||||||
# 'sports',
|
# 'sports',
|
||||||
'science',
|
'science',
|
||||||
# 'lifestyle',
|
'lifestyle',
|
||||||
]
|
]
|
||||||
|
|
||||||
feeds = []
|
feeds = []
|
||||||
|
Loading…
x
Reference in New Issue
Block a user