This commit is contained in:
unkn0w7n 2024-07-25 12:49:25 +05:30
parent b9c50b071a
commit 76684b3a2b
4 changed files with 49 additions and 6 deletions

View File

@ -1,3 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
__license__ = 'GPL v3'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
@ -35,6 +37,20 @@ class AlJazeera(BasicNewsRecipe):
'meta', 'base', 'iframe', 'embed']),
]
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article)
}
}
def __init__(self, *args, **kwargs):
    """Initialize the recipe, honoring a user-supplied 'days' option.

    If the user set the recipe-specific 'days' option, override the
    class-level ``oldest_article`` with that value (given in days,
    fractions allowed, e.g. 0.5 for twelve hours).
    """
    BasicNewsRecipe.__init__(self, *args, **kwargs)
    days = self.recipe_specific_options.get('days')
    # Options arrive from the UI as strings; ignore empty/absent values.
    if isinstance(days, str) and days:
        self.oldest_article = float(days)
feeds = [(u'Al Jazeera English',
u'http://www.aljazeera.com/xml/rss/all.xml')]

View File

@ -40,6 +40,20 @@ class TheIndependentNew(BasicNewsRecipe):
encoding = 'utf-8'
compress_news_images = True
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article)
}
}
def __init__(self, *args, **kwargs):
    """Construct the recipe and apply the optional 'days' override.

    When the recipe-specific 'days' option was provided by the user,
    replace ``oldest_article`` with its numeric value (in days; e.g.
    0.5 selects articles from the past 12 hours).
    """
    BasicNewsRecipe.__init__(self, *args, **kwargs)
    days = self.recipe_specific_options.get('days')
    # The option is delivered as a string from the UI; skip if unset/empty.
    if isinstance(days, str) and days:
        self.oldest_article = float(days)
keep_only_tags = [
dict(id=['articleHeader', 'main']),
classes('headline sub-headline breadcrumb author publish-date hero-image body-content'),

View File

@ -143,9 +143,7 @@ class LiveMint(BasicNewsRecipe):
# remove empty p tags
raw = re.sub(
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub(
r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+>&nbsp;\s*<\/p>)', '', re.sub(
r'(?=<h2>\s*Also\s*Read).*?(?<=</h2>)', '', raw
)
r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+>&nbsp;\s*<\/p>)', '', raw
)
)
if '<script>var wsjFlag=true;</script>' in raw:
@ -186,10 +184,11 @@ class LiveMint(BasicNewsRecipe):
for span in soup.findAll('span', attrs={'class':'exclusive'}):
span.extract()
for al in soup.findAll('a', attrs={'class':'manualbacklink'}):
pa = al.findParent('p')
pa = al.findParent(['p', 'h2', 'h3', 'h4'])
if pa:
pa.extract()
if wa := soup.find(**classes('autobacklink-topic')):
wa = soup.find(**classes('autobacklink-topic'))
if wa:
p = wa.findParent('p')
if p:
p.extract()

View File

@ -1,3 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
newscientist.com
'''
@ -70,8 +72,20 @@ class NewScientist(BasicNewsRecipe):
classes('ArticleHeader__SocialWrapper AdvertWrapper ReadMoreWithImage ArticleTopics')
]
recipe_specific_options = {
'issue': {
'short': 'Enter the Issue Number you want to download ',
'long': 'For example, 3498'
}
}
def parse_index(self):
soup = self.index_to_soup('https://www.newscientist.com/issues/current/')
issue_url = 'https://www.newscientist.com/issues/current/'
d = self.recipe_specific_options.get('issue')
if d and isinstance(d, str):
issue_url = 'https://www.newscientist.com/issue/' + d
soup = self.index_to_soup(issue_url)
div = soup.find('div', attrs={'class':'ThisWeeksMagazineHero__CoverInfo'})
tme = div.find(**classes('ThisWeeksMagazineHero__MagInfoHeading'))
self.log('Downloading issue:', self.tag_to_string(tme))