mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
...
This commit is contained in:
parent
b9c50b071a
commit
76684b3a2b
@ -1,3 +1,5 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
|
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
|
|
||||||
@ -35,6 +37,20 @@ class AlJazeera(BasicNewsRecipe):
|
|||||||
'meta', 'base', 'iframe', 'embed']),
|
'meta', 'base', 'iframe', 'embed']),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
recipe_specific_options = {
|
||||||
|
'days': {
|
||||||
|
'short': 'Oldest article to download from this news source. In days ',
|
||||||
|
'long': 'For example, 0.5, gives you articles from the past 12 hours',
|
||||||
|
'default': str(oldest_article)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
BasicNewsRecipe.__init__(self, *args, **kwargs)
|
||||||
|
d = self.recipe_specific_options.get('days')
|
||||||
|
if d and isinstance(d, str):
|
||||||
|
self.oldest_article = float(d)
|
||||||
|
|
||||||
feeds = [(u'Al Jazeera English',
|
feeds = [(u'Al Jazeera English',
|
||||||
u'http://www.aljazeera.com/xml/rss/all.xml')]
|
u'http://www.aljazeera.com/xml/rss/all.xml')]
|
||||||
|
|
||||||
|
@ -40,6 +40,20 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
compress_news_images = True
|
compress_news_images = True
|
||||||
|
|
||||||
|
recipe_specific_options = {
|
||||||
|
'days': {
|
||||||
|
'short': 'Oldest article to download from this news source. In days ',
|
||||||
|
'long': 'For example, 0.5, gives you articles from the past 12 hours',
|
||||||
|
'default': str(oldest_article)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
BasicNewsRecipe.__init__(self, *args, **kwargs)
|
||||||
|
d = self.recipe_specific_options.get('days')
|
||||||
|
if d and isinstance(d, str):
|
||||||
|
self.oldest_article = float(d)
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(id=['articleHeader', 'main']),
|
dict(id=['articleHeader', 'main']),
|
||||||
classes('headline sub-headline breadcrumb author publish-date hero-image body-content'),
|
classes('headline sub-headline breadcrumb author publish-date hero-image body-content'),
|
||||||
|
@ -143,9 +143,7 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
# remove empty p tags
|
# remove empty p tags
|
||||||
raw = re.sub(
|
raw = re.sub(
|
||||||
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub(
|
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub(
|
||||||
r'(<p>\s* \s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+> \s*<\/p>)', '', re.sub(
|
r'(<p>\s* \s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+> \s*<\/p>)', '', raw
|
||||||
r'(?=<h2>\s*Also\s*Read).*?(?<=</h2>)', '', raw
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if '<script>var wsjFlag=true;</script>' in raw:
|
if '<script>var wsjFlag=true;</script>' in raw:
|
||||||
@ -186,10 +184,11 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
for span in soup.findAll('span', attrs={'class':'exclusive'}):
|
for span in soup.findAll('span', attrs={'class':'exclusive'}):
|
||||||
span.extract()
|
span.extract()
|
||||||
for al in soup.findAll('a', attrs={'class':'manualbacklink'}):
|
for al in soup.findAll('a', attrs={'class':'manualbacklink'}):
|
||||||
pa = al.findParent('p')
|
pa = al.findParent(['p', 'h2', 'h3', 'h4'])
|
||||||
if pa:
|
if pa:
|
||||||
pa.extract()
|
pa.extract()
|
||||||
if wa := soup.find(**classes('autobacklink-topic')):
|
wa = soup.find(**classes('autobacklink-topic'))
|
||||||
|
if wa:
|
||||||
p = wa.findParent('p')
|
p = wa.findParent('p')
|
||||||
if p:
|
if p:
|
||||||
p.extract()
|
p.extract()
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
'''
|
'''
|
||||||
newscientist.com
|
newscientist.com
|
||||||
'''
|
'''
|
||||||
@ -70,8 +72,20 @@ class NewScientist(BasicNewsRecipe):
|
|||||||
classes('ArticleHeader__SocialWrapper AdvertWrapper ReadMoreWithImage ArticleTopics')
|
classes('ArticleHeader__SocialWrapper AdvertWrapper ReadMoreWithImage ArticleTopics')
|
||||||
]
|
]
|
||||||
|
|
||||||
|
recipe_specific_options = {
|
||||||
|
'issue': {
|
||||||
|
'short': 'Enter the Issue Number you want to download ',
|
||||||
|
'long': 'For example, 3498'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
soup = self.index_to_soup('https://www.newscientist.com/issues/current/')
|
issue_url = 'https://www.newscientist.com/issues/current/'
|
||||||
|
d = self.recipe_specific_options.get('issue')
|
||||||
|
if d and isinstance(d, str):
|
||||||
|
issue_url = 'https://www.newscientist.com/issue/' + d
|
||||||
|
|
||||||
|
soup = self.index_to_soup(issue_url)
|
||||||
div = soup.find('div', attrs={'class':'ThisWeeksMagazineHero__CoverInfo'})
|
div = soup.find('div', attrs={'class':'ThisWeeksMagazineHero__CoverInfo'})
|
||||||
tme = div.find(**classes('ThisWeeksMagazineHero__MagInfoHeading'))
|
tme = div.find(**classes('ThisWeeksMagazineHero__MagInfoHeading'))
|
||||||
self.log('Downloading issue:', self.tag_to_string(tme))
|
self.log('Downloading issue:', self.tag_to_string(tme))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user