mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Revista Muy Interesante
This commit is contained in:
parent
69120d4a88
commit
4f8a1f2248
@ -1,120 +1,62 @@
|
|||||||
from calibre.ebooks.BeautifulSoup import Tag
|
#!/usr/bin/env python
|
||||||
from calibre.web.feeds.news import re
|
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
||||||
|
|
||||||
|
|
||||||
def new_tag(soup, name, attrs=()):
    """Create a new tag called *name* that belongs to *soup*.

    When *soup* exposes a ``new_tag`` factory it is used, receiving *attrs*
    converted to a dict. Otherwise the tag is built directly with the
    ``Tag`` constructor, which is given ``None`` when *attrs* is empty.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory on this soup object: fall back to the Tag constructor.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
|
|
||||||
|
|
||||||
|
|
||||||
class RevistaMuyInteresante(BasicNewsRecipe):
|
class RevistaMuyInteresante(BasicNewsRecipe):
|
||||||
|
|
||||||
title = 'Revista Muy Interesante'
|
title = 'Revista Muy Interesante'
|
||||||
__author__ = 'Jefferson Frantz'
|
__author__ = 'unkn0wn'
|
||||||
description = 'Revista de divulgacion'
|
description = 'Revista Muy Interesante, es un sitio con información sobre ciencia, tecnología, historia, sociedad, medio ambiente, etc.'
|
||||||
timefmt = ' [%d %b, %Y]'
|
|
||||||
language = 'es'
|
language = 'es'
|
||||||
|
encoding = 'utf-8'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
|
remove_attributes = ['style', 'height', 'width']
|
||||||
conversion_options = {'linearize_tables': True}
|
ignore_duplicate_articles = {'url'}
|
||||||
|
masthead_url = 'https://www.muyinteresante.com/static/img/logo_web.svg'
|
||||||
extra_css = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}' # noqa
|
resolve_internal_links = True
|
||||||
|
|
||||||
def preprocess_html(self, soup):
    """Strip inline styles and move the first image into the category box.

    Every element carrying a ``style`` attribute loses it. The first
    ``<img>`` in the document is then replaced by an empty ``<p>`` and
    re-inserted as the first child of the element whose class is
    ``article_category``.

    If the page has no image or no ``article_category`` element, the image
    is left where it is. The previous code swapped the image out first and
    then failed with AttributeError when the container was missing.

    Returns the same *soup* object, modified in place.
    """
    for styled in soup.findAll(style=True):
        del styled['style']

    # Only the first image is relocated; any others stay where they are.
    first_image = next(iter(soup.findAll('img')), None)
    if first_image is None:
        return soup

    # Look the container up before touching the tree so a missing
    # container leaves the document unchanged.
    category = soup.find(attrs={'class': 'article_category'})
    if category is None or category is first_image:
        return soup

    first_image.replaceWith(new_tag(soup, 'p'))
    category.insert(0, first_image)
    return soup
|
|
||||||
|
|
||||||
# Rewrite each heading cell so it loses its width attribute and any
# whitespace surrounding the heading text. The cell content is taken from a
# capture group rather than removed with case-sensitive str.replace calls,
# because the pattern is case-insensitive: a match written in upper case
# would otherwise keep its original tags nested inside the new cell.
preprocess_regexps = [
    (
        re.compile(
            r'<td class="contentheading" width="100%">(.*?)</td>',
            re.DOTALL | re.IGNORECASE,
        ),
        lambda match: '<td class="contentheading">' + match.group(1).strip() + '</td>',
    ),
]
|
|
||||||
|
|
||||||
# Tag specifications for the containers to keep.
keep_only_tags = [
    {'name': 'div', 'attrs': {'class': ['article']}},
    {'name': 'td', 'attrs': {'class': ['txt_articulo']}},
]

# Tag specifications for elements to drop: embedded objects, scripts,
# lists, comments, button cells, tag/sub-title boxes, page navigation
# tables and vote forms.
remove_tags = [
    {'name': ['object', 'link', 'script', 'ul', 'iframe', 'ins']},
    {'name': 'div', 'attrs': {'id': ['comment']}},
    {'name': 'td', 'attrs': {'class': ['buttonheading']}},
    {'name': 'div', 'attrs': {'class': ['tags_articles', 'bajo_title']}},
    {'name': 'table', 'attrs': {'class': ['pagenav']}},
    {'name': 'form', 'attrs': {'class': ['voteform']}},
]

# Tag specification for the element after which content is discarded.
remove_tags_after = {'name': 'div', 'attrs': {'class': 'tags_articles'}}
|
|
||||||
|
|
||||||
# TO GET ARTICLES IN SECTION
|
|
||||||
def nz_parse_section(self, url):
    """Return the articles linked from the section listing page at *url*.

    The page is fetched with ``self.index_to_soup``. Starting from the
    element whose class is ``contenido``, every following ``headline``
    element is inspected for a link. Headlines without a link, without a
    link target or without link text are skipped. Site-relative targets
    (those starting with ``/``) are prefixed with the site origin.

    Returns a list of dicts with the keys ``title``, ``url``,
    ``description`` and ``date``; the last two are always empty strings.
    An empty list is returned when the page has no ``contenido`` element
    (this used to raise AttributeError).
    """
    soup = self.index_to_soup(url)
    container = soup.find(attrs={'class': 'contenido'})
    if container is None:
        # Unexpected page layout: report no articles instead of crashing.
        return []

    current_articles = []
    for headline in container.findAllNext(attrs={'class': ['headline']}):
        link = headline.find('a', href=True)
        if link is None:
            continue
        title = self.tag_to_string(link)
        # A separate name keeps the section URL parameter intact.
        article_url = link.get('href', False)
        if not article_url or not title:
            continue
        if article_url.startswith('/'):
            article_url = 'http://www.muyinteresante.es' + article_url
        current_articles.append({
            'title': title,
            'url': article_url,
            'description': '',
            'date': '',
        })
    return current_articles
|
|
||||||
|
|
||||||
# To GET SECTIONS
|
|
||||||
def parse_index(self):
    """Build the feed list by parsing every known section page.

    Each section URL is passed to ``self.nz_parse_section``. Sections that
    yield no articles are left out. Returns a list of
    ``(section title, articles)`` tuples in the order listed below.
    """
    sections = (
        ('Historia', 'http://www.muyinteresante.es/historia-articulos'),
        ('Ciencia', 'http://www.muyinteresante.es/ciencia-articulos'),
        ('Naturaleza', 'http://www.muyinteresante.es/naturaleza-articulos'),
        ('Tecnología', 'http://www.muyinteresante.es/tecnologia-articulos'),
        ('Salud', 'http://www.muyinteresante.es/salud-articulos'),
        ('Más Muy', 'http://www.muyinteresante.es/muy'),
        ('Innova - Automoción', 'http://www.muyinteresante.es/articulos-innovacion-autos'),
        ('Innova - Salud', 'http://www.muyinteresante.es/articulos-innovacion-salud'),
        ('Innova - Medio Ambiente', 'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'),
        ('Innova - Alimentación', 'http://www.muyinteresante.es/articulos-innovacion-alimentacion'),
        ('Innova - Sociedad', 'http://www.muyinteresante.es/articulos-innovacion-sociedad'),
        ('Innova - Tecnología', 'http://www.muyinteresante.es/articulos-innovacion-tecnologia'),
        ('Innova - Ocio', 'http://www.muyinteresante.es/articulos-innovacion-ocio'),
    )
    # Sections are parsed lazily, one at a time and in listing order.
    parsed = ((name, self.nz_parse_section(link)) for name, link in sections)
    return [(name, found) for name, found in parsed if found]
|
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
index = 'http://www.muyinteresante.es/revista'
|
soup = self.index_to_soup(
|
||||||
soup = self.index_to_soup(index)
|
'https://www.magzter.com/ES/Zinet-Media-Global/Muy-Interesante-Espa%C3%B1a/Science/1806044'
|
||||||
link_item = soup.find('img', attrs={'class': 'img_portada'})
|
)
|
||||||
if link_item:
|
for citem in soup.findAll(
|
||||||
cover_url = "http://www.muyinteresante.es" + link_item['src']
|
'meta', content=lambda s: s and s.endswith('view/3.jpg')
|
||||||
return cover_url
|
):
|
||||||
|
return citem['content']
|
||||||
|
|
||||||
|
extra_css = '''
|
||||||
|
.c-detail__bar, .c-detail__author, .c-detail__media__txt { font-size:small; }
|
||||||
|
.default-epigraph { font-style:italic; }
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Tag specification for the article container to keep.
keep_only_tags = [
    {'name': 'article', 'attrs': {'class': 'c-detail'}},
]

# Tag specifications to drop: sidebars, inline SVG, scripts and the
# share bar.
remove_tags = [
    {'name': ['aside', 'svg', 'script']},
    classes('c-detail__share'),
]
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
    """Flatten author paragraphs and demote sub-headings.

    Every ``<p>`` inside the ``c-detail__author`` element (when one is
    found) is renamed to ``<div>``, and every ``<h2>``/``<h3>`` in the
    document is renamed to ``<h4>``. Returns the same *soup* object,
    modified in place.
    """
    author_box = soup.find(**classes('c-detail__author'))
    author_paragraphs = author_box.findAll('p') if author_box else []
    for paragraph in author_paragraphs:
        paragraph.name = 'div'

    for heading in soup.findAll(['h2', 'h3']):
        heading.name = 'h4'
    return soup
|
||||||
|
|
||||||
|
def parse_index(self):
    """Collect the articles linked from the site's front page.

    Every ``<article>`` element is searched for an ``<a>`` with class
    ``page-link`` and an ``href``; elements without one are skipped. The
    description is the text of the ``c-article__info_content`` element
    when present, otherwise an empty string. Each hit is logged.

    Returns a single feed: ``[('Articles', [article dicts])]``.
    """
    front_page = self.index_to_soup('https://www.muyinteresante.com/')
    found = []
    for teaser in front_page.findAll('article'):
        link = teaser.find('a', attrs={'class':'page-link', 'href':True})
        if link:
            title = self.tag_to_string(link)
            url = link['href']
            summary = teaser.find(**classes('c-article__info_content'))
            desc = self.tag_to_string(summary) if summary else ''
            self.log('\t', title, '\n\t', desc, '\n\t\t', url)
            found.append({'title': title, 'url': url, 'description': desc})
    return [('Articles', found)]
|
||||||
|
@ -108,6 +108,16 @@ class SCMP(BasicNewsRecipe):
|
|||||||
'short': 'Oldest article to download from this news source. In days ',
|
'short': 'Oldest article to download from this news source. In days ',
|
||||||
'long': 'For example, 0.5, gives you articles from the past 12 hours',
|
'long': 'For example, 0.5, gives you articles from the past 12 hours',
|
||||||
'default': str(oldest_article)
|
'default': str(oldest_article)
|
||||||
|
},
|
||||||
|
'comp': {
|
||||||
|
'short': 'Compress News Images?',
|
||||||
|
'long': 'enter yes',
|
||||||
|
'default': 'no'
|
||||||
|
},
|
||||||
|
'rev': {
|
||||||
|
'short': 'Reverse the order of articles in each feed?',
|
||||||
|
'long': 'enter yes',
|
||||||
|
'default': 'no'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -116,6 +126,14 @@ class SCMP(BasicNewsRecipe):
|
|||||||
d = self.recipe_specific_options.get('days')
|
d = self.recipe_specific_options.get('days')
|
||||||
if d and isinstance(d, str):
|
if d and isinstance(d, str):
|
||||||
self.oldest_article = float(d)
|
self.oldest_article = float(d)
|
||||||
|
r = self.recipe_specific_options.get('rev')
|
||||||
|
if r and isinstance(r, str):
|
||||||
|
if r.lower() == 'yes':
|
||||||
|
self.reverse_article_order = True
|
||||||
|
c = self.recipe_specific_options.get('comp')
|
||||||
|
if c and isinstance(c, str):
|
||||||
|
if c.lower() == 'yes':
|
||||||
|
self.compress_news_images = True
|
||||||
|
|
||||||
# used when unable to extract article from <script>, particularly in the Sports section
|
# used when unable to extract article from <script>, particularly in the Sports section
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
@ -174,6 +192,6 @@ class SCMP(BasicNewsRecipe):
|
|||||||
img['src'] = y + urlparse(img['src']).path
|
img['src'] = y + urlparse(img['src']).path
|
||||||
for img in soup.findAll('img', attrs={'title':True}):
|
for img in soup.findAll('img', attrs={'title':True}):
|
||||||
div = soup.new_tag('div', attrs={'style':'text-align:center; font-size:small;'})
|
div = soup.new_tag('div', attrs={'style':'text-align:center; font-size:small;'})
|
||||||
div.string = img['title']
|
div.string = img.get('title', '')
|
||||||
img.find_parent('div').append(div)
|
img.find_parent('div').append(div)
|
||||||
return soup
|
return soup
|
||||||
|
Loading…
x
Reference in New Issue
Block a user