Merge branch 'master' of https://github.com/pho-souza/calibre

2026-05-31 02:55:19 -04:00 · 2026-05-10 21:30:13 +05:30
parent 550f8d8f76 e4d1fc7dda
commit df7a261386
2 changed files with 127 additions and 0 deletions
@@ -0,0 +1,127 @@
+import re
+from urllib.parse import urljoin
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class SuperInteressante(BasicNewsRecipe):
+    """Calibre recipe that downloads one issue of Superinteressante magazine.
+
+    The issue is chosen through the ``edicao_numero`` recipe-specific option.
+    When that option is empty or not a plain number, the first issue link
+    found on the archive page is used instead (assumed to be the newest one).
+    """
+
+    title = 'Superinteressante: Revista'
+    description = 'Escolha uma edição específica ou baixe a mais recente.'
+    language = 'pt-br'
+    __author__ = 'Pedro Souza'
+
+    # User-facing option: issue number to fetch; blank selects the latest.
+    recipe_specific_options = {
+        'edicao_numero': {
+            'short': 'Número da edição',
+            'long': 'Digite o número da edição (ex: 485). Deixe em branco para a mais recente:',
+            'default': ''
+        }
+    }
+
+    # NOTE(review): nothing in this file reads `no_shortcuts`; confirm that
+    # the framework actually consumes it.
+    no_shortcuts = True
+    encoding = 'utf-8'
+    max_articles_per_feed = 100
+    simultaneous_downloads = 3
+    use_embedded_content = False
+    no_stylesheets = True
+    timeout = 30
+
+    # NOTE(review): the original comment said this disables caching so every
+    # run downloads afresh; nothing in this file shows how `no_favicon` is
+    # consumed, so that effect is unverified.
+    no_favicon = True
+    ignore_duplicate_articles = {'title', 'url'}
+
+    # Site root used to turn relative hrefs into absolute URLs.
+    _BASE_URL = 'https://super.abril.com.br'
+    # Archive listing; a single issue lives at `<archive>/<number>/`.
+    _ARCHIVE_URL = 'https://super.abril.com.br/superarquivo/'
+    # CSS selectors tried when looking for article links on an issue page.
+    _ARTICLE_SELECTOR = 'a.list-item-link, .list-item a, .article-title a, h2 a'
+    # URL fragments identifying share/tag links that are not articles.
+    _SKIPPED_URL_PARTS = ('/facebook', '/twitter', '/whatsapp', '/tags/')
+
+    keep_only_tags = [
+        dict(name='article'),
+        dict(name='div', attrs={'class': ['content-body', 'article-content', 'entry-content']}),
+        dict(name='header', attrs={'class': ['article-header']})
+    ]
+
+    remove_tags = [
+        # Matches a div when any one of its space-separated classes is listed.
+        dict(
+            name='div',
+            attrs={
+                'class': lambda x: x and any(
+                    c in x.split()
+                    for c in ['ads', 'post-ads', 'noreadme-audima', 'top-share']
+                )
+            },
+        ),
+        dict(
+            name='div',
+            attrs={
+                'class': [
+                    'banner-container', 'trending', 'sidebar', 'social-share',
+                    'tags', 'related-posts', 'newsletter-box',
+                ]
+            },
+        ),
+        dict(name='section', attrs={'class': ['newsletter', 'comments', 'recommended']}),
+        dict(name='footer'),
+        dict(name='nav'),
+        dict(name='script'),
+        dict(name='style'),
+        dict(name='aside'),
+    ]
+
+    def _get_edicao_input(self):
+        """Return the requested issue number as a stripped string.
+
+        Returns:
+            The value of the ``edicao_numero`` option, or ``''`` when it is
+            unset.
+        """
+        # Defensive fallback kept from the original implementation.
+        # NOTE(review): the calibre docs describe option values as living in
+        # `self.recipe_specific_options`, not as instance attributes, so this
+        # loop is not expected to match; confirm before removing it.
+        for attr in ('edicao_numero', 'edicao-numero'):
+            valor = getattr(self, attr, '')
+            if valor:
+                return str(valor).strip()
+
+        # Per the calibre recipe API docs, at run time this dict maps each
+        # option name to its value. The class-level definition (name -> dict
+        # with a 'default' key) is still handled in case it was not replaced.
+        opt = (self.recipe_specific_options or {}).get('edicao_numero', '')
+        if isinstance(opt, dict):
+            opt = opt.get('default', '')
+        # str() so a non-string value (e.g. an int) is used rather than dropped.
+        return str(opt or '').strip()
+
+    def parse_index(self):
+        """Build the article index for the selected issue.
+
+        Side effects: updates ``self.title`` and, when a cover image is found
+        on the issue page, ``self.cover_url``.
+
+        Returns:
+            A one-element list ``[('Matérias', articles)]`` where every
+            article is a dict with ``title``, ``url``, ``description`` and
+            ``date`` keys.
+
+        Raises:
+            ValueError: if no issue link can be found on the archive page.
+        """
+        issue_link = self._resolve_issue_link()
+        self.log(f'Baixando: {self.title} via {issue_link}')
+        soup_issue = self.index_to_soup(issue_link)
+
+        cover_url = self._extract_cover_url(soup_issue, issue_link)
+        if cover_url:
+            self.cover_url = cover_url
+
+        articles = self._collect_articles(soup_issue, issue_link)
+        self.log(f'[DEBUG] Total de artigos encontrados: {len(articles)}')
+        return [('Matérias', articles)]
+
+    def _resolve_issue_link(self):
+        """Return the absolute URL of the issue to download and set the title."""
+        edicao_input = self._get_edicao_input()
+        self.log(f'[DEBUG] Edição capturada: "{edicao_input}"')
+
+        # isascii() rejects non-ASCII digits, which isdigit() alone accepts
+        # and which would otherwise be interpolated into the URL.
+        if edicao_input.isascii() and edicao_input.isdigit():
+            self.title = f'Superinteressante: Edição {edicao_input}'
+            return f'{self._ARCHIVE_URL}{edicao_input}/'
+
+        if edicao_input:
+            # Make the fallback visible instead of silently ignoring the input.
+            self.log.warn(f'Edição inválida "{edicao_input}"; usando a mais recente.')
+
+        self.title = 'Superinteressante: Última Edição'
+        self.log('Buscando edição mais recente...')
+        soup_arquivo = self.index_to_soup(self._ARCHIVE_URL)
+        # The first matching link is taken as the most recent issue.
+        first_issue = soup_arquivo.find('a', href=re.compile(r'/superarquivo/\d+/?$'))
+        if first_issue is None:
+            raise ValueError("Não foi possível localizar o arquivo.")
+        return urljoin(self._BASE_URL, first_issue['href'])
+
+    def _extract_cover_url(self, soup_issue, issue_link):
+        """Return the absolute cover image URL from the issue page, or None."""
+        cover_div = soup_issue.find('div', attrs={'class': 'cover'})
+        if cover_div is None:
+            return None
+        img_tag = cover_div.find('img')
+        if img_tag is None:
+            return None
+
+        src = (img_tag.get('data-src') or img_tag.get('src') or '').strip()
+        if ' ' in src:
+            # The attribute holds more than one token (srcset-like value):
+            # keep only the first URL.
+            src = src.split(' ')[0].split(',')[0]
+        if not src:
+            return None
+        # Resolves relative and protocol-relative image paths.
+        return urljoin(issue_link, src)
+
+    def _collect_articles(self, soup_issue, issue_link):
+        """Return the de-duplicated article dicts linked from the issue page."""
+        articles = []
+        seen_urls = set()
+        # Compare without the trailing slash so '/485' and '/485/' both count
+        # as the issue page itself.
+        issue_key = issue_link.rstrip('/')
+
+        for link in soup_issue.select(self._ARTICLE_SELECTOR):
+            url = (link.get('href') or '').strip()
+            # Skips anchors, javascript:, mailto: and other non-page hrefs.
+            if not url.startswith(('http', '/')):
+                continue
+            # urljoin also handles protocol-relative ('//host/path') hrefs,
+            # which plain prefixing would glue onto the site root.
+            url = urljoin(self._BASE_URL, url)
+            if any(part in url for part in self._SKIPPED_URL_PARTS):
+                continue
+            if url in seen_urls or url.rstrip('/') == issue_key:
+                continue
+
+            title = self.tag_to_string(link).strip()
+            if not title:
+                title_tag = link.find(['h2', 'h3', 'span'])
+                if title_tag is not None:
+                    title = self.tag_to_string(title_tag).strip()
+
+            # Very short link texts are skipped as non-headlines.
+            if len(title) > 10:
+                articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
+                seen_urls.add(url)
+
+        return articles
+
+    def get_browser(self, *args, **kwargs):
+        """Return a browser with a desktop User-Agent and anti-cache headers.
+
+        Positional and keyword arguments are forwarded unchanged to
+        ``BasicNewsRecipe.get_browser`` so this override accepts the same
+        calls as the base method.
+        """
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders = [
+            ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'),
+            # Anti-cache request headers.
+            ('Cache-Control', 'no-cache, no-store, must-revalidate'),
+            ('Pragma', 'no-cache'),
+            ('Expires', '0'),
+        ]
+        return br