From 3bcf2c0d1b2c08451235f0b703ec210b77322739 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Jul 2024 20:20:27 +0530 Subject: [PATCH] Update Pagina12 Fixes #2073611 [Updated recipe for Pagina 12](https://bugs.launchpad.net/calibre/+bug/2073611) --- recipes/pagina12.recipe | 35 +++++++++++++++-------------------- recipes/wsj.recipe | 1 + 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/recipes/pagina12.recipe b/recipes/pagina12.recipe index 1325678e6f..54ab8ae358 100644 --- a/recipes/pagina12.recipe +++ b/recipes/pagina12.recipe @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- __license__ = 'GPL v3' -__copyright__ = '2008-2018, Darko Miletic ' +__copyright__ = '2008-2024, Darko Miletic ' ''' pagina12.com.ar ''' @@ -28,6 +28,7 @@ class Pagina12(BasicNewsRecipe): delay = 1 simultaneous_downloads = 1 timeout = 8 + needs_subscription = 'optional' ignore_duplicate_articles = {'url'} articles_are_obfuscated = True temp_files = [] @@ -63,29 +64,23 @@ class Pagina12(BasicNewsRecipe): ]}) ] + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + br.open('https://www.pagina12.com.ar/') + if self.username is not None and self.password is not None: + br.open('https://auth.pagina12.com.ar/ingresar?redirect=https://www.pagina12.com.ar') + br.select_form(id='login') + br['email'] = self.username + br['password'] = self.password + br.submit() + return br + feeds = [ (u'Diario de hoy' , u'https://www.pagina12.com.ar/rss/edicion-impresa'), - (u'El Pais' , u'https://www.pagina12.com.ar/rss/secciones/el-pais/notas'), - (u'Economia' , u'https://www.pagina12.com.ar/rss/secciones/economia/notas'), - (u'Sociedad' , u'https://www.pagina12.com.ar/rss/secciones/sociedad/notas'), - (u'El Mundo' , u'https://www.pagina12.com.ar/rss/secciones/el-mundo/notas'), - (u'Deportes' , u'https://www.pagina12.com.ar/rss/secciones/deportes/notas'), - (u'Cultura' , u'https://www.pagina12.com.ar/rss/secciones/cultura/notas'), - (u'Universidad' , u'https://www.pagina12.com.ar/rss/secciones/universidad/notas'), - (u'Ciencia' , u'https://www.pagina12.com.ar/rss/secciones/ciencia/notas'), - (u'Psicologia' , u'https://www.pagina12.com.ar/rss/secciones/psicologia/notas'), - (u'Ajedrez' , u'https://www.pagina12.com.ar/rss/secciones/ajedrez/notas'), - (u'La Ventana' , u'https://www.pagina12.com.ar/rss/secciones/la-ventana/notas'), - (u'Dialogos' , u'https://www.pagina12.com.ar/rss/secciones/dialogos/notas'), - (u'Hoy' , u'https://www.pagina12.com.ar/rss/secciones/hoy/notas'), - (u'Plastica' , u'https://www.pagina12.com.ar/rss/secciones/plastica/notas'), - (u'Cartas de Lectores', u'https://www.pagina12.com.ar/rss/secciones/cartas-de-lectores/notas'), (u'Espectaculos' , u'https://www.pagina12.com.ar/rss/suplementos/cultura-y-espectaculos/notas'), (u'Radar' , u'https://www.pagina12.com.ar/rss/suplementos/radar/notas'), (u'Radar libros' , u'https://www.pagina12.com.ar/rss/suplementos/radar-libros/notas'), (u'Cash' , u'https://www.pagina12.com.ar/rss/suplementos/cash/notas'), - (u'Turismo' , u'https://www.pagina12.com.ar/rss/suplementos/turismo/notas'), - (u'Libero' , u'https://www.pagina12.com.ar/rss/suplementos/libero/notas'), (u'NO' , u'https://www.pagina12.com.ar/rss/suplementos/no/notas'), (u'Las 12' , u'https://www.pagina12.com.ar/rss/suplementos/las12/notas'), (u'Soy' , u'https://www.pagina12.com.ar/rss/suplementos/soy/notas'), @@ -99,8 +94,8 @@ class Pagina12(BasicNewsRecipe): mydiv = soup.find('div', {'class' : lambda x: x and 'printed-edition-cover' in x.split()}) if mydiv: for image in mydiv.findAll('img'): - if image['data-src'].startswith('https://images.pagina12.com.ar/styles/width700/public/'): - return image['data-src'] + if image['src'].startswith('https://images.pagina12.com.ar/styles/width700/public/'): + return image['src'] return None def get_obfuscated_article(self, url): diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index a5d910e9f6..b8571040dc 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -6,6 +6,7 @@ from itertools import zip_longest from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe, classes + def media_bucket(x): if x.get('type', '') == 'image': if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']: