From aa2ef871fb6e82656e82129e6d31674956f6e8e8 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 19 Jul 2024 18:52:03 +0530
Subject: [PATCH 1/5] Update economist.recipe

---
 recipes/economist.recipe | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index cf47a8fda1..2941338f4f 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -15,8 +15,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from html5_parser import parse
 from lxml import etree
 
-# For past editions, set date to, for example, '2020-11-28'.
-edition_date = None
 use_archive = True
 
 
@@ -71,7 +69,7 @@ if use_archive:
         except Exception:
             date = data['datePublished']
         dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
-        dt = dt.strftime('%b %d, %Y, %I:%M %p')
+        dt = dt.strftime('%b %d, %Y %I:%M %p')
         if data['dateline'] is None:
             E(article, 'p', dt, style='color: gray; font-size:small;')
         else:
@@ -199,6 +197,13 @@ class Economist(BasicNewsRecipe):
     # downloaded with connection reset by peer (104) errors.
     delay = 1
 
+    recipe_specific_options = {
+        'date': {
+            'short': 'The date of the edition to download (YYYY-MM-DD format)',
+            'long': 'For example, 2024-07-19\nThis seems to work only for a couple of past editions.'
+        }
+    }
+
     needs_subscription = False
 
     def get_browser(self, *args, **kwargs):
@@ -231,6 +236,7 @@ class Economist(BasicNewsRecipe):
 
     if use_archive:
         def parse_index(self):
+            edition_date = self.recipe_specific_options.get('date')
             # return self.economist_test_article()
             # url = 'https://www.economist.com/weeklyedition/archive'
             query = {
@@ -326,6 +332,7 @@ class Economist(BasicNewsRecipe):
                     self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold')
 
         def parse_index(self):
+            edition_date = self.recipe_specific_options.get('date')
             # return self.economist_test_article()
             if edition_date:
                 url = 'https://www.economist.com/weeklyedition/' + edition_date

From 6218a97320d31abfef989f86f96eea998cb6a921 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 19 Jul 2024 18:55:09 +0530
Subject: [PATCH 2/5] ...

---
 recipes/economist.recipe | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index 2941338f4f..15cb026a6c 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -266,6 +266,7 @@ class Economist(BasicNewsRecipe):
             return self.economist_return_index(ans)
 
         def economist_parse_index(self, raw):
+            edition_date = self.recipe_specific_options.get('date')
             if edition_date:
                 data = json.loads(raw)['data']['section']
             else:
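Patches 1 and 2 replace the hard-coded module-level edition_date constant with calibre's per-recipe 'date' option, which each method now reads at run time via self.recipe_specific_options.get('date'). The snippet below is a minimal standalone sketch of the resulting URL selection, not code from the patches: the helper name and the plain dict passed in are invented for illustration.

# Illustrative sketch only: mirrors the pattern added above, where a supplied
# 'date' option selects a specific weekly edition and its absence keeps the
# default behaviour (the archive/current-edition path in the real recipe).
def weekly_edition_url(recipe_specific_options):
    edition_date = recipe_specific_options.get('date')  # e.g. '2024-07-19' or None
    if edition_date:
        return 'https://www.economist.com/weeklyedition/' + edition_date
    return 'https://www.economist.com/weeklyedition/archive'

print(weekly_edition_url({'date': '2024-07-19'}))  # .../weeklyedition/2024-07-19
print(weekly_edition_url({}))                      # .../weeklyedition/archive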
From eafdc3c7a94dae84a87d6ba9baf411fcb07c1cde Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 19 Jul 2024 18:58:19 +0530
Subject: [PATCH 3/5] ...

---
 recipes/economist.recipe | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index 15cb026a6c..3ecf3082f5 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -214,6 +214,7 @@ class Economist(BasicNewsRecipe):
         return br
 
     def publication_date(self):
+        edition_date = self.recipe_specific_options.get('date')
         if edition_date:
             return parse_only_date(edition_date, as_utc=False)
         url = self.browser.open("https://www.economist.com/printedition").geturl()
@@ -424,6 +425,7 @@ class Economist(BasicNewsRecipe):
         return raw
 
     def parse_index_from_printedition(self):  # return self.economist_test_article()
+        edition_date = self.recipe_specific_options.get('date')
         if edition_date:
             url = 'https://www.economist.com/weeklyedition/' + edition_date
             self.timefmt = ' [' + edition_date + ']'

From e4755c3df239ca84fde7c30b90a0606d4717be66 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 19 Jul 2024 19:02:59 +0530
Subject: [PATCH 4/5] propagate changes from previous PR to duplicate economist recipe

---
 recipes/economist_free.recipe | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe
index cf47a8fda1..3ecf3082f5 100644
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -15,8 +15,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from html5_parser import parse
 from lxml import etree
 
-# For past editions, set date to, for example, '2020-11-28'.
-edition_date = None
 use_archive = True
 
 
@@ -71,7 +69,7 @@ if use_archive:
         except Exception:
             date = data['datePublished']
         dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
-        dt = dt.strftime('%b %d, %Y, %I:%M %p')
+        dt = dt.strftime('%b %d, %Y %I:%M %p')
         if data['dateline'] is None:
             E(article, 'p', dt, style='color: gray; font-size:small;')
         else:
@@ -199,6 +197,13 @@ class Economist(BasicNewsRecipe):
     # downloaded with connection reset by peer (104) errors.
     delay = 1
 
+    recipe_specific_options = {
+        'date': {
+            'short': 'The date of the edition to download (YYYY-MM-DD format)',
+            'long': 'For example, 2024-07-19\nThis seems to work only for a couple of past editions.'
+        }
+    }
+
     needs_subscription = False
 
     def get_browser(self, *args, **kwargs):
@@ -209,6 +214,7 @@ class Economist(BasicNewsRecipe):
         return br
 
     def publication_date(self):
+        edition_date = self.recipe_specific_options.get('date')
         if edition_date:
             return parse_only_date(edition_date, as_utc=False)
         url = self.browser.open("https://www.economist.com/printedition").geturl()
@@ -231,6 +237,7 @@ class Economist(BasicNewsRecipe):
 
     if use_archive:
         def parse_index(self):
+            edition_date = self.recipe_specific_options.get('date')
             # return self.economist_test_article()
             # url = 'https://www.economist.com/weeklyedition/archive'
             query = {
@@ -260,6 +267,7 @@ class Economist(BasicNewsRecipe):
             return self.economist_return_index(ans)
 
         def economist_parse_index(self, raw):
+            edition_date = self.recipe_specific_options.get('date')
             if edition_date:
                 data = json.loads(raw)['data']['section']
             else:
@@ -326,6 +334,7 @@ class Economist(BasicNewsRecipe):
                     self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold')
 
         def parse_index(self):
+            edition_date = self.recipe_specific_options.get('date')
             # return self.economist_test_article()
             if edition_date:
                 url = 'https://www.economist.com/weeklyedition/' + edition_date
@@ -416,6 +425,7 @@ class Economist(BasicNewsRecipe):
         return raw
 
     def parse_index_from_printedition(self):  # return self.economist_test_article()
+        edition_date = self.recipe_specific_options.get('date')
         if edition_date:
             url = 'https://www.economist.com/weeklyedition/' + edition_date
             self.timefmt = ' [' + edition_date + ']'
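Patch 3 extends the same option to publication_date() and parse_index_from_printedition(), and patch 4 propagates all of these changes to the duplicate economist_free recipe. When no date is supplied, publication_date() falls back to the date embedded in the URL that https://www.economist.com/printedition redirects to. Below is a standard-library approximation of that flow; the real recipes use calibre's parse_only_date and the recipe's own browser, and the function signature here is invented for illustration.

from datetime import datetime

# Rough stand-in for the patched publication_date() logic.
def publication_date(edition_date=None, redirected_url=None):
    if edition_date:
        # Value of the 'date' recipe-specific option, e.g. '2024-07-19'.
        return datetime.strptime(edition_date, '%Y-%m-%d').date()
    # Otherwise take the date from the end of the redirected print-edition URL.
    return datetime.strptime(redirected_url.rstrip('/').split('/')[-1], '%Y-%m-%d').date()

print(publication_date(edition_date='2024-07-19'))
print(publication_date(redirected_url='https://www.economist.com/weeklyedition/2024-07-20'))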
From 3bcf2c0d1b2c08451235f0b703ec210b77322739 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 19 Jul 2024 20:20:27 +0530
Subject: [PATCH 5/5] Update Pagina12

Fixes #2073611 [Updated recipe for Pagina 12](https://bugs.launchpad.net/calibre/+bug/2073611)
---
 recipes/pagina12.recipe | 35 +++++++++++++++--------------------
 recipes/wsj.recipe      |  1 +
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/recipes/pagina12.recipe b/recipes/pagina12.recipe
index 1325678e6f..54ab8ae358 100644
--- a/recipes/pagina12.recipe
+++ b/recipes/pagina12.recipe
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 __license__ = 'GPL v3'
-__copyright__ = '2008-2018, Darko Miletic '
+__copyright__ = '2008-2024, Darko Miletic '
 '''
 pagina12.com.ar
 '''
@@ -28,6 +28,7 @@ class Pagina12(BasicNewsRecipe):
     delay = 1
     simultaneous_downloads = 1
     timeout = 8
+    needs_subscription = 'optional'
     ignore_duplicate_articles = {'url'}
     articles_are_obfuscated = True
     temp_files = []
@@ -63,29 +64,23 @@ class Pagina12(BasicNewsRecipe):
                 ]})
     ]
 
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        br.open('https://www.pagina12.com.ar/')
+        if self.username is not None and self.password is not None:
+            br.open('https://auth.pagina12.com.ar/ingresar?redirect=https://www.pagina12.com.ar')
+            br.select_form(id='login')
+            br['email'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+
     feeds = [
         (u'Diario de hoy' , u'https://www.pagina12.com.ar/rss/edicion-impresa'),
-        (u'El Pais' , u'https://www.pagina12.com.ar/rss/secciones/el-pais/notas'),
-        (u'Economia' , u'https://www.pagina12.com.ar/rss/secciones/economia/notas'),
-        (u'Sociedad' , u'https://www.pagina12.com.ar/rss/secciones/sociedad/notas'),
-        (u'El Mundo' , u'https://www.pagina12.com.ar/rss/secciones/el-mundo/notas'),
-        (u'Deportes' , u'https://www.pagina12.com.ar/rss/secciones/deportes/notas'),
-        (u'Cultura' , u'https://www.pagina12.com.ar/rss/secciones/cultura/notas'),
-        (u'Universidad' , u'https://www.pagina12.com.ar/rss/secciones/universidad/notas'),
-        (u'Ciencia' , u'https://www.pagina12.com.ar/rss/secciones/ciencia/notas'),
-        (u'Psicologia' , u'https://www.pagina12.com.ar/rss/secciones/psicologia/notas'),
-        (u'Ajedrez' , u'https://www.pagina12.com.ar/rss/secciones/ajedrez/notas'),
-        (u'La Ventana' , u'https://www.pagina12.com.ar/rss/secciones/la-ventana/notas'),
-        (u'Dialogos' , u'https://www.pagina12.com.ar/rss/secciones/dialogos/notas'),
-        (u'Hoy' , u'https://www.pagina12.com.ar/rss/secciones/hoy/notas'),
-        (u'Plastica' , u'https://www.pagina12.com.ar/rss/secciones/plastica/notas'),
-        (u'Cartas de Lectores', u'https://www.pagina12.com.ar/rss/secciones/cartas-de-lectores/notas'),
         (u'Espectaculos' , u'https://www.pagina12.com.ar/rss/suplementos/cultura-y-espectaculos/notas'),
         (u'Radar' , u'https://www.pagina12.com.ar/rss/suplementos/radar/notas'),
         (u'Radar libros' , u'https://www.pagina12.com.ar/rss/suplementos/radar-libros/notas'),
         (u'Cash' , u'https://www.pagina12.com.ar/rss/suplementos/cash/notas'),
-        (u'Turismo' , u'https://www.pagina12.com.ar/rss/suplementos/turismo/notas'),
-        (u'Libero' , u'https://www.pagina12.com.ar/rss/suplementos/libero/notas'),
         (u'NO' , u'https://www.pagina12.com.ar/rss/suplementos/no/notas'),
         (u'Las 12' , u'https://www.pagina12.com.ar/rss/suplementos/las12/notas'),
         (u'Soy' , u'https://www.pagina12.com.ar/rss/suplementos/soy/notas'),
@@ -99,8 +94,8 @@ class Pagina12(BasicNewsRecipe):
         mydiv = soup.find('div', {'class' : lambda x: x and 'printed-edition-cover' in x.split()})
         if mydiv:
             for image in mydiv.findAll('img'):
-                if image['data-src'].startswith('https://images.pagina12.com.ar/styles/width700/public/'):
-                    return image['data-src']
+                if image['src'].startswith('https://images.pagina12.com.ar/styles/width700/public/'):
+                    return image['src']
         return None
 
     def get_obfuscated_article(self, url):
diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
index a5d910e9f6..b8571040dc 100644
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -6,6 +6,7 @@ from itertools import zip_longest
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 
+
 def media_bucket(x):
     if x.get('type', '') == 'image':
         if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']:
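Patch 5 gives Pagina12 an optional login (needs_subscription = 'optional' plus a get_browser() that submits the auth form only when credentials are configured), drops a number of section feeds from the feed list, and switches the cover lookup from the lazy-loading data-src attribute to plain src. The snippet below is a standalone BeautifulSoup check of that cover lookup against made-up sample HTML; it is illustrative only and not part of the recipe.

from bs4 import BeautifulSoup

# Invented sample markup; the live page may differ.
html = '''
<div class="printed-edition-cover">
  <img src="https://images.pagina12.com.ar/styles/width700/public/tapa.jpg">
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
cover = None
mydiv = soup.find('div', class_='printed-edition-cover')
if mydiv:
    for image in mydiv.find_all('img'):
        # The recipe now reads 'src'; falling back to 'data-src' would also
        # tolerate the older lazy-loaded markup.
        candidate = image.get('src') or image.get('data-src') or ''
        if candidate.startswith('https://images.pagina12.com.ar/styles/width700/public/'):
            cover = candidate
            break
print(cover)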