Merge branch 'kovidgoyal:master' into tolino

This commit is contained in:
beedaddy 2024-07-19 22:42:41 +02:00 committed by GitHub
commit 01a7e09ac9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 42 additions and 26 deletions

View File

@@ -15,8 +15,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
from html5_parser import parse
from lxml import etree
# For past editions, set date to, for example, '2020-11-28'.
edition_date = None
use_archive = True
@@ -71,7 +69,7 @@ if use_archive:
except Exception:
date = data['datePublished']
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
dt = dt.strftime('%b %d, %Y, %I:%M %p')
dt = dt.strftime('%b %d, %Y %I:%M %p')
if data['dateline'] is None:
E(article, 'p', dt, style='color: gray; font-size:small;')
else:
@@ -199,6 +197,13 @@ class Economist(BasicNewsRecipe):
# downloaded with connection reset by peer (104) errors.
delay = 1
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (YYYY-MM-DD format)',
'long': 'For example, 2024-07-19\nThis seems to work only for a couple of past editions.'
}
}
needs_subscription = False
def get_browser(self, *args, **kwargs):
@@ -209,6 +214,7 @@ class Economist(BasicNewsRecipe):
return br
def publication_date(self):
edition_date = self.recipe_specific_options.get('date')
if edition_date:
return parse_only_date(edition_date, as_utc=False)
url = self.browser.open("https://www.economist.com/printedition").geturl()
@@ -231,6 +237,7 @@ class Economist(BasicNewsRecipe):
if use_archive:
def parse_index(self):
edition_date = self.recipe_specific_options.get('date')
# return self.economist_test_article()
# url = 'https://www.economist.com/weeklyedition/archive'
query = {
@@ -260,6 +267,7 @@ class Economist(BasicNewsRecipe):
return self.economist_return_index(ans)
def economist_parse_index(self, raw):
edition_date = self.recipe_specific_options.get('date')
if edition_date:
data = json.loads(raw)['data']['section']
else:
@@ -326,6 +334,7 @@ class Economist(BasicNewsRecipe):
self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold')
def parse_index(self):
edition_date = self.recipe_specific_options.get('date')
# return self.economist_test_article()
if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date
@@ -416,6 +425,7 @@ class Economist(BasicNewsRecipe):
return raw
def parse_index_from_printedition(self):
# return self.economist_test_article()
edition_date = self.recipe_specific_options.get('date')
if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date
self.timefmt = ' [' + edition_date + ']'

View File

@@ -15,8 +15,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
from html5_parser import parse
from lxml import etree
# For past editions, set date to, for example, '2020-11-28'.
edition_date = None
use_archive = True
@@ -71,7 +69,7 @@ if use_archive:
except Exception:
date = data['datePublished']
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
dt = dt.strftime('%b %d, %Y, %I:%M %p')
dt = dt.strftime('%b %d, %Y %I:%M %p')
if data['dateline'] is None:
E(article, 'p', dt, style='color: gray; font-size:small;')
else:
@@ -199,6 +197,13 @@ class Economist(BasicNewsRecipe):
# downloaded with connection reset by peer (104) errors.
delay = 1
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (YYYY-MM-DD format)',
'long': 'For example, 2024-07-19\nThis seems to work only for a couple of past editions.'
}
}
needs_subscription = False
def get_browser(self, *args, **kwargs):
@@ -209,6 +214,7 @@ class Economist(BasicNewsRecipe):
return br
def publication_date(self):
edition_date = self.recipe_specific_options.get('date')
if edition_date:
return parse_only_date(edition_date, as_utc=False)
url = self.browser.open("https://www.economist.com/printedition").geturl()
@@ -231,6 +237,7 @@ class Economist(BasicNewsRecipe):
if use_archive:
def parse_index(self):
edition_date = self.recipe_specific_options.get('date')
# return self.economist_test_article()
# url = 'https://www.economist.com/weeklyedition/archive'
query = {
@@ -260,6 +267,7 @@ class Economist(BasicNewsRecipe):
return self.economist_return_index(ans)
def economist_parse_index(self, raw):
edition_date = self.recipe_specific_options.get('date')
if edition_date:
data = json.loads(raw)['data']['section']
else:
@@ -326,6 +334,7 @@ class Economist(BasicNewsRecipe):
self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold')
def parse_index(self):
edition_date = self.recipe_specific_options.get('date')
# return self.economist_test_article()
if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date
@@ -416,6 +425,7 @@ class Economist(BasicNewsRecipe):
return raw
def parse_index_from_printedition(self):
# return self.economist_test_article()
edition_date = self.recipe_specific_options.get('date')
if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date
self.timefmt = ' [' + edition_date + ']'

View File

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008-2018, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2024, Darko Miletic <darko.miletic at gmail.com>'
'''
pagina12.com.ar
'''
@@ -28,6 +28,7 @@ class Pagina12(BasicNewsRecipe):
delay = 1
simultaneous_downloads = 1
timeout = 8
needs_subscription = 'optional'
ignore_duplicate_articles = {'url'}
articles_are_obfuscated = True
temp_files = []
@@ -63,29 +64,23 @@ class Pagina12(BasicNewsRecipe):
]})
]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
br.open('https://www.pagina12.com.ar/')
if self.username is not None and self.password is not None:
br.open('https://auth.pagina12.com.ar/ingresar?redirect=https://www.pagina12.com.ar')
br.select_form(id='login')
br['email'] = self.username
br['password'] = self.password
br.submit()
return br
feeds = [
(u'Diario de hoy' , u'https://www.pagina12.com.ar/rss/edicion-impresa'),
(u'El Pais' , u'https://www.pagina12.com.ar/rss/secciones/el-pais/notas'),
(u'Economia' , u'https://www.pagina12.com.ar/rss/secciones/economia/notas'),
(u'Sociedad' , u'https://www.pagina12.com.ar/rss/secciones/sociedad/notas'),
(u'El Mundo' , u'https://www.pagina12.com.ar/rss/secciones/el-mundo/notas'),
(u'Deportes' , u'https://www.pagina12.com.ar/rss/secciones/deportes/notas'),
(u'Cultura' , u'https://www.pagina12.com.ar/rss/secciones/cultura/notas'),
(u'Universidad' , u'https://www.pagina12.com.ar/rss/secciones/universidad/notas'),
(u'Ciencia' , u'https://www.pagina12.com.ar/rss/secciones/ciencia/notas'),
(u'Psicologia' , u'https://www.pagina12.com.ar/rss/secciones/psicologia/notas'),
(u'Ajedrez' , u'https://www.pagina12.com.ar/rss/secciones/ajedrez/notas'),
(u'La Ventana' , u'https://www.pagina12.com.ar/rss/secciones/la-ventana/notas'),
(u'Dialogos' , u'https://www.pagina12.com.ar/rss/secciones/dialogos/notas'),
(u'Hoy' , u'https://www.pagina12.com.ar/rss/secciones/hoy/notas'),
(u'Plastica' , u'https://www.pagina12.com.ar/rss/secciones/plastica/notas'),
(u'Cartas de Lectores', u'https://www.pagina12.com.ar/rss/secciones/cartas-de-lectores/notas'),
(u'Espectaculos' , u'https://www.pagina12.com.ar/rss/suplementos/cultura-y-espectaculos/notas'),
(u'Radar' , u'https://www.pagina12.com.ar/rss/suplementos/radar/notas'),
(u'Radar libros' , u'https://www.pagina12.com.ar/rss/suplementos/radar-libros/notas'),
(u'Cash' , u'https://www.pagina12.com.ar/rss/suplementos/cash/notas'),
(u'Turismo' , u'https://www.pagina12.com.ar/rss/suplementos/turismo/notas'),
(u'Libero' , u'https://www.pagina12.com.ar/rss/suplementos/libero/notas'),
(u'NO' , u'https://www.pagina12.com.ar/rss/suplementos/no/notas'),
(u'Las 12' , u'https://www.pagina12.com.ar/rss/suplementos/las12/notas'),
(u'Soy' , u'https://www.pagina12.com.ar/rss/suplementos/soy/notas'),
@@ -99,8 +94,8 @@ class Pagina12(BasicNewsRecipe):
mydiv = soup.find('div', {'class' : lambda x: x and 'printed-edition-cover' in x.split()})
if mydiv:
for image in mydiv.findAll('img'):
if image['data-src'].startswith('https://images.pagina12.com.ar/styles/width700/public/'):
return image['data-src']
if image['src'].startswith('https://images.pagina12.com.ar/styles/width700/public/'):
return image['src']
return None
def get_obfuscated_article(self, url):

View File

@@ -6,6 +6,7 @@ from itertools import zip_longest
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
def media_bucket(x):
if x.get('type', '') == 'image':
if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']: