Merge branch 'kovidgoyal:master' into tolino

This commit is contained in:
beedaddy 2024-07-19 22:42:41 +02:00 committed by GitHub
commit 01a7e09ac9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 42 additions and 26 deletions

View File

@ -15,8 +15,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
from html5_parser import parse from html5_parser import parse
from lxml import etree from lxml import etree
# For past editions, set date to, for example, '2020-11-28'.
edition_date = None
use_archive = True use_archive = True
@ -71,7 +69,7 @@ if use_archive:
except Exception: except Exception:
date = data['datePublished'] date = data['datePublished']
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone) dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
dt = dt.strftime('%b %d, %Y, %I:%M %p') dt = dt.strftime('%b %d, %Y %I:%M %p')
if data['dateline'] is None: if data['dateline'] is None:
E(article, 'p', dt, style='color: gray; font-size:small;') E(article, 'p', dt, style='color: gray; font-size:small;')
else: else:
@ -199,6 +197,13 @@ class Economist(BasicNewsRecipe):
# downloaded with connection reset by peer (104) errors. # downloaded with connection reset by peer (104) errors.
delay = 1 delay = 1
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (YYYY-MM-DD format)',
'long': 'For example, 2024-07-19\nThis seems to work only for a couple of past editions.'
}
}
needs_subscription = False needs_subscription = False
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
@ -209,6 +214,7 @@ class Economist(BasicNewsRecipe):
return br return br
def publication_date(self): def publication_date(self):
edition_date = self.recipe_specific_options.get('date')
if edition_date: if edition_date:
return parse_only_date(edition_date, as_utc=False) return parse_only_date(edition_date, as_utc=False)
url = self.browser.open("https://www.economist.com/printedition").geturl() url = self.browser.open("https://www.economist.com/printedition").geturl()
@ -231,6 +237,7 @@ class Economist(BasicNewsRecipe):
if use_archive: if use_archive:
def parse_index(self): def parse_index(self):
edition_date = self.recipe_specific_options.get('date')
# return self.economist_test_article() # return self.economist_test_article()
# url = 'https://www.economist.com/weeklyedition/archive' # url = 'https://www.economist.com/weeklyedition/archive'
query = { query = {
@ -260,6 +267,7 @@ class Economist(BasicNewsRecipe):
return self.economist_return_index(ans) return self.economist_return_index(ans)
def economist_parse_index(self, raw): def economist_parse_index(self, raw):
edition_date = self.recipe_specific_options.get('date')
if edition_date: if edition_date:
data = json.loads(raw)['data']['section'] data = json.loads(raw)['data']['section']
else: else:
@ -326,6 +334,7 @@ class Economist(BasicNewsRecipe):
self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold') self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold')
def parse_index(self): def parse_index(self):
edition_date = self.recipe_specific_options.get('date')
# return self.economist_test_article() # return self.economist_test_article()
if edition_date: if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date url = 'https://www.economist.com/weeklyedition/' + edition_date
@ -416,6 +425,7 @@ class Economist(BasicNewsRecipe):
return raw return raw
def parse_index_from_printedition(self): def parse_index_from_printedition(self):
# return self.economist_test_article() # return self.economist_test_article()
edition_date = self.recipe_specific_options.get('date')
if edition_date: if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date url = 'https://www.economist.com/weeklyedition/' + edition_date
self.timefmt = ' [' + edition_date + ']' self.timefmt = ' [' + edition_date + ']'

View File

@ -15,8 +15,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
from html5_parser import parse from html5_parser import parse
from lxml import etree from lxml import etree
# For past editions, set date to, for example, '2020-11-28'.
edition_date = None
use_archive = True use_archive = True
@ -71,7 +69,7 @@ if use_archive:
except Exception: except Exception:
date = data['datePublished'] date = data['datePublished']
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone) dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
dt = dt.strftime('%b %d, %Y, %I:%M %p') dt = dt.strftime('%b %d, %Y %I:%M %p')
if data['dateline'] is None: if data['dateline'] is None:
E(article, 'p', dt, style='color: gray; font-size:small;') E(article, 'p', dt, style='color: gray; font-size:small;')
else: else:
@ -199,6 +197,13 @@ class Economist(BasicNewsRecipe):
# downloaded with connection reset by peer (104) errors. # downloaded with connection reset by peer (104) errors.
delay = 1 delay = 1
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (YYYY-MM-DD format)',
'long': 'For example, 2024-07-19\nThis seems to work only for a couple of past editions.'
}
}
needs_subscription = False needs_subscription = False
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
@ -209,6 +214,7 @@ class Economist(BasicNewsRecipe):
return br return br
def publication_date(self): def publication_date(self):
edition_date = self.recipe_specific_options.get('date')
if edition_date: if edition_date:
return parse_only_date(edition_date, as_utc=False) return parse_only_date(edition_date, as_utc=False)
url = self.browser.open("https://www.economist.com/printedition").geturl() url = self.browser.open("https://www.economist.com/printedition").geturl()
@ -231,6 +237,7 @@ class Economist(BasicNewsRecipe):
if use_archive: if use_archive:
def parse_index(self): def parse_index(self):
edition_date = self.recipe_specific_options.get('date')
# return self.economist_test_article() # return self.economist_test_article()
# url = 'https://www.economist.com/weeklyedition/archive' # url = 'https://www.economist.com/weeklyedition/archive'
query = { query = {
@ -260,6 +267,7 @@ class Economist(BasicNewsRecipe):
return self.economist_return_index(ans) return self.economist_return_index(ans)
def economist_parse_index(self, raw): def economist_parse_index(self, raw):
edition_date = self.recipe_specific_options.get('date')
if edition_date: if edition_date:
data = json.loads(raw)['data']['section'] data = json.loads(raw)['data']['section']
else: else:
@ -326,6 +334,7 @@ class Economist(BasicNewsRecipe):
self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold') self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold')
def parse_index(self): def parse_index(self):
edition_date = self.recipe_specific_options.get('date')
# return self.economist_test_article() # return self.economist_test_article()
if edition_date: if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date url = 'https://www.economist.com/weeklyedition/' + edition_date
@ -416,6 +425,7 @@ class Economist(BasicNewsRecipe):
return raw return raw
def parse_index_from_printedition(self): def parse_index_from_printedition(self):
# return self.economist_test_article() # return self.economist_test_article()
edition_date = self.recipe_specific_options.get('date')
if edition_date: if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date url = 'https://www.economist.com/weeklyedition/' + edition_date
self.timefmt = ' [' + edition_date + ']' self.timefmt = ' [' + edition_date + ']'

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2018, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2024, Darko Miletic <darko.miletic at gmail.com>'
''' '''
pagina12.com.ar pagina12.com.ar
''' '''
@ -28,6 +28,7 @@ class Pagina12(BasicNewsRecipe):
delay = 1 delay = 1
simultaneous_downloads = 1 simultaneous_downloads = 1
timeout = 8 timeout = 8
needs_subscription = 'optional'
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
articles_are_obfuscated = True articles_are_obfuscated = True
temp_files = [] temp_files = []
@ -63,29 +64,23 @@ class Pagina12(BasicNewsRecipe):
]}) ]})
] ]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
br.open('https://www.pagina12.com.ar/')
if self.username is not None and self.password is not None:
br.open('https://auth.pagina12.com.ar/ingresar?redirect=https://www.pagina12.com.ar')
br.select_form(id='login')
br['email'] = self.username
br['password'] = self.password
br.submit()
return br
feeds = [ feeds = [
(u'Diario de hoy' , u'https://www.pagina12.com.ar/rss/edicion-impresa'), (u'Diario de hoy' , u'https://www.pagina12.com.ar/rss/edicion-impresa'),
(u'El Pais' , u'https://www.pagina12.com.ar/rss/secciones/el-pais/notas'),
(u'Economia' , u'https://www.pagina12.com.ar/rss/secciones/economia/notas'),
(u'Sociedad' , u'https://www.pagina12.com.ar/rss/secciones/sociedad/notas'),
(u'El Mundo' , u'https://www.pagina12.com.ar/rss/secciones/el-mundo/notas'),
(u'Deportes' , u'https://www.pagina12.com.ar/rss/secciones/deportes/notas'),
(u'Cultura' , u'https://www.pagina12.com.ar/rss/secciones/cultura/notas'),
(u'Universidad' , u'https://www.pagina12.com.ar/rss/secciones/universidad/notas'),
(u'Ciencia' , u'https://www.pagina12.com.ar/rss/secciones/ciencia/notas'),
(u'Psicologia' , u'https://www.pagina12.com.ar/rss/secciones/psicologia/notas'),
(u'Ajedrez' , u'https://www.pagina12.com.ar/rss/secciones/ajedrez/notas'),
(u'La Ventana' , u'https://www.pagina12.com.ar/rss/secciones/la-ventana/notas'),
(u'Dialogos' , u'https://www.pagina12.com.ar/rss/secciones/dialogos/notas'),
(u'Hoy' , u'https://www.pagina12.com.ar/rss/secciones/hoy/notas'),
(u'Plastica' , u'https://www.pagina12.com.ar/rss/secciones/plastica/notas'),
(u'Cartas de Lectores', u'https://www.pagina12.com.ar/rss/secciones/cartas-de-lectores/notas'),
(u'Espectaculos' , u'https://www.pagina12.com.ar/rss/suplementos/cultura-y-espectaculos/notas'), (u'Espectaculos' , u'https://www.pagina12.com.ar/rss/suplementos/cultura-y-espectaculos/notas'),
(u'Radar' , u'https://www.pagina12.com.ar/rss/suplementos/radar/notas'), (u'Radar' , u'https://www.pagina12.com.ar/rss/suplementos/radar/notas'),
(u'Radar libros' , u'https://www.pagina12.com.ar/rss/suplementos/radar-libros/notas'), (u'Radar libros' , u'https://www.pagina12.com.ar/rss/suplementos/radar-libros/notas'),
(u'Cash' , u'https://www.pagina12.com.ar/rss/suplementos/cash/notas'), (u'Cash' , u'https://www.pagina12.com.ar/rss/suplementos/cash/notas'),
(u'Turismo' , u'https://www.pagina12.com.ar/rss/suplementos/turismo/notas'),
(u'Libero' , u'https://www.pagina12.com.ar/rss/suplementos/libero/notas'),
(u'NO' , u'https://www.pagina12.com.ar/rss/suplementos/no/notas'), (u'NO' , u'https://www.pagina12.com.ar/rss/suplementos/no/notas'),
(u'Las 12' , u'https://www.pagina12.com.ar/rss/suplementos/las12/notas'), (u'Las 12' , u'https://www.pagina12.com.ar/rss/suplementos/las12/notas'),
(u'Soy' , u'https://www.pagina12.com.ar/rss/suplementos/soy/notas'), (u'Soy' , u'https://www.pagina12.com.ar/rss/suplementos/soy/notas'),
@ -99,8 +94,8 @@ class Pagina12(BasicNewsRecipe):
mydiv = soup.find('div', {'class' : lambda x: x and 'printed-edition-cover' in x.split()}) mydiv = soup.find('div', {'class' : lambda x: x and 'printed-edition-cover' in x.split()})
if mydiv: if mydiv:
for image in mydiv.findAll('img'): for image in mydiv.findAll('img'):
if image['data-src'].startswith('https://images.pagina12.com.ar/styles/width700/public/'): if image['src'].startswith('https://images.pagina12.com.ar/styles/width700/public/'):
return image['data-src'] return image['src']
return None return None
def get_obfuscated_article(self, url): def get_obfuscated_article(self, url):

View File

@ -6,6 +6,7 @@ from itertools import zip_longest
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre.web.feeds.news import BasicNewsRecipe, classes
def media_bucket(x): def media_bucket(x):
if x.get('type', '') == 'image': if x.get('type', '') == 'image':
if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']: if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']: