From 906c0aa79d20c6e5745a2bdf84f7d1401a73eac4 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 23 Aug 2024 21:51:37 +0530
Subject: [PATCH] Update project_syndicate.recipe

---
 recipes/project_syndicate.recipe | 38 ++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/recipes/project_syndicate.recipe b/recipes/project_syndicate.recipe
index e6addd446f..1b0e0e74af 100644
--- a/recipes/project_syndicate.recipe
+++ b/recipes/project_syndicate.recipe
@@ -1,4 +1,5 @@
-from calibre.ptempfile import PersistentTemporaryFile
+#!/usr/bin/env python
+import random
 from calibre.scraper.simple import read_url
 from calibre.web.feeds.news import BasicNewsRecipe
 
@@ -27,11 +28,26 @@ class projectsynd(BasicNewsRecipe):
     articles_are_obfuscated = True
 
     def get_obfuscated_article(self, url):
-        raw = read_url(self.storage, 'https://archive.is/latest/' + url)
-        pt = PersistentTemporaryFile('.html')
-        pt.write(raw.encode('utf-8'))
-        pt.close()
-        return pt.name
+        dom = random.choice(('fo', 'is', 'li', 'md', 'ph', 'vn'))
+        data = read_url(self.storage, 'https://archive.' + dom + '/latest/' + url.split('?')[0])
+        return {
+            'data': data,
+            'url': url.split('?')[0]
+        }
+
+    recipe_specific_options = {
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 0.5, gives you articles from the past 12 hours',
+            'default': str(oldest_article)
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        d = self.recipe_specific_options.get('days')
+        if d and isinstance(d, str):
+            self.oldest_article = float(d)
 
     extra_css = '''
         [itemprop^="associatedMedia"]{ font-size:small; text-align:center; }
@@ -47,7 +63,7 @@ class projectsynd(BasicNewsRecipe):
     ]
 
     remove_tags = [
-        dict(name=['button', 'svg']),
+        dict(name=['button', 'svg', 'source']),
         dict(attrs={'data-message-area':True}),
         dict(attrs={'id':['editorspicks', 'movie_player']}),
         dict(name='aside', attrs={'id':lambda x: x and x.startswith('comments-')})
@@ -64,14 +80,18 @@ class projectsynd(BasicNewsRecipe):
     ]
 
     def preprocess_html(self, soup):
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
         for img in soup.findAll('img', attrs={'old-src':True}):
             img['src'] = img['old-src'].replace('medium', 'xlarge')
         if abst := soup.find(attrs={'itemprop':'abstract'}):
             if div := abst.find('div'):
                 div.name = 'p'
                 div['class'] = 'sub'
-        for div in soup.findAll('div', attrs={'data-line-id':True}):
-            div.name = 'p'
+        bdy = soup.find(attrs={'itemprop':'articleBody'})
+        if bdy:
+            for div in bdy.findAll('div', recursive=False):
+                div.name = 'p'
         for a in soup.findAll('a', href=True):
             a['href'] = 'http' + a['href'].split('http')[-1]
         return soup