From 517859adcb410dc602a9545f113e615651445863 Mon Sep 17 00:00:00 2001 From: ping Date: Tue, 12 Apr 2022 09:34:30 +0800 Subject: [PATCH] Fix recipe for scmp.com --- recipes/scmp.recipe | 273 ++++++++++++++++++++++++++++++-------------- 1 file changed, 186 insertions(+), 87 deletions(-) diff --git a/recipes/scmp.recipe b/recipes/scmp.recipe index 7eb63a8a38..915022c55e 100644 --- a/recipes/scmp.recipe +++ b/recipes/scmp.recipe @@ -1,105 +1,204 @@ -''' +""" scmp.com -''' +""" -from mechanize import Request -import json from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag - - -def classes(classes): - q = frozenset(classes.split(' ')) - return dict(attrs={ - 'class': lambda x: x and frozenset(x.split()).intersection(q)}) - - -def new_tag(soup, name, attrs=()): - impl = getattr(soup, 'new_tag', None) - if impl is not None: - return impl(name, attrs=dict(attrs)) - return Tag(soup, name, attrs=attrs or None) +from calibre.ebooks.BeautifulSoup import Tag, BeautifulSoup +import re +import json +from datetime import datetime, timezone, timedelta class SCMP(BasicNewsRecipe): - title = 'South China Morning Post' - __author__ = 'llam' + title = "South China Morning Post" + __author__ = "llam" description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China." # noqa - publisher = 'South China Morning Post Publishers Ltd.' - oldest_article = 2 - delay = 1 - max_articles_per_feed = 200 + publisher = "South China Morning Post Publishers Ltd." + oldest_article = 1 + max_articles_per_feed = 25 no_stylesheets = True - encoding = 'utf-8' + remove_javascript = True + encoding = "utf-8" use_embedded_content = False - language = 'en_CN' + language = "en" remove_empty_feeds = True - needs_subscription = 'optional' - publication_type = 'newspaper' + publication_type = "newspaper" + auto_cleanup = False + compress_news_images = True + ignore_duplicate_articles = {"title", "url"} - keep_only_tags = [ - dict(name='h1'), - classes('info__subHeadline article-author main__right'), - ] + # used when unable to extract article from