def prefixed_classes(classes):
    """Build a tag spec matching elements whose class starts with a prefix.

    :param classes: whitespace-separated class-name prefixes, e.g.
        ``'Headline__StyledHeadline- ArticleBody__Container-'``.
    :return: ``{'attrs': {'class': matcher}}`` where ``matcher(value)`` is
        True when any whitespace-separated name in ``value`` starts with one
        of the prefixes, and False otherwise (including for an empty or
        missing ``value``).
    """
    # split() with no argument discards empty tokens. An empty prefix must
    # never get through: startswith('') is true for every string, which would
    # make the matcher accept any element that has a class at all.
    prefixes = tuple(frozenset(classes.split()))

    def matcher(x):
        if not x:
            return False
        # str.startswith accepts a tuple and tests all prefixes in one call.
        return any(candidate.startswith(prefixes) for candidate in x.split())

    return {'attrs': {'class': matcher}}
def prefixed_classes(classes):
    """Build a tag spec matching elements whose class starts with a prefix.

    :param classes: whitespace-separated class-name prefixes, e.g.
        ``'Headline__StyledHeadline- ArticleBody__Container-'``.
    :return: ``{'attrs': {'class': matcher}}`` where ``matcher(value)`` is
        True when any whitespace-separated name in ``value`` starts with one
        of the prefixes, and False otherwise (including for an empty or
        missing ``value``).
    """
    # split() with no argument discards empty tokens. An empty prefix must
    # never get through: startswith('') is true for every string, which would
    # make the matcher accept any element that has a class at all.
    prefixes = tuple(frozenset(classes.split()))

    def matcher(x):
        if not x:
            return False
        # str.startswith accepts a tuple and tests all prefixes in one call.
        return any(candidate.startswith(prefixes) for candidate in x.split())

    return {'attrs': {'class': matcher}}