From 624482eddd08c6bfbec0d37aa8cc129139cfec8f Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:17:26 +0530 Subject: [PATCH] Update New Yorker --- recipes/barrons.recipe | 6 +- recipes/caravan_magazine.recipe | 4 +- recipes/new_yorker.recipe | 100 ++++++++++---------------------- 3 files changed, 37 insertions(+), 73 deletions(-) diff --git a/recipes/barrons.recipe b/recipes/barrons.recipe index 0a5d08e7f5..fd833aa2f5 100644 --- a/recipes/barrons.recipe +++ b/recipes/barrons.recipe @@ -77,10 +77,14 @@ class barrons(BasicNewsRecipe): return br def parse_index(self): + self.log( + '\n***\nif this recipe fails, report it on: ' + 'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n' + ) archive = self.index_to_soup('https://www.barrons.com/magazine?archives=' + date.today().strftime('%Y')) issue = archive.find(**prefixed_classes('BarronsTheme--archive-box--')) self.timefmt = ' [' + self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--date--'))) + ']' - self.log(self.timefmt) + self.description = self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--headline--'))) self.cover_url = issue.img['src'].split('?')[0] ans = defaultdict(list) diff --git a/recipes/caravan_magazine.recipe b/recipes/caravan_magazine.recipe index 7896deda72..c271c3a92c 100644 --- a/recipes/caravan_magazine.recipe +++ b/recipes/caravan_magazine.recipe @@ -92,8 +92,8 @@ class CaravanMagazine(BasicNewsRecipe): 'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n' ) api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue' - # https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input= - # %7B%220%22%3A%7B%22json%22%3A%7B%22month%22%3A 2 %2C%22year%22%3A 2024 %7D%7D%7D + # api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&' + \ + # 'input=%7B%220%22%3A%7B%22json%22%3A%7B%22month%22%3A' + '2' + '%2C%22year%22%3A' + '2024' + '%7D%7D%7D' # input={"0":{"json":{"month":2,"year":2024}}} raw = self.index_to_soup(api, raw=True) data = json.loads(raw)['result']['data']['json'] diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index 7a301aeb43..0423f57e2d 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -3,7 +3,7 @@ # License: GPLv3 Copyright: 2016, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals -from collections import OrderedDict +from collections import defaultdict from calibre import browser from calibre.ebooks.BeautifulSoup import Tag @@ -15,14 +15,6 @@ def absurl(x): x = 'https://www.newyorker.com' + x return x - -def new_tag(soup, name, attrs=()): - impl = getattr(soup, 'new_tag', None) - if impl is not None: - return impl(name, attrs=dict(attrs)) - return Tag(soup, name, attrs=attrs or None) - - class NewYorker(BasicNewsRecipe): title = "The New Yorker Magazine" @@ -36,22 +28,16 @@ class NewYorker(BasicNewsRecipe): encoding = 'utf-8' extra_css = ''' img { display:block; margin:0 auto; } - .byline { font-size:smaller; font-weight: bold;} + .rubric__name, .byline, time, .caption { font-size:small; } + .byline, .rubric__name { font-size:smaller; font-weight: bold;} h3 { margin-bottom: 6px; } - .caption { font-size: smaller; font-style: italic; font-weight: normal; } + .caption, .contributors { font-size: smaller; font-style: italic; font-weight: normal; } + .content-header__accreditation { font-style:italic; } ''' keep_only_tags = [ - prefixed_classes( - 'SplitScreenContentHeaderHed- SplitScreenContentHeaderDek- SplitScreenContentHeaderByline-' - ' SplitScreenContentHeaderLeadWrapper-' - ), classes( - 'split-screen-content-header__dek split-screen-content-header__hed' - ' content-header__dek content-header__hed content-header__publish-date content-header__lede-block' - ' content-header__rubric--issue-date content-header__lead-asset' - ' split-screen-content-header__publish-date split-screen-content-header__lede-block' - ' article__body bylines featured-image byline-and-date inset-mobile-crop-image hero-image-caption' + 'article__content-header article__body contributors' ), ] remove_tags = [ @@ -127,57 +113,31 @@ class NewYorker(BasicNewsRecipe): soup = self.index_to_soup( 'https://www.newyorker.com/magazine?intcid=magazine') - stories = OrderedDict() # So we can list sections in order + feeds_dict = defaultdict(list) + for section in soup.findAll('section', + attrs={'class': lambda x: x and 'SummaryRiverSection-' in x}): + for h2 in section.findAll(attrs={'class':lambda x: x and 'SectionTitleHed-' in x}): + secname = self.tag_to_string(h2) + self.log(secname) + articles = [] + for a in section.findAll('a', href=True, attrs={'class':lambda x: x and 'summary-item__hed-link' in x}): + section = secname + url = absurl(a['href']) + title = self.tag_to_string(a) + desc = '' + summ = a.find_next_sibling(attrs={'class':lambda x: x and 'summary-item__dek' in x}) + if summ: + desc = self.tag_to_string(summ) + byl = a.find_next_sibling(attrs={'class':lambda x: x and 'summary-item__byline-' in x}) + if byl: + desc = self.tag_to_string(byl) + ' | ' + desc + rub = a.find_previous_sibling(attrs={'class':lambda x: x and 'summary-item__rubric' in x}) + if rub: + desc = self.tag_to_string(rub) + ' | ' + desc + self.log('\t', title, '\n\t', desc, '\n\t\t', url) + feeds_dict[section].append({"title": title, "url": url, "description": desc}) + return feeds_dict.items() - # Iterate sections of content - - for section_soup in soup.findAll( - attrs={'class': lambda x: x and 'MagazinePageSection__section___21cc7' in x}): - section = section_soup.find('h2').text - self.log("Found section:", section) - - # Iterate stories in section - - is_mail_section = (section == "Mail") - - if is_mail_section: - cname = "Link__link___" - else: - cname = "River__riverItemContent___" - - for story in section_soup.findAll( - attrs={'class': lambda x: x and cname in x}): - - title = "" - url = "" - desc = "" - - if is_mail_section: - title = story.text - url = absurl(story['href']) - else: - h4 = story.find('h4') - title = self.tag_to_string(h4) - a = story.find('h4').parent - url = absurl(a['href']) - # Get description - body = story.find(attrs={'class': 'River__dek___CayIg'}) - if body is not None: - desc = str(body.contents[0]) - - self.log('Found article:', title) - self.log('\t' + url) - self.log('\t' + desc) - self.log('') - - if section not in stories: - stories[section] = [] - stories[section].append({ - 'title': title, - 'url': url, - 'description': desc}) - - return [(k, stories[k]) for k, v in stories.items()] # The New Yorker changes the content it delivers based on cookies, so the # following ensures that we send no cookies