Update New York Review of Books

Fixes #1900136 [fetch fail New York Review of Books login version](https://bugs.launchpad.net/calibre/+bug/1900136)
Kovid Goyal 2020-10-16 21:29:33 +05:30
parent 612079b5bf
commit 64e2b05b5c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 80 additions and 113 deletions

View File

@@ -6,18 +6,21 @@ __docformat__ = 'restructuredtext en'
 '''
 nybooks.com
 '''
-import re
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
-def find_header(tag):
-    return tag.name == 'header' and ''.join(tag.parent['class']) == 'article'
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
 
 
 def absurl(url):
-    if url.startswith('/'):
-        url = 'http://www.nybooks.com' + url
+    if url.startswith('//'):
+        url = 'https:' + url
+    elif url.startswith('/'):
+        url = 'https://www.nybooks.com' + url
     return url
@@ -34,76 +37,53 @@ class NewYorkReviewOfBooks(BasicNewsRecipe):
     needs_subscription = True
 
     keep_only_tags = [
-        dict(name='section', attrs={'class': 'article_body'}),
-        dict(name=find_header),
-        dict(name='div', attrs={
-            'class': ['footnotes', 'for-subscribers-only']}),
+        dict(name='h1'),
+        classes('author article-col article-main-content'),
     ]
-
-    preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
-        m:'<head></head>')]
-
-    def print_version(self, url):
-        if '?' in url:
-            url = url.rpartition('?')[0]
-        return url + '?pagination=false'
+    remove_tags = [
+        classes('inline-ad'),
+    ]
+    remove_tags_after = classes('article-main-content')
 
     def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
-        br.open('http://www.nybooks.com/account/signin/')
-        br.select_form(nr=2)
-        br['user_login'] = self.username
-        br['user_password'] = self.password
+        br.open('https://www.nybooks.com/account/signin/')
+        br.select_form(id='loginform')
+        br['log'] = self.username
+        br['pwd'] = self.password
         br.submit()
         return br
 
-    def preprocess_html(self, soup):
-        header = soup.find('header')
-        body = soup.find('body')
-        body.insert(0, header)
-        header.find('div', attrs={'class': 'details'}).extract()
-        for i in soup.findAll('input'):
-            i.extract()
-        return soup
-
-    def postprocess_html(self, soup, first):
-        for img in soup.findAll('img', srcset=True):
-            del img['srcset']
-        return soup
-
     def parse_index(self):
-        soup = self.index_to_soup('http://www.nybooks.com/current-issue')
-        # from calibre.utils.ipython import ipython
-        # ipython({'soup': soup})
+        soup = self.index_to_soup('https://www.nybooks.com/current-issue')
 
         # Find cover
-        sidebar = soup.find('div', attrs={'class': 'issue_cover'})
-        if sidebar is not None:
-            img = sidebar.find('img', src=True)
-            self.cover_url = absurl(img['src'])
+        cover = soup.find('img', attrs={'class':'border-light-gray'})
+        if cover is not None:
+            self.cover_url = absurl(cover['src'])
             self.log('Found cover at:', self.cover_url)
 
         # Find date
-        div = soup.find('time', pubdate='pubdate')
+        div = soup.find('p', **classes('h2'))
         if div is not None:
             text = self.tag_to_string(div)
-            date = text.partition(u'\u2022')[0].strip()
-            self.timefmt = u' [%s]' % date
-            self.log('Issue date:', date)
+            self.timefmt = text
+            self.log('Issue date:', text)
 
         # Find TOC
-        tocs = soup.find('div', attrs={'class': 'current_issue'}).findAll(
-            'div', attrs={'class': 'articles_list'})
         articles = []
-        for toc in tocs:
-            for div in toc.findAll('div', attrs={'class': 'row'}):
-                h2 = div.find('h2')
-                title = self.tag_to_string(h2).strip()
-                author = self.tag_to_string(
-                    div.find('div', attrs={'class': 'author'})).strip()
-                title = title + u' (%s)' % author
-                url = absurl(h2.find('a', href=True)['href'])
-                desc = ''
-                for p in div.findAll('p', attrs={'class': lambda x: x and 'quiet' in x}):
-                    desc += self.tag_to_string(p)
-                self.log('Found article:', title)
-                self.log('\t', url)
-                self.log('\t', desc)
+        for h4 in soup.findAll('h4'):
+            title = self.tag_to_string(h4).strip()
+            url = absurl(h4.find('a')['href'])
+            author = self.tag_to_string(h4.parent.parent.find('a'))
+            title = title + ' (%s)' % author
+            desc = ''
+            div = h4
+            while div.next_sibling:
+                div = div.next_sibling
+                desc += self.tag_to_string(div).strip()
+            self.log('Found article:', title)
+            self.log('\t', url)
+            self.log('\t', desc)
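
For reference, a minimal standalone sketch (not part of the commit) of how the new classes() helper behaves as a soup attribute matcher. The HTML and class values below are invented for illustration, and plain BeautifulSoup 4 stands in for calibre's soup builder:

```python
from bs4 import BeautifulSoup


def classes(classes):
    # Match any tag whose class attribute shares at least one name with
    # the space-separated list passed in.
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


html = '''
<article>
  <h1>Headline</h1>
  <div class="author">A. Critic</div>
  <div class="article-col article-main-content">Body text</div>
  <div class="inline-ad">Advertisement</div>
</article>
'''
soup = BeautifulSoup(html, 'html.parser')

# keep_only_tags-style selection: matches the author and body divs
for tag in soup.findAll(**classes('author article-col article-main-content')):
    print(tag.get('class'))

# remove_tags-style selection: matches the ad
print(soup.find(**classes('inline-ad')).get_text())
```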

View File

@@ -6,18 +6,21 @@ __docformat__ = 'restructuredtext en'
 '''
 nybooks.com
 '''
-import re
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
-def find_header(tag):
-    return tag.name == 'header' and ''.join(tag.parent['class']) == 'article'
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
 
 
 def absurl(url):
-    if url.startswith('/'):
-        url = 'http://www.nybooks.com' + url
+    if url.startswith('//'):
+        url = 'https:' + url
+    elif url.startswith('/'):
+        url = 'https://www.nybooks.com' + url
     return url
@@ -33,60 +36,44 @@ class NewYorkReviewOfBooks(BasicNewsRecipe):
     no_javascript = True
 
     keep_only_tags = [
-        dict(name='section', attrs={'class': 'article_body'}),
-        dict(name=find_header),
-        dict(name='div', attrs={
-            'class': ['footnotes', 'for-subscribers-only']}),
+        dict(name='h1'),
+        classes('author article-col article-main-content'),
     ]
-
-    preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
-        m:'<head></head>')]
-
-    def print_version(self, url):
-        return url + '?pagination=false'
-
-    def preprocess_html(self, soup):
-        header = soup.find('header')
-        body = soup.find('body')
-        body.insert(0, header)
-        header.find('div', attrs={'class': 'details'}).extract()
-        for i in soup.findAll('input'):
-            i.extract()
-        return soup
+    remove_tags = [
+        classes('inline-ad'),
+    ]
+    remove_tags_after = classes('article-main-content')
 
     def parse_index(self):
-        soup = self.index_to_soup('http://www.nybooks.com/current-issue')
-        # from calibre.utils.ipython import ipython
-        # ipython({'soup': soup})
+        soup = self.index_to_soup('https://www.nybooks.com/current-issue')
 
         # Find cover
-        sidebar = soup.find('div', attrs={'class': 'issue_cover'})
-        if sidebar is not None:
-            img = sidebar.find('img', src=True)
-            self.cover_url = absurl(img['src'])
+        cover = soup.find('img', attrs={'class':'border-light-gray'})
+        if cover is not None:
+            self.cover_url = absurl(cover['src'])
             self.log('Found cover at:', self.cover_url)
 
         # Find date
-        div = soup.find('time', pubdate='pubdate')
+        div = soup.find('p', **classes('h2'))
         if div is not None:
             text = self.tag_to_string(div)
-            date = text.partition(u'\u2022')[0].strip()
-            self.timefmt = u' [%s]' % date
-            self.log('Issue date:', date)
+            self.timefmt = text
+            self.log('Issue date:', text)
 
         # Find TOC
-        tocs = soup.find('div', attrs={'class': 'current_issue'}).findAll(
-            'div', attrs={'class': 'articles_list'})
         articles = []
-        for toc in tocs:
-            for div in toc.findAll('div', attrs={'class': 'row'}):
-                h2 = div.find('h2')
-                title = self.tag_to_string(h2).strip()
-                author = self.tag_to_string(
-                    div.find('div', attrs={'class': 'author'})).strip()
-                title = title + u' (%s)' % author
-                url = absurl(h2.find('a', href=True)['href'])
-                desc = ''
-                for p in div.findAll('p', attrs={'class': lambda x: x and 'quiet' in x}):
-                    desc += self.tag_to_string(p)
-                self.log('Found article:', title)
-                self.log('\t', url)
-                self.log('\t', desc)
+        for h4 in soup.findAll('h4'):
+            title = self.tag_to_string(h4).strip()
+            url = absurl(h4.find('a')['href'])
+            author = self.tag_to_string(h4.parent.parent.find('a'))
+            title = title + ' (%s)' % author
+            desc = ''
+            div = h4
+            while div.next_sibling:
+                div = div.next_sibling
+                desc += self.tag_to_string(div).strip()
+            self.log('Found article:', title)
+            self.log('\t', url)
+            self.log('\t', desc)
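
A similar standalone sketch (again with invented HTML, and get_text() standing in for calibre's tag_to_string()) of the sibling walk the new parse_index() uses to collect each article's description after its h4 headline:

```python
from bs4 import BeautifulSoup

html = '''
<div class="article">
  <h4><a href="/articles/example/">An Example Review</a></h4>
  <p>First line of the teaser.</p>
  <p>Second line of the teaser.</p>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')

h4 = soup.find('h4')
title = h4.get_text().strip()
desc = ''
node = h4
# Everything that follows the headline inside the same block is treated
# as the article description.
while node.next_sibling:
    node = node.next_sibling
    text = str(node) if isinstance(node, str) else node.get_text()
    desc += text.strip()

print(title)  # An Example Review
print(desc)   # First line of the teaser.Second line of the teaser.
```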