From dff08d5ebde82cf4396a19c0b293e8905077fa4b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 15 Mar 2018 10:09:18 +0530
Subject: [PATCH] Update NYT

---
 recipes/nytimes.recipe     | 62 +++++++++++++++++++++++++++++++++-----
 recipes/nytimes_sub.recipe | 62 +++++++++++++++++++++++++++++++++-----
 2 files changed, 110 insertions(+), 14 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index c34e770639..f9796a23b6 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -10,6 +10,7 @@ import re
 from calibre import strftime
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
 
 is_web_edition = True
 oldest_web_edition_article = 7  # days
@@ -79,10 +80,6 @@ class NewYorkTimes(BasicNewsRecipe):
     compress_news_images = True
     compress_news_images_auto_size = 5
 
-    keep_only_tags = [
-        dict(id='story-header'),
-        classes('story-body-supplemental story-interrupter'),
-    ]
     remove_tags = [
         dict(attrs={'aria-label':'tools'.split()}),
         dict(attrs={'data-videoid':True}),
@@ -91,11 +88,37 @@ class NewYorkTimes(BasicNewsRecipe):
         dict(name='a', href=lambda x: x and '#story-continues-' in x),
         dict(name='a', href=lambda x: x and '#whats-next' in x),
         dict(id=lambda x: x and 'sharetools-' in x),
-        dict(id='newsletter-promo supported-by-ad'.split()),
-        classes('story-print-citation supported-by accessibility-ad-header visually-hidden'),
+        dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()),
+        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
+        dict(attrs={'class': lambda x: x and ('SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x)}),
     ]
 
-    def postprocess_html(self, soup, first_fetch):
+    def preprocess_html(self, soup):
+        article = soup.find(id='story')
+        # The NYT is apparently A/B testing a new page layout
+        has_supplemental = article.find(**classes('story-body-supplemental')) is not None
+        if has_supplemental:
+            keep_only_tags = [
+                dict(id='story-header'),
+                classes('story-body-supplemental story-interrupter'),
+            ]
+        else:
+            keep_only_tags = [
+                dict(id='story')
+            ]
+        body = Tag(soup, 'body')
+        for spec in keep_only_tags:
+            for tag in soup.find('body').findAll(**spec):
+                body.insert(len(body.contents), tag)
+        soup.find('body').replaceWith(body)
+
+        # Remove the header bar with New York Times as an SVG in it
+        for svg in soup.findAll('svg'):
+            h = svg.findParent('header')
+            if h is not None:
+                h.extract()
+
+        # Add a space to the dateline
         t = soup.find(**classes('dateline'))
         if t is not None:
             t.insert(0, ' ')
@@ -217,3 +240,28 @@ class NewYorkTimes(BasicNewsRecipe):
         if is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
+
+    # The NYT occasionally returns bogus articles for some reason. Just in case
+    # it is because of cookies, don't store cookies
+    def get_browser(self, *args, **kwargs):
+        return self
+
+    def clone_browser(self, *args, **kwargs):
+        return self.get_browser()
+
+    def open_novisit(self, *args, **kwargs):
+        from calibre import browser
+        br = browser()
+        response = br.open_novisit(*args, **kwargs)
+        # headers = response.info()
+        # if headers.get('X-PageType') == 'vi-story':
+        #     import tempfile
+        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
+        #         f.write(response.read())
+        #     import time
+        #     time.sleep(1)
+        #     br = browser()
+        #     response = br.open_novisit(*args, **kwargs)
+        return response
+
+    open = open_novisit
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 6a506f6a2e..a309e98880 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -10,6 +10,7 @@ import re
 from calibre import strftime
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
 
 is_web_edition = False
 oldest_web_edition_article = 7  # days
@@ -79,10 +80,6 @@ class NewYorkTimes(BasicNewsRecipe):
     compress_news_images = True
     compress_news_images_auto_size = 5
 
-    keep_only_tags = [
-        dict(id='story-header'),
-        classes('story-body-supplemental story-interrupter'),
-    ]
     remove_tags = [
         dict(attrs={'aria-label':'tools'.split()}),
         dict(attrs={'data-videoid':True}),
@@ -91,11 +88,37 @@ class NewYorkTimes(BasicNewsRecipe):
         dict(name='a', href=lambda x: x and '#story-continues-' in x),
         dict(name='a', href=lambda x: x and '#whats-next' in x),
         dict(id=lambda x: x and 'sharetools-' in x),
-        dict(id='newsletter-promo supported-by-ad'.split()),
-        classes('story-print-citation supported-by accessibility-ad-header visually-hidden'),
+        dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()),
+        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
+        dict(attrs={'class': lambda x: x and ('SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x)}),
     ]
 
-    def postprocess_html(self, soup, first_fetch):
+    def preprocess_html(self, soup):
+        article = soup.find(id='story')
+        # The NYT is apparently A/B testing a new page layout
+        has_supplemental = article.find(**classes('story-body-supplemental')) is not None
+        if has_supplemental:
+            keep_only_tags = [
+                dict(id='story-header'),
+                classes('story-body-supplemental story-interrupter'),
+            ]
+        else:
+            keep_only_tags = [
+                dict(id='story')
+            ]
+        body = Tag(soup, 'body')
+        for spec in keep_only_tags:
+            for tag in soup.find('body').findAll(**spec):
+                body.insert(len(body.contents), tag)
+        soup.find('body').replaceWith(body)
+
+        # Remove the header bar with New York Times as an SVG in it
+        for svg in soup.findAll('svg'):
+            h = svg.findParent('header')
+            if h is not None:
+                h.extract()
+
+        # Add a space to the dateline
         t = soup.find(**classes('dateline'))
         if t is not None:
             t.insert(0, ' ')
@@ -217,3 +240,28 @@ class NewYorkTimes(BasicNewsRecipe):
         if is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
+
+    # The NYT occasionally returns bogus articles for some reason. Just in case
+    # it is because of cookies, don't store cookies
+    def get_browser(self, *args, **kwargs):
+        return self
+
+    def clone_browser(self, *args, **kwargs):
+        return self.get_browser()
+
+    def open_novisit(self, *args, **kwargs):
+        from calibre import browser
+        br = browser()
+        response = br.open_novisit(*args, **kwargs)
+        # headers = response.info()
+        # if headers.get('X-PageType') == 'vi-story':
+        #     import tempfile
+        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
+        #         f.write(response.read())
+        #     import time
+        #     time.sleep(1)
+        #     br = browser()
+        #     response = br.open_novisit(*args, **kwargs)
+        return response
+
+    open = open_novisit