One more fix for NYTimes

2025-07-09 03:04:10 -04:00 · 2010-06-02 10:46:53 -06:00 · 2010-06-02 10:46:53 -06:00 · 3a12b18dc3
commit 3a12b18dc3
parent f4bbf10ee3
2 changed files with 12 additions and 12 deletions
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -391,10 +391,14 @@ class NYTimes(BasicNewsRecipe):
        return ans

    def preprocess_html(self, soup):
-        # Skip ad pages before actual article
+        # Skip ad pages served before actual article
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
-            soup = self.index_to_soup(skip_tag.parent['href'])
+            self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url += '?pagewanted=all'
+            self.log.error("Skipping ad to article at '%s'" % url)
+            soup = self.index_to_soup(url)
        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -280,18 +280,14 @@ class NYTimes(BasicNewsRecipe):
        return ans

    def preprocess_html(self, soup):
-        '''
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is None:
-            return soup
-        content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
-        '''
-        # Skip ad pages before actual article
+        # Skip ad pages served before actual article
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
-            soup = self.index_to_soup(skip_tag.parent['href'])
+            self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url += '?pagewanted=all'
+            self.log.error("Skipping ad to article at '%s'" % url)
+            soup = self.index_to_soup(url)
        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):