Fix NYTimes Top Stories recipe

2025-07-09 03:04:10 -04:00 · 2010-05-28 12:50:18 -06:00 · 2010-05-28 12:50:18 -06:00 · 25c4013b04
commit 25c4013b04
parent 6363aaa5b9
1 changed file with 7 additions and 3 deletions
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -9,14 +9,13 @@ import re
 import time
 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
-Comment, BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment

 class NYTimes(BasicNewsRecipe):

    title       = 'New York Times Top Stories'
    __author__  = 'GRiker'
-    language = 'en'
+    language = _('English')
    description = 'Top Stories from the New York Times'

    # List of sections typically included in Top Stories.  Use a keyword from the
@@ -257,6 +256,7 @@ class NYTimes(BasicNewsRecipe):
        # Fetch the outer table
        table = soup.find('table')
        previousTable = table
+        contentTable = None

        # Find the deepest table containing the stories
        while True :
@@ -388,6 +388,10 @@ class NYTimes(BasicNewsRecipe):
        return ans

    def preprocess_html(self, soup):
+        # Skip ad pages before actual article
+        skip_tag = soup.find(True, {'name':'skip'})
+        if skip_tag is not None:
+            soup = self.index_to_soup(skip_tag.parent['href'])
        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):