Fix NYTimes Top Stories recipe

This commit is contained in:
Kovid Goyal 2010-05-28 12:50:18 -06:00
parent 6363aaa5b9
commit 25c4013b04

View File

@@ -9,14 +9,13 @@ import re
import time import time
from calibre import entity_to_unicode from calibre import entity_to_unicode
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
Comment, BeautifulStoneSoup
class NYTimes(BasicNewsRecipe): class NYTimes(BasicNewsRecipe):
title = 'New York Times Top Stories' title = 'New York Times Top Stories'
__author__ = 'GRiker' __author__ = 'GRiker'
language = 'en' language = _('English')
description = 'Top Stories from the New York Times' description = 'Top Stories from the New York Times'
# List of sections typically included in Top Stories. Use a keyword from the # List of sections typically included in Top Stories. Use a keyword from the
@@ -257,6 +256,7 @@ class NYTimes(BasicNewsRecipe):
# Fetch the outer table # Fetch the outer table
table = soup.find('table') table = soup.find('table')
previousTable = table previousTable = table
contentTable = None
# Find the deepest table containing the stories # Find the deepest table containing the stories
while True : while True :
@@ -388,6 +388,10 @@ class NYTimes(BasicNewsRecipe):
return ans return ans
def preprocess_html(self, soup): def preprocess_html(self, soup):
# Skip ad pages before actual article
skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None:
soup = self.index_to_soup(skip_tag.parent['href'])
return self.strip_anchors(soup) return self.strip_anchors(soup)
def postprocess_html(self,soup, True): def postprocess_html(self,soup, True):