mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix NYTimes Top Stories recipe
This commit is contained in:
parent
6363aaa5b9
commit
25c4013b04
@ -9,14 +9,13 @@ import re
|
|||||||
import time
|
import time
|
||||||
from calibre import entity_to_unicode
|
from calibre import entity_to_unicode
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
|
||||||
Comment, BeautifulStoneSoup
|
|
||||||
|
|
||||||
class NYTimes(BasicNewsRecipe):
|
class NYTimes(BasicNewsRecipe):
|
||||||
|
|
||||||
title = 'New York Times Top Stories'
|
title = 'New York Times Top Stories'
|
||||||
__author__ = 'GRiker'
|
__author__ = 'GRiker'
|
||||||
language = 'en'
|
language = _('English')
|
||||||
description = 'Top Stories from the New York Times'
|
description = 'Top Stories from the New York Times'
|
||||||
|
|
||||||
# List of sections typically included in Top Stories. Use a keyword from the
|
# List of sections typically included in Top Stories. Use a keyword from the
|
||||||
@ -257,6 +256,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
# Fetch the outer table
|
# Fetch the outer table
|
||||||
table = soup.find('table')
|
table = soup.find('table')
|
||||||
previousTable = table
|
previousTable = table
|
||||||
|
contentTable = None
|
||||||
|
|
||||||
# Find the deepest table containing the stories
|
# Find the deepest table containing the stories
|
||||||
while True :
|
while True :
|
||||||
@ -388,6 +388,10 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
# Skip ad pages before actual article
|
||||||
|
skip_tag = soup.find(True, {'name':'skip'})
|
||||||
|
if skip_tag is not None:
|
||||||
|
soup = self.index_to_soup(skip_tag.parent['href'])
|
||||||
return self.strip_anchors(soup)
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user