Fix #1224 (no_stylesheets didn't get rid of all original CSS references) and improve the recipe for the New York Times

This commit is contained in:
Kovid Goyal 2008-11-03 10:00:01 -08:00
parent a3fa08360c
commit f8ee3e0c4e
2 changed files with 10 additions and 10 deletions

View File

@@ -497,6 +497,10 @@ class BasicNewsRecipe(object, LoggingInterface):
def _postprocess_html(self, soup, first_fetch, job_info):
if self.no_stylesheets:
for link in list(soup.findAll('link', type=re.compile('css')))+list(soup.findAll('style')):
link.extract()
head = soup.find('head')
if not head:
head = soup.find('body')
@@ -513,9 +517,6 @@ class BasicNewsRecipe(object, LoggingInterface):
url, __appname__, center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(0, elem)
if self.no_stylesheets:
for link in list(soup.findAll('link', type=re.compile('css'))):
link.extract()
if self.remove_javascript:
for script in list(soup.findAll('script')):
script.extract()

View File

@@ -17,12 +17,11 @@ class NYTimes(BasicNewsRecipe):
description = 'Daily news from the New York Times'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = True
remove_tags_before = dict(name='h1')
remove_tags_after = dict(id='footer')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool']}),
dict(id=['footer', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
dict(name=['script', 'noscript'])]
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
dict(name=['script', 'noscript', 'style'])]
encoding = 'cp1252'
no_stylesheets = True
extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
@@ -59,7 +58,7 @@ class NYTimes(BasicNewsRecipe):
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=print'
url += '?pagewanted=all'
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')