Fix #1224 (no_stylesheets did not remove all original CSS references) and improve the nytimes recipe

2025-07-09 03:04:10 -04:00 · 2008-11-03 10:00:01 -08:00 · 2008-11-03 10:00:01 -08:00 · f8ee3e0c4e
commit f8ee3e0c4e
parent a3fa08360c
2 changed files with 10 additions and 10 deletions
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -497,6 +497,10 @@ class BasicNewsRecipe(object, LoggingInterface):
                
            
    def _postprocess_html(self, soup, first_fetch, job_info):
+        if self.no_stylesheets:
+            for link in list(soup.findAll('link', type=re.compile('css')))+list(soup.findAll('style')):
+                link.extract()
+        
        head = soup.find('head')
        if not head:
            head = soup.find('body')
@@ -513,9 +517,6 @@ class BasicNewsRecipe(object, LoggingInterface):
                                             url, __appname__, center=self.center_navbar)
                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                body.insert(0, elem)
-        if self.no_stylesheets:
-            for link in list(soup.findAll('link', type=re.compile('css'))):
-                link.extract()
        if self.remove_javascript:
            for script in list(soup.findAll('script')):
                script.extract()
--- a/src/calibre/web/feeds/recipes/nytimes.py
+++ b/src/calibre/web/feeds/recipes/nytimes.py
@@ -17,12 +17,11 @@ class NYTimes(BasicNewsRecipe):
    description = 'Daily news from the New York Times'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = True
-    
-    remove_tags_before = dict(name='h1')
-    remove_tags_after  = dict(id='footer')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool']}), 
-                   dict(id=['footer', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), 
-                   dict(name=['script', 'noscript'])]
+    remove_tags_before = dict(id='article')
+    remove_tags_after  = dict(id='article')
+    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), 
+                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), 
+                   dict(name=['script', 'noscript', 'style'])]
    encoding = 'cp1252'
    no_stylesheets = True
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
@@ -59,7 +58,7 @@ class NYTimes(BasicNewsRecipe):
                if not a:
                    continue
                url = re.sub(r'\?.*', '', a['href'])
-                url += '?pagewanted=print'
+                url += '?pagewanted=all'
                title = self.tag_to_string(a, use_alt=True).strip()
                description = ''
                pubdate = strftime('%a, %d %b')