Fix National Post

2025-07-09 03:04:10 -04:00 · 2012-05-03 16:17:27 +05:30 · 2012-05-03 16:17:27 +05:30 · 61aa60ab07
commit 61aa60ab07
parent df1195e9ff
1 changed file with 5 additions and 27 deletions
--- a/recipes/national_post.recipe
+++ b/recipes/national_post.recipe
@@ -1,5 +1,4 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 class NYTimes(BasicNewsRecipe):
@@ -11,22 +10,8 @@ class NYTimes(BasicNewsRecipe):
    needs_subscription = False
    no_stylesheets = True
-    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
+    auto_cleanup = True
-    remove_tags_after  = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
+    auto_cleanup_keep = '//*[@class="npStoryPhoto npTxtPlain"]'
    remove_tags = [
       dict(name='iframe'),
       dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
       #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
       #dict(name='form', attrs={'onsubmit':''}),
       dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
    ]
   # def preprocess_html(self, soup):
        # table = soup.find('table')
        # if table is not None:
            # table.extract()
        # return soup
    #TO GET ARTICLE TOC
@@ -53,14 +38,14 @@ class NYTimes(BasicNewsRecipe):
                if current_section is not None and x.name == 'h5':
                    # Article found
                    title = self.tag_to_string(x)
-                    a = x.find('a', href=lambda x: x and 'story' in x)
+                    a = x.find('a', href=True)
                    if a is None:
                        continue
                    url = a.get('href', False)
                    if not url or not title:
                        continue
                    #if url.startswith('story'):
-                    url = 'http://www.nationalpost.com/todays-paper/'+url
+                    #url = 'http://www.nationalpost.com/todays-paper/'+url
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    current_articles.append({'title': title, 'url':url,
@@ -70,11 +55,4 @@ class NYTimes(BasicNewsRecipe):
                feeds.append((current_section, current_articles))
            return feeds
-    def preprocess_html(self, soup):
+
        story = soup.find(name='div', attrs={'id':'npContentMain'})
        ##td = heading.findParent(name='td')
        ##td.extract()
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup