from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class NYTimes(BasicNewsRecipe):
    """Recipe that downloads the National Post "today's paper" edition.

    The class is called ``NYTimes`` although every attribute and URL below
    refers to the National Post; the name is left unchanged so that anything
    referring to it keeps working.
    """

    title = 'National Post'
    __author__ = 'Krittika Goyal'
    description = 'Canadian national newspaper'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = False
    language = 'en_CA'

    no_stylesheets = True
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class': 'story-tools'}),
    ]

    # Single source of truth for the URLs used by every method below.
    _SITE_URL = 'http://www.nationalpost.com'
    _TODAYS_PAPER_URL = _SITE_URL + '/todays-paper/'

    def _absolute_url(self, href):
        """Turn a story link found on the site into an absolute URL.

        Two relative forms are recognised: links starting with ``story``
        (relative to the today's-paper directory) and links starting with
        ``/todays-paper/story.html`` (relative to the site root). Any other
        value is returned unchanged.
        """
        if href.startswith('story'):
            return self._TODAYS_PAPER_URL + href
        if href.startswith('/todays-paper/story.html'):
            # href already begins with '/', so join on the bare host to
            # avoid producing 'http://www.nationalpost.com//todays-paper/...'.
            return self._SITE_URL + href
        return href

    def nejm_get_index(self):
        """Fetch and parse the today's-paper index page."""
        return self.index_to_soup(self._TODAYS_PAPER_URL + 'index.html')

    def parse_index(self):
        """Build the list of sections and their articles from the index page.

        Inside the element with id ``LegoText4`` every ``h4`` starts a new
        section and every following ``h3`` that contains a link whose href
        includes ``story`` is an article of that section.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of dicts with the keys ``title``, ``url``,
        ``description`` and ``date``. The list is empty when the index
        container is not present on the page.
        """
        feeds = []
        soup = self.nejm_get_index()
        container = soup.find(id='LegoText4')
        if container is None:
            # Without this guard the loop below would fail with an
            # AttributeError on None.
            self.log('\tIndex container (id="LegoText4") not found')
            return feeds

        current_section = None
        current_articles = []
        for tag in container.findAll(True):
            if tag.name == 'h4':
                # Section found: flush the previous one if it has articles.
                if current_articles and current_section:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(tag)
                current_articles = []
                self.log('\tFound section:', current_section)
            elif tag.name == 'h3' and current_section is not None:
                # Article found (headlines before the first section are
                # ignored).
                title = self.tag_to_string(tag)
                link = tag.find('a', href=lambda href: href and 'story' in href)
                if link is None:
                    continue
                url = link.get('href', False)
                if not url or not title:
                    continue
                url = self._absolute_url(url)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                current_articles.append({
                    'title': title,
                    'url': url,
                    'description': '',
                    'date': '',
                })

        # Flush the last section.
        if current_articles and current_section:
            feeds.append((current_section, current_articles))
        return feeds

    def _second_page_content(self, soup):
        """Return the ``div.story-content`` of the article's next page.

        The next page is located through the first link inside
        ``p.pagenav``. Returns None when there is no such paragraph, no
        link in it, or the linked page has no ``div.story-content``.
        """
        pagenav = soup.find('p', 'pagenav')
        if not pagenav:
            return None
        link = pagenav.find('a', href=True)
        if not link:
            return None
        page2_soup = self.index_to_soup(self._absolute_url(link['href']))
        if not page2_soup:
            return None
        return page2_soup.find('div', 'story-content')

    def preprocess_html(self, soup):
        """Reduce an article page to its story, merging in a second page.

        The story is the ``div.triline`` element. If the page links to a
        continuation, that page is downloaded and its ``div.story-content``
        is appended after the story. The result is placed in the body of a
        fresh, minimal document, which is returned.

        If the page has no ``div.triline`` the soup is returned unchanged.
        """
        story = soup.find(name='div', attrs={'class': 'triline'})
        if story is None:
            # Nothing to extract; inserting None into the new body below
            # would fail, so hand the page back as it is.
            return soup

        page2_content = self._second_page_content(soup)
        if page2_content:
            # NOTE(review): the markup of this literal was unreadable in the
            # reviewed copy (it contained a raw line break, which is not
            # valid inside a single-quoted string). An empty <div> is
            # assumed - confirm against the upstream recipe.
            full_story = BeautifulSoup('<div></div>')
            full_story.insert(0, story)
            full_story.insert(1, page2_content)
            story = full_story

        # NOTE(review): only the text 't' of this literal survived in the
        # reviewed copy; without a <body> element the find() below returns
        # None. The skeleton is reconstructed with 't' as the title -
        # confirm against the upstream recipe.
        page = BeautifulSoup(
            '<html><head><title>t</title></head><body></body></html>')
        body = page.find(name='body')
        body.insert(0, story)
        return page