from calibre.web.feeds.news import BasicNewsRecipe


class TheGrid(BasicNewsRecipe):
    """Calibre recipe that downloads the latest issue of The Grid (Toronto).

    The recipe reads the site's front page to find the link to the latest
    issue, then collects the articles listed in the ``city``, ``life`` and
    ``culture`` sections of that issue page.
    """

    #: The title to use for the ebook
    title = u'The Grid'

    #: A couple of lines that describe the content this recipe downloads.
    #: This will be used primarily in a GUI that presents a list of recipes.
    description = (u'The Grid is a weekly city magazine and daily website providing a fresh, '
                   'accessible voice for Toronto.')

    #: The author of this recipe
    __author__ = u'Yusuf W'

    #: The language that the news is in. Must be an ISO-639 code either
    #: two or three characters long
    language = 'en_CA'

    #: Publication type
    #: Set to newspaper, magazine or blog
    publication_type = 'newspaper'

    #: Convenient flag to disable loading of stylesheets for websites
    #: that have overly complex stylesheets unsuitable for conversion
    #: to ebooks formats
    #: If True stylesheets are not downloaded and processed
    no_stylesheets = True

    #: Everything before / after the main content container is discarded.
    remove_tags_before = dict(name='div', id='content')
    remove_tags_after = dict(name='div', id='content')

    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    remove_tags = [
        dict(name='div', attrs={'class': 'right-content pull-right'}),
        dict(name='div', attrs={'class': 'right-content'}),
        dict(name='div', attrs={'class': 'ftr-line'}),
        dict(name='div', attrs={'class': 'pull-right'}),
        dict(name='div', id='comments'),
        dict(name='div', id='tags'),
    ]

    #: Keep only the specified tags and their children.
    #: (Disabled; remove_tags_before/remove_tags_after are used instead.)
    # keep_only_tags = [dict(name='div', id='content')]

    cover_margins = (0, 0, '#ffffff')

    #: Site root. NOTE: parse_index() rebinds this on the instance to the
    #: URL of the latest issue.
    INDEX = 'http://www.thegridto.com'

    def get_cover_url(self):
        """Return the cover image URL, or None when it cannot be located.

        The cover is taken from the first ``<img>`` inside the element whose
        class is ``article-block latest-issue`` on the page at ``self.INDEX``.
        """
        soup = self.index_to_soup(self.INDEX)
        issue_block = soup.find(attrs={'class': 'article-block latest-issue'})
        if issue_block is None:
            return None
        image = issue_block.find('img')
        if image is None:
            return None
        return image.get('src')

    def parse_index(self):
        """Build the list of feeds for the latest issue.

        Returns a list of ``(section_name, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``url``, ``description``
        and ``date``. Sections whose container is missing from the issue
        page are skipped instead of aborting the whole download.
        """
        # Get the latest issue: the third link inside the
        # 'full-content stuff-ftr' block is used as the issue link.
        soup = self.index_to_soup(self.INDEX)
        issue_link = soup.find(
            'div', attrs={'class': 'full-content stuff-ftr'}).findAll('a')[2]

        # Parse the index of the latest issue.
        # This rebinds INDEX on the instance, so any later read of
        # self.INDEX (e.g. in get_cover_url) sees the issue URL.
        self.INDEX = self.INDEX + issue_link['href']
        soup = self.index_to_soup(self.INDEX)

        feeds = []
        for section in ['city', 'life', 'culture']:
            section_class = 'left-content article-listing ' + section + ' pull-left'
            container = soup.find(attrs={'class': section_class})
            if container is None:
                # A missing section should not take down the other sections.
                self.log.warning('Section not found on issue page, skipping:', section)
                continue

            articles = []
            for link in container.findAll(attrs={'class': 'post-title'}):
                url = link.get('href')
                if not url:
                    # Without a target there is nothing to download.
                    continue
                articles.append({
                    'title': self.tag_to_string(link),
                    'url': url,
                    'description': '',
                    'date': '',
                })
            feeds.append((section, articles))
        return feeds