__license__ = 'GPL v3'
__copyright__ = '2010-2012, NiLuJe '

'''
Fetch DoghouseDiaries.
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class DoghouseDiaries(BasicNewsRecipe):
    """Calibre recipe that collects the most recent Doghouse Diaries comics.

    The article list is built by ``parse_index`` from the ``<option>``
    elements found on the site's front page; each page is then reduced to the
    comic image, its heading, the entry text and the image's hover text.
    """

    title = 'Doghouse Diaries'
    description = 'A webcomic.'
    __author__ = 'NiLuJe'
    language = 'en'

    use_embedded_content = False
    # 14 comics per fetch (not really days... but we can't easily get the date
    # of individual comics, short of parsing each one...)
    oldest_article = 14

    cover_url = 'http://www.thedoghousediaries.com/logos/logo3.png'
    masthead_url = 'http://www.thedoghousediaries.com/logos/logo3.png'

    keep_only_tags = [
        # The class is matched as a regex, so a plain 'comic-item' already
        # matches every class that contains it (the former trailing '*' only
        # quantified the final 'm' and was not a wildcard).
        dict(name='img', attrs={'class': re.compile(r'comic-item')}),
        dict(name='h1'),
        dict(name='div', attrs={'class': 'entry'}),
        # Paragraph produced by preprocess_regexps below.
        dict(name='p', id='alttext'),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': 'pin-it-btn-wrapper'}),
        dict(name='span'),
        dict(name='div', id='wp_fb_like_button'),
    ]
    remove_attributes = ['width', 'height']
    no_stylesheets = True

    # Turn image bubblehelp into a paragraph (NOTE: We run before the
    # remove_tags cleanup, so we need to make sure we only parse the
    # comic-item img, not the pinterest one pulled by the entry div).
    #
    # NOTE(review): the markup inside this pattern and its replacement was
    # missing from the copy of the file this was rebuilt from (the pattern
    # read r'()' while the callback used groups 1-3). The three groups below
    # are reconstructed from the callback's shape and the 'alttext' entry in
    # keep_only_tags -- confirm against the live page markup:
    #   group 1: the comic-item <img ...> tag up to and including title="
    #   group 2: the title (hover) text
    #   group 3: the closing quote and the remainder of the tag
    # Assumes double-quoted attributes.
    preprocess_regexps = [
        (
            re.compile(
                r'(<img\b(?=[^>]*\sclass="comic-item)[^>]*?\stitle=")'
                r'([^"]*)'
                r'("[^>]*>)'
            ),
            # Re-emit the tag with an emptied title, then the hover text as
            # its own paragraph right after the image.
            lambda m: '%s%s<p id="alttext">%s</p>' % (
                m.group(1), m.group(3), m.group(2)),
        ),
    ]

    def parse_index(self):
        """Build the article list from the front page's <option> elements.

        Returns:
            A one-element list holding a ``('Doghouse Diaries', articles)``
            tuple, where ``articles`` is a list of dicts with the keys
            ``title``, ``url``, ``description`` and ``content``.
        """
        INDEX = 'http://www.thedoghousediaries.com/'

        soup = self.index_to_soup(INDEX)
        articles = []
        # Since the feed is of little use, and there's no real archive, we use
        # the 'Quick Archive' drop-down, but we can't get the date from here,
        # so stop after oldest_article comics. One extra <option> is fetched
        # because the drop-down's own placeholder entry is skipped below.
        for item in soup.findAll('option', limit=self.oldest_article + 1):
            url = item.get('value')
            # Skip the quick archive placeholder itself, as well as any
            # <option> that carries no value attribute at all.
            if url is None or url == '0':
                continue
            articles.append({
                # Keep the title as text; encoding it here would hand bytes
                # to the rest of the pipeline under Python 3.
                'title': self.tag_to_string(item),
                'url': url,
                'description': '',
                'content': '',
            })

        return [('Doghouse Diaries', articles)]