From a7cab66b6f851d27e23d1799eecc85c6769aeca7 Mon Sep 17 00:00:00 2001 From: NiLuJe Date: Fri, 14 Sep 2012 17:47:38 +0200 Subject: [PATCH 1/2] Tweak XKCD recipe: Add a masthead image, put the strip title in an h1 tag --- recipes/xkcd.recipe | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/recipes/xkcd.recipe b/recipes/xkcd.recipe index 42dceda65b..2aa704992e 100644 --- a/recipes/xkcd.recipe +++ b/recipes/xkcd.recipe @@ -2,6 +2,8 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' Changelog: +2012-04-06 +Fixed empty articles, added masthead img (NiLuJe) 2011-09-24 Changed cover (drMerry) ''' @@ -13,7 +15,8 @@ import time, re from calibre.web.feeds.news import BasicNewsRecipe class XkcdCom(BasicNewsRecipe): - cover_url = 'http://imgs.xkcd.com/s/9be30a7.png' + cover_url = 'http://imgs.xkcd.com/static/terrible_small_logo.png' + masthead_url = 'http://imgs.xkcd.com/static/terrible_small_logo.png' title = 'xkcd' description = 'A webcomic of romance and math humor.' __author__ = 'Martin Pitt updated by DrMerry.' @@ -21,13 +24,14 @@ class XkcdCom(BasicNewsRecipe): use_embedded_content = False oldest_article = 60 - keep_only_tags = [dict(id='middleContainer')] - remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')] + #keep_only_tags = [dict(id='middleContainer')] + #remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')] + keep_only_tags = [dict(id='comic')] no_stylesheets = True - # turn image bubblehelp into a paragraph + # turn image bubblehelp into a paragraph, and put alt in a heading preprocess_regexps = [ - (re.compile(r'()'), - lambda m: '%s%s

%s

' % (m.group(1), m.group(3), m.group(2))) + (re.compile(r'()'), + lambda m: '

%s

%s%s%s

%s

' % (m.group(4), m.group(1), m.group(3), m.group(5), m.group(2))) ] def parse_index(self): From ad0123a2b03c4e7cf375dd45cb6f66f93615ac93 Mon Sep 17 00:00:00 2001 From: NiLuJe Date: Fri, 14 Sep 2012 17:49:03 +0200 Subject: [PATCH 2/2] Add a recipe for DogHouse Diaries, an online comic --- recipes/doghousediaries.recipe | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 recipes/doghousediaries.recipe diff --git a/recipes/doghousediaries.recipe b/recipes/doghousediaries.recipe new file mode 100644 index 0000000000..e52db094b1 --- /dev/null +++ b/recipes/doghousediaries.recipe @@ -0,0 +1,52 @@ +__license__ = 'GPL v3' +__copyright__ = '2010-2012, NiLuJe ' + +''' +Fetch DoghouseDiaries. +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class DoghouseDiaries(BasicNewsRecipe): + title = 'Doghouse Diaries' + description = 'A webcomic.' + __author__ = 'NiLuJe' + language = 'en' + + use_embedded_content = False + # 14 comics per fetch (not really days... but we can't easily get the date of individual comics, short of parsing each one...) + oldest_article = 14 + + cover_url = 'http://www.thedoghousediaries.com/logos/logo3.png' + masthead_url = 'http://www.thedoghousediaries.com/logos/logo3.png' + + keep_only_tags = [dict(name='img', attrs={'class': re.compile("comic-item*")}), dict(name='h1'), dict(name='div', attrs={'class':'entry'}), dict(name='p', id='alttext')] + remove_tags = [dict(name='div', attrs={'class':'pin-it-btn-wrapper'}), dict(name='span'), dict(name='div', id='wp_fb_like_button')] + remove_attributes = ['width', 'height'] + no_stylesheets = True + + # Turn image bubblehelp into a paragraph (NOTE: We run before the remove_tags cleanup, so we need to make sure we only parse the comic-item img, not the pinterest one pulled by the entry div) + preprocess_regexps = [ + (re.compile(r'()'), + lambda m: '%s%s

%s

' % (m.group(1), m.group(3), m.group(2))) + ] + + def parse_index(self): + INDEX = 'http://www.thedoghousediaries.com/' + + soup = self.index_to_soup(INDEX) + articles = [] + # Since the feed sucks, and there's no real archive, we use the 'Quick Archive' thingie, but we can't get the date from here, so stop after 14 comics... + for item in soup.findAll('option', {}, True, None, self.oldest_article+1): + # Skip the quick archive itself + if ( item['value'] != '0' ): + articles.append({ + 'title': self.tag_to_string(item).encode('UTF-8'), + 'url': item['value'], + 'description': '', + 'content': '', + }) + + return [('Doghouse Diaries', articles)] +