Update xkcd and add doghousediaries by NiLuJe

This commit is contained in:
Kovid Goyal 2012-09-14 21:50:02 +05:30
commit 9884e55a3b
2 changed files with 62 additions and 6 deletions

View File

@ -0,0 +1,52 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2012, NiLuJe <niluje at ak-team.com>'
'''
Fetch DoghouseDiaries.
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class DoghouseDiaries(BasicNewsRecipe):
title = 'Doghouse Diaries'
description = 'A webcomic.'
__author__ = 'NiLuJe'
language = 'en'
use_embedded_content = False
# 14 comics per fetch (not really days... but we can't easily get the date of individual comics, short of parsing each one...)
oldest_article = 14
cover_url = 'http://www.thedoghousediaries.com/logos/logo3.png'
masthead_url = 'http://www.thedoghousediaries.com/logos/logo3.png'
keep_only_tags = [dict(name='img', attrs={'class': re.compile("comic-item*")}), dict(name='h1'), dict(name='div', attrs={'class':'entry'}), dict(name='p', id='alttext')]
remove_tags = [dict(name='div', attrs={'class':'pin-it-btn-wrapper'}), dict(name='span'), dict(name='div', id='wp_fb_like_button')]
remove_attributes = ['width', 'height']
no_stylesheets = True
# Turn image bubblehelp into a paragraph (NOTE: We run before the remove_tags cleanup, so we need to make sure we only parse the comic-item img, not the pinterest one pulled by the entry div)
preprocess_regexps = [
(re.compile(r'(<img.*src="http://thedoghousediaries.com/comics/.*title=")([^"]+)(".*>)'),
lambda m: '%s%s<p id="alttext"><strong>%s</strong></p>' % (m.group(1), m.group(3), m.group(2)))
]
def parse_index(self):
INDEX = 'http://www.thedoghousediaries.com/'
soup = self.index_to_soup(INDEX)
articles = []
# Since the feed sucks, and there's no real archive, we use the 'Quick Archive' thingie, but we can't get the date from here, so stop after 14 comics...
for item in soup.findAll('option', {}, True, None, self.oldest_article+1):
# Skip the quick archive itself
if ( item['value'] != '0' ):
articles.append({
'title': self.tag_to_string(item).encode('UTF-8'),
'url': item['value'],
'description': '',
'content': '',
})
return [('Doghouse Diaries', articles)]

View File

@ -2,6 +2,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Changelog:
2012-04-06
Fixed empty articles, added masthead img (NiLuJe)
2011-09-24
Changed cover (drMerry)
'''
@ -13,7 +15,8 @@ import time, re
from calibre.web.feeds.news import BasicNewsRecipe
class XkcdCom(BasicNewsRecipe):
cover_url = 'http://imgs.xkcd.com/s/9be30a7.png'
cover_url = 'http://imgs.xkcd.com/static/terrible_small_logo.png'
masthead_url = 'http://imgs.xkcd.com/static/terrible_small_logo.png'
title = 'xkcd'
description = 'A webcomic of romance and math humor.'
__author__ = 'Martin Pitt updated by DrMerry.'
@ -21,13 +24,14 @@ class XkcdCom(BasicNewsRecipe):
use_embedded_content = False
oldest_article = 60
keep_only_tags = [dict(id='middleContainer')]
remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')]
#keep_only_tags = [dict(id='middleContainer')]
#remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')]
keep_only_tags = [dict(id='comic')]
no_stylesheets = True
# turn image bubblehelp into a paragraph
# turn image bubblehelp into a paragraph, and put alt in a heading
preprocess_regexps = [
(re.compile(r'(<img.*title=")([^"]+)(".*>)'),
lambda m: '%s%s<p>%s</p>' % (m.group(1), m.group(3), m.group(2)))
(re.compile(r'(<img.*title=")([^"]+)(".alt=")([^"]+)(".*>)'),
lambda m: '<h1>%s</h1>%s%s%s<p>%s</p>' % (m.group(4), m.group(1), m.group(3), m.group(5), m.group(2)))
]
def parse_index(self):