Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Commit 9884e55a3b: Update xkcd and add doghousediaries by NiLuJe

New file: recipes/doghousediaries.recipe (52 lines)

@@ -0,0 +1,52 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2012, NiLuJe <niluje at ak-team.com>'

'''
Fetch DoghouseDiaries.
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class DoghouseDiaries(BasicNewsRecipe):
    title = 'Doghouse Diaries'
    description = 'A webcomic.'
    __author__ = 'NiLuJe'
    language = 'en'

    use_embedded_content = False
    # 14 comics per fetch (not really days, but we can't easily get the date of an individual comic short of parsing each one)
    oldest_article = 14

    cover_url = 'http://www.thedoghousediaries.com/logos/logo3.png'
    masthead_url = 'http://www.thedoghousediaries.com/logos/logo3.png'

    keep_only_tags = [dict(name='img', attrs={'class': re.compile("comic-item*")}), dict(name='h1'), dict(name='div', attrs={'class': 'entry'}), dict(name='p', id='alttext')]
    remove_tags = [dict(name='div', attrs={'class': 'pin-it-btn-wrapper'}), dict(name='span'), dict(name='div', id='wp_fb_like_button')]
    remove_attributes = ['width', 'height']
    no_stylesheets = True

    # Turn the image bubblehelp (title attribute) into a paragraph (NOTE: this runs before the remove_tags cleanup, so make sure only the comic-item img is matched, not the Pinterest one pulled in by the entry div)
    preprocess_regexps = [
        (re.compile(r'(<img.*src="http://thedoghousediaries.com/comics/.*title=")([^"]+)(".*>)'),
         lambda m: '%s%s<p id="alttext"><strong>%s</strong></p>' % (m.group(1), m.group(3), m.group(2)))
    ]

    def parse_index(self):
        INDEX = 'http://www.thedoghousediaries.com/'

        soup = self.index_to_soup(INDEX)
        articles = []
        # The feed is unreliable and there is no real archive, so walk the 'Quick Archive' drop-down instead; it carries no dates, so stop after oldest_article comics.
        for item in soup.findAll('option', {}, True, None, self.oldest_article + 1):
            # Skip the quick archive entry itself
            if item['value'] != '0':
                articles.append({
                    'title': self.tag_to_string(item).encode('UTF-8'),
                    'url': item['value'],
                    'description': '',
                    'content': '',
                })

        return [('Doghouse Diaries', articles)]
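For context, a minimal sketch of what the preprocess_regexps entry in the recipe above does to a fetched comic page. The img tag below is hypothetical (the URL, class and attribute values are made up to fit the pattern, not taken from the site): the title text is lifted out of the tag into a <p id="alttext"> paragraph, which keep_only_tags then retains.

import re

# Pattern and replacement copied from the recipe above
pattern = re.compile(r'(<img.*src="http://thedoghousediaries.com/comics/.*title=")([^"]+)(".*>)')
repl = lambda m: '%s%s<p id="alttext"><strong>%s</strong></p>' % (m.group(1), m.group(3), m.group(2))

# Hypothetical markup shaped like a comic page; not real site content
html = '<img class="comic-item" src="http://thedoghousediaries.com/comics/some-comic.png" title="hover text" alt="comic">'
print(pattern.sub(repl, html))
# <img class="comic-item" src="http://thedoghousediaries.com/comics/some-comic.png" title="" alt="comic"><p id="alttext"><strong>hover text</strong></p>

To try the recipe locally, the usual calibre workflow should apply, e.g. ebook-convert doghousediaries.recipe .epub --test from a shell with calibre installed.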
Modified file: recipes/xkcd.recipe

@@ -2,6 +2,8 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 Changelog:
+2012-04-06
+Fixed empty articles, added masthead img (NiLuJe)
 2011-09-24
 Changed cover (drMerry)
 '''
@@ -13,7 +15,8 @@ import time, re
 from calibre.web.feeds.news import BasicNewsRecipe

 class XkcdCom(BasicNewsRecipe):
-    cover_url = 'http://imgs.xkcd.com/s/9be30a7.png'
+    cover_url = 'http://imgs.xkcd.com/static/terrible_small_logo.png'
+    masthead_url = 'http://imgs.xkcd.com/static/terrible_small_logo.png'
     title = 'xkcd'
     description = 'A webcomic of romance and math humor.'
     __author__ = 'Martin Pitt updated by DrMerry.'
@@ -21,13 +24,14 @@ class XkcdCom(BasicNewsRecipe):

     use_embedded_content = False
     oldest_article = 60
-    keep_only_tags = [dict(id='middleContainer')]
-    remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')]
+    #keep_only_tags = [dict(id='middleContainer')]
+    #remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')]
+    keep_only_tags = [dict(id='comic')]
     no_stylesheets = True
-    # turn image bubblehelp into a paragraph
+    # turn image bubblehelp into a paragraph, and put alt in a heading
     preprocess_regexps = [
-        (re.compile(r'(<img.*title=")([^"]+)(".*>)'),
-         lambda m: '%s%s<p>%s</p>' % (m.group(1), m.group(3), m.group(2)))
+        (re.compile(r'(<img.*title=")([^"]+)(".alt=")([^"]+)(".*>)'),
+         lambda m: '<h1>%s</h1>%s%s%s<p>%s</p>' % (m.group(4), m.group(1), m.group(3), m.group(5), m.group(2)))
     ]

     def parse_index(self):
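Below is a minimal sketch of what the updated xkcd regexp does, using a hypothetical img tag in the title-then-alt attribute order the new pattern assumes (the URL and text are made up): the alt text becomes an <h1> above the image and the title text becomes a trailing paragraph. Together with the switch from id='middleContainer' to id='comic', this appears to be what the "Fixed empty articles" changelog entry refers to.

import re

# Updated pattern and replacement from the diff above
pattern = re.compile(r'(<img.*title=")([^"]+)(".alt=")([^"]+)(".*>)')
repl = lambda m: '<h1>%s</h1>%s%s%s<p>%s</p>' % (m.group(4), m.group(1), m.group(3), m.group(5), m.group(2))

# Hypothetical markup; not a real comic page
html = '<img src="http://imgs.xkcd.com/comics/example.png" title="mouseover joke" alt="Example">'
print(pattern.sub(repl, html))
# <h1>Example</h1><img src="http://imgs.xkcd.com/comics/example.png" title="" alt=""><p>mouseover joke</p>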