# calibre/recipes/doghousediaries.recipe

__license__   = 'GPL v3'
__copyright__ = '2010-2012, NiLuJe <niluje at ak-team.com>'

'''
Fetch DoghouseDiaries.
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DoghouseDiaries(BasicNewsRecipe):
    '''
    Recipe for the Doghouse Diaries webcomic.

    The list of comics is scraped from the <option> elements on the site's
    front page (the 'Quick Archive' drop-down) rather than from a feed, and
    each comic's image tooltip is turned into a visible paragraph.
    '''
    title = 'Doghouse Diaries'
    description = 'A webcomic.'
    __author__ = 'NiLuJe'
    language = 'en'

    use_embedded_content = False
    # Used here as a number of comics, not a number of days: the index page
    # gives no per-comic date, so parse_index() simply caps the list at this
    # many entries.
    oldest_article = 14

    cover_url = 'http://www.thedoghousediaries.com/logos/logo3.png'
    masthead_url = 'http://www.thedoghousediaries.com/logos/logo3.png'

    keep_only_tags = [
        # Any class containing 'comic-item'. (The previous pattern,
        # "comic-item*", was a glob written as a regex: the '*' quantified
        # the final 'm' instead of acting as a wildcard.)
        dict(name='img', attrs={'class': re.compile(r'comic-item')}),
        dict(name='h1'),
        dict(name='div', attrs={'class': 'entry'}),
        dict(name='p', id='alttext'),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': 'pin-it-btn-wrapper'}),
        dict(name='span'),
        dict(name='div', id='wp_fb_like_button'),
    ]
    remove_attributes = ['width', 'height']
    no_stylesheets = True

    # Turn the comic image's tooltip (title attribute) into a paragraph placed
    # right after the image.
    # NOTE: This runs before the remove_tags cleanup, so the pattern is tied
    # to the comics/ image URL to avoid touching the pinterest image pulled in
    # by the entry div. Every wildcard is limited to [^>]* so that a match can
    # never run past the end of the <img> tag into neighbouring markup.
    preprocess_regexps = [
        (re.compile(r'(<img[^>]*src="http://thedoghousediaries\.com/comics/[^>]*title=")([^"]+)("[^>]*>)'),
         # group(1) + group(3) rebuilds the tag with an emptied title;
         # group(2) is the tooltip text, re-emitted as the paragraph.
         lambda m: '%s%s<p id="alttext"><strong>%s</strong></p>' % (m.group(1), m.group(3), m.group(2)))
    ]

    def parse_index(self):
        '''
        Build the article list from the <option> elements of the front page.

        Returns a one-element list holding a ('Doghouse Diaries', articles)
        tuple, where each article is a dict with 'title', 'url',
        'description' and 'content' keys.
        '''
        index_url = 'http://www.thedoghousediaries.com/'
        soup = self.index_to_soup(index_url)

        articles = []
        # No usable feed or dated archive, so the options of the 'Quick
        # Archive' drop-down are used instead. One extra element is requested
        # because the drop-down's own placeholder entry (value '0') is
        # skipped below.
        for item in soup.findAll('option', limit=self.oldest_article + 1):
            url = item.get('value')
            # Skip the placeholder entry and any option without a target.
            if not url or url == '0':
                continue
            articles.append({
                # Keep the title as text: encoding it here would hand a byte
                # string to the rest of the pipeline.
                'title': self.tag_to_string(item),
                'url': url,
                'description': '',
                'content': '',
            })

        return [('Doghouse Diaries', articles)]