__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal '

'''
Changelog:
2015-07-29 Fixed issue with 'a' tags outside of list of articles (cjp)
2012-04-06 Fixed empty articles, added masthead img (NiLuJe)
2011-09-24 Changed cover (drMerry)
'''

'''
Fetch xkcd.
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe


def _alt_to_heading(m):
    """Build the replacement markup for one matched comic ``<img>`` tag.

    The match must carry five groups: (1) the tag text up to and including
    ``title="``, (2) the title (hover/"bubblehelp") text, (3) the text from
    the closing quote of the title up to and including ``alt="``, (4) the alt
    text, and (5) the remainder of the tag.

    Returns the alt text as a heading, followed by the image tag with its
    title and alt values emptied, followed by the title text as a paragraph.
    """
    # NOTE(review): the exact wrapper tags were lost in the copy this file was
    # restored from; <h1>/<p> follow the comment on preprocess_regexps below.
    # Confirm against the upstream recipe.
    return '<h1>%s</h1>%s%s%s<p>%s</p>' % (
        m.group(4), m.group(1), m.group(3), m.group(5), m.group(2))


class XkcdCom(BasicNewsRecipe):
    """Recipe that lists the comics linked from the xkcd archive page."""

    cover_url = 'http://imgs.xkcd.com/static/terrible_small_logo.png'
    masthead_url = 'http://imgs.xkcd.com/static/terrible_small_logo.png'
    title = 'xkcd'
    description = 'A webcomic of romance and math humor.'
    __author__ = 'Martin Pitt updated by DrMerry.'
    language = 'en'

    use_embedded_content = False
    oldest_article = 60
    keep_only_tags = [dict(id='comic')]
    no_stylesheets = True

    # turn image bubblehelp into a paragraph, and put alt in a heading
    #
    # The pattern must define five groups because the replacement reads
    # groups 1-5; a pattern with fewer groups raises IndexError on the first
    # substitution.
    preprocess_regexps = [
        (re.compile(r'(<img.*title=")([^"]+)(".*alt=")([^"]+)(".*>)'),
         _alt_to_heading),
    ]

    def parse_index(self):
        """Return a single 'xkcd' feed built from the archive page.

        Every ``<a>`` carrying a ``title`` attribute inside the
        ``middleContainer`` div becomes one article; the ``title`` attribute
        is parsed as a ``%Y-%m-%d`` date and the link text is used as the
        article title.

        Returns:
            A one-element list ``[('xkcd', articles)]`` where each article is
            a dict with the keys ``date``, ``timestamp``, ``url``, ``title``,
            ``description`` and ``content``.

        Raises:
            ValueError: if the archive page has no ``middleContainer`` div.
        """
        INDEX = 'http://xkcd.com/archive/'

        soup = self.index_to_soup(INDEX)
        container = soup.find('div', attrs={'id': 'middleContainer'})
        if container is None:
            raise ValueError(
                'xkcd archive page has no div with id "middleContainer"')

        articles = []
        for item in container.findAll('a', title=True):
            try:
                published = time.strptime(item['title'], '%Y-%m-%d')
            except ValueError:
                # Not a comic entry (its title is not a date); skipping it
                # keeps one stray link from aborting the whole fetch.
                continue
            articles.append({
                'date': item['title'],
                # +1 second is carried over unchanged from the original code.
                'timestamp': time.mktime(published) + 1,
                'url': 'http://xkcd.com' + item['href'],
                'title': self.tag_to_string(item),
                'description': '',
                'content': '',
            })

        return [('xkcd', articles)]