From 9ccae653feef7c610c70814b3dea017de54c1cd3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 12 Oct 2011 04:44:05 +0530 Subject: [PATCH] Fix #872447 (Updated geek and poke recipe) --- recipes/geek_poke.recipe | 58 ++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/recipes/geek_poke.recipe b/recipes/geek_poke.recipe index 8fa9e7ca29..7f5117b586 100644 --- a/recipes/geek_poke.recipe +++ b/recipes/geek_poke.recipe @@ -1,35 +1,71 @@ -#!/usr/bin/python - from calibre.web.feeds.news import BasicNewsRecipe import re +from calibre.utils.magick import Image class AdvancedUserRecipe1307556816(BasicNewsRecipe): title = u'Geek and Poke' __author__ = u'DrMerry' description = u'Geek and Poke Cartoons' + publisher = u'Oliver Widder' + author = u'Oliver Widder, DrMerry (calibre-code), calibre' oldest_article = 31 max_articles_per_feed = 100 language = u'en' simultaneous_downloads = 5 #delay = 1 - timefmt = ' [%A, %d %B, %Y]' + timefmt = ' [%a, %d %B, %Y]' summary_length = -1 no_stylesheets = True + category = 'News.IT, Cartoon, Humor, Geek' + use_embedded_content = False cover_url = 'http://geekandpoke.typepad.com/aboutcoders.jpeg' remove_javascript = True remove_empty_feeds = True publication_type = 'blog' + conversion_options = { + 'comments' : '' + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'author' : author + } - preprocess_regexps = [ (re.compile(r'(

 

||]*>Tweet|]*>|)', re.DOTALL|re.IGNORECASE),lambda match: ''), - (re.compile(r'( | )', re.DOTALL|re.IGNORECASE),lambda match: ' '), - (re.compile(r'()+', re.DOTALL|re.IGNORECASE),lambda match: '
') - ] + remove_tags_before = dict(name='p', attrs={'class':'content-nav'}) + remove_tags_after = dict(name='div', attrs={'class':'entry-content'}) + remove_tags = [dict(name='div', attrs={'class':'entry-footer'}), + dict(name='div', attrs={'id':'alpha'}), + dict(name='div', attrs={'id':'gamma'}), + dict(name='iframe'), + dict(name='p', attrs={'class':'content-nav'})] - extra_css = 'body, h3, p, h2, h1, div, span{margin:0px} h2.date-header {font-size: 0.7em; color:#eee;} h3.entry-header{font-size: 1.0em} div.entry-body{font-size: 0.9em}' + filter_regexps = [(r'feedburner\.com'), + (r'pixel.quantserve\.com'), + (r'googlesyndication\.com'), + (r'yimg\.com'), + (r'scorecardresearch\.com')] + preprocess_regexps = [(re.compile(r'(

( |\s)*

|]*>Tweet|]*>|)', re.DOTALL|re.IGNORECASE),lambda match: ''), + (re.compile(r'( |\s\s)+\s*', re.DOTALL|re.IGNORECASE),lambda match: ' '), + (re.compile(r']*>([^<]*)[^>]*(]*>)', re.DOTALL|re.IGNORECASE), lambda match: match.group(2) + '
' + match.group(1) + '
'), + (re.compile(r'(]*>)]>((?!', re.DOTALL|re.IGNORECASE),lambda match: match.group(1) + match.group(2) + ''), + (re.compile(r'(]*alt="([^"]*)"[^>]*>)', re.DOTALL|re.IGNORECASE),lambda match: match.group(1) + '
' + match.group(2) + ''), + (re.compile(r'()+', re.DOTALL|re.IGNORECASE),lambda match: '
'), + (re.compile(r'', re.DOTALL), lambda m: '') + ] - remove_tags_before = dict(name='h2', attrs={'class':'date-header'}) - remove_tags_after = dict(name='div', attrs={'class':'entry-body'}) + extra_css = 'body, h3, p, #MERRYdate, h1, div, span{margin:0px; padding:0px} h3.entry-header{font-size: 0.8em} div.entry-body{font-size: 0.7em} #MERRYdate {font-size: 0.5em}' + def postprocess_html(self, soup, first): + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + iurl = tag['src'] + img = Image() + img.open(iurl) + width, height = img.size + #print 'img is: ', iurl, 'width is: ', width, 'height is: ', height + img.trim(0) + img.save(iurl) + width, height = img.size + #print 'img is: ', iurl, 'width is: ', width, 'height is: ', height + return soup - feeds = [(u'Geek and Poke', u'http://feeds.feedburner.com/GeekAndPoke?format=xml')] + feeds = ['http://feeds.feedburner.com/GeekAndPoke?format=xml']