#!/usr/bin/env python2

__license__ = 'GPL v3'
'''
magazynconsido.pl/
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image


class magazynconsido(BasicNewsRecipe):
    '''
    Calibre news recipe for the Magazyn Consido feed.

    Articles are read from the FeedBurner feed, grouped by their
    ``category`` element, and every section except ``Video`` is returned
    from :meth:`parse_index`.
    '''

    title = u'Magazyn Consido'
    __author__ = 'Artur Stachecki ,teepel '
    language = 'pl'
    description = u'Portal dla architektów i projektantów'
    masthead_url = 'http://qualitypixels.pl/wp-content/themes/airlock/advance/inc/timthumb.php?src=http://qualitypixels.pl/wp-content/uploads/2012/01/logotyp-magazynconsido-11.png&w=455&zc=1'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False

    # Keep only headings, paragraphs and the pager; the pager is needed by
    # append_page() and is stripped again there.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='p'),
        dict(attrs={'class': 'navigation'}),
    ]
    remove_tags = [dict(attrs={'style': 'font-size: x-small;'})]
    remove_tags_after = [dict(attrs={'class': 'navigation'})]

    extra_css = (
        ' img {max-width:30%; max-height:30%; display: block;'
        ' margin-left: auto; margin-right: auto;}'
        ' h1 {text-align: center;}'
    )

    def parse_index(self):  # (kk)
        '''
        Build the list of feeds from the FeedBurner XML.

        Returns a list of ``(section_title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``url``, ``date`` and
        ``description``. Sections appear in the order in which their first
        item occurs in the feed; the ``Video`` section is left out.
        '''
        soup = self.index_to_soup('http://feeds.feedburner.com/magazynconsido?format=xml')
        articles = {}
        # Separate list because a plain dict does not remember the order in
        # which sections were first seen.
        sections = []

        for item in soup.findAll('item'):
            section = self.tag_to_string(item.category)
            if section not in articles:
                sections.append(section)
                articles[section] = []
            articles[section].append({
                'title': self.tag_to_string(item.title),
                'url': self.tag_to_string(item.guid),
                'date': self.tag_to_string(item.pubDate),
                'description': self.tag_to_string(item.description),
            })

        # The 'Video' section is deliberately excluded from the output.
        return [
            (section, articles[section])
            for section in sections
            if section != 'Video'
        ]

    def append_page(self, soup, appendtag):
        '''
        Append the paragraphs of the next page (if any) and strip boilerplate.

        soup      -- parsed article page, searched for the pager.
        appendtag -- tag that receives the extra paragraphs and from which
                     boilerplate ``div`` elements are removed.

        Only a single continuation page is fetched; the pager of that page
        is not followed further.
        '''
        apage = soup.find('div', attrs={'class': 'wp-pagenavi'})
        if apage is not None:
            nexturl = soup.find('a', attrs={'class': 'nextpostslink'})
            # A pager without a "next" link (e.g. on the last page) must not
            # crash the download, so only follow the link when it exists.
            if nexturl is not None and nexturl.get('href'):
                soup2 = self.index_to_soup(nexturl['href'])
                for tag in soup2.findAll('p'):
                    appendtag.insert(len(appendtag.contents), tag)

        # NOTE(review): 'height: 35px;' looks like a style value rather than
        # a class name -- kept as in the original, confirm whether it was
        # meant to match on the style attribute instead.
        junk_classes = [
            'height: 35px;',
            'post-meta',
            'addthis_toolbox addthis_default_style addthis_',
            'post-meta-bottom',
            'block_recently_post',
            'fbcomments',
            'pin-it-button',
            'pages',
            'navigation',
        ]
        # Runs whether or not a continuation page was appended. Re-search
        # after every removal because the tree changes underneath us.
        junk = appendtag.find('div', attrs={'class': junk_classes})
        while junk is not None:
            junk.replaceWith('')
            junk = appendtag.find('div', attrs={'class': junk_classes})

    def preprocess_html(self, soup):  # (kk)
        '''Merge the continuation page into the body, then fix up images.'''
        self.append_page(soup, soup.body)
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first):
        '''
        Convert every image referenced by the page to grayscale in place.

        soup  -- parsed page whose ``img`` tags are processed.
        first -- unused here.

        Returns the same soup object. Raises RuntimeError('Out of memory')
        if the check after opening an image fails.
        '''
        # process all the images
        for tag in soup.findAll('img', src=True):
            # presumably src already points at the downloaded local copy at
            # this stage -- the image is opened from and saved back to it.
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            # NOTE(review): comparing an Image object with 0 is kept from the
            # original; confirm that Image supports this comparison.
            if img < 0:
                raise RuntimeError('Out of memory')
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup