diff --git a/recipes/gosc_full.recipe b/recipes/gosc_full.recipe new file mode 100644 index 0000000000..d1761a4d25 --- /dev/null +++ b/recipes/gosc_full.recipe @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \ + 2013-2014, Tomasz Długosz, tomek3d@gmail.com' + +from calibre.web.feeds.news import BasicNewsRecipe +import re +from lxml import html + +class GN(BasicNewsRecipe): + + __author__ = 'Piotr Kontek, Tomasz Długosz' + title = u'Gość Niedzielny - pełny numer' + description = 'Ogólnopolski tygodnik katolicki - pełny numer sprzed 4 tygodni' + encoding = 'utf-8' + no_stylesheets = True + language = 'pl' + remove_javascript = True + + def find_last_issue(self): + raw = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True) + doc = html.fromstring(raw) + page = doc.xpath('//div[@class="c"]//div[@class="search-result"]/div[1]/div[2]/h1//a/@href') + + return page[4] + + def parse_index(self): + soup = self.index_to_soup('http://gosc.pl' + self.find_last_issue()) + #self.masthead = 'http://www.gosc.pl' + soup.find('div',attrs={'class':'fl-w100 release-wp'}).findAll('a')[-4].contents[0]['src'] + feeds = [] + #wstepniak + a = soup.find('div',attrs={'class':'release-wp-b'}).find('a') + articles = [ + {'title' : self.tag_to_string(a), + 'url' : 'http://www.gosc.pl' + a['href'] + }] + feeds.append((u'Wstępniak',articles)) + #kategorie + for addr in soup.findAll('a',attrs={'href':re.compile('kategoria')}): + if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb': + main_block = self.index_to_soup('http://www.gosc.pl' + addr['href']) + articles = list(self.find_articles(main_block)) + if len(articles) > 0: + section = addr.string + feeds.append((section, articles)) + return feeds + + def find_articles(self, main_block): + for a in main_block.findAll('div', attrs={'class':['prev_doc2', 'sr-document']}): + art = a.find('a') + yield { + 'title' : self.tag_to_string(art), + 'url' : 'http://www.gosc.pl' + art['href'] + } + + def append_page(self, soup, appendtag): + chpage= appendtag.find(attrs={'class':'pgr_nrs'}) + if chpage: + for page in chpage.findAll('a'): + soup2 = self.index_to_soup('http://gosc.pl' + page['href']) + pagetext = soup2.find(attrs={'class':'intextAd'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + r = soup.find(attrs={'class':'lightbox'}) + if r: + r.contents[0]['src'] = r['href'] + return soup + + def postprocess_html(self, soup, first_fetch): + for r in soup.findAll(attrs={'class':'pgr'}): + r.extract() + return soup + + keep_only_tags = [ + dict(name='div', attrs={'class':'cf txt'}) + ] + + remove_tags = [ + dict(name='p', attrs={'class':['r tr', 'l l-2', 'wykop']}), + dict(name='div', attrs={'class':['doc_actions', 'cf', 'fr1_cl']}), + dict(name='div', attrs={'id':'vote'}), + dict(name='a', attrs={'class':'img_enlarge'}) + ] + + extra_css = ''' + h1 {font-size:150%} + p.limiter {font-size:150%; font-weight: bold} + span.cm-i-a {text-transform:uppercase;} + span.cm-i-p {font-style:italic; font-size:70%} + ''' diff --git a/recipes/icons/gosc_full.png b/recipes/icons/gosc_full.png new file mode 100644 index 0000000000..9f7c9ed350 Binary files /dev/null and b/recipes/icons/gosc_full.png differ