#!/usr/bin/env python2
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \
                2013-2016, Tomasz Długosz, tomek3d@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
import re
from lxml import html


class GN(BasicNewsRecipe):
    __author__ = 'Piotr Kontek, Tomasz Długosz'
    title = u'Gość Niedzielny'
    description = 'Ogólnopolski tygodnik katolicki - fragmenty artykułów z aktualnego numeru'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True

    def find_last_issue(self):
        # Locate the most recent issue on the issue-archive page.
        raw = self.index_to_soup(
            'http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True)
        doc = html.fromstring(raw)
        page = doc.xpath(
            '//div[@class="search-result release-result"]/div[1]/div[2]/h1//a/@href')
        return page[0]

    def parse_index(self):
        soup = self.index_to_soup('http://gosc.pl' + self.find_last_issue())
        feeds = []
        # Editorial (wstępniak)
        a = soup.find('div', attrs={'class': 'release-wp-b'}).find('a')
        articles = [
            {'title': self.tag_to_string(a),
             'url': 'http://www.gosc.pl' + a['href']
             }]
        feeds.append((u'Wstępniak', articles))
        # Categories: follow every category link except the
        # "all articles in this category »" one.
        for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}):
            if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb':
                main_block = self.index_to_soup(
                    'http://www.gosc.pl' + addr['href'])
                articles = list(self.find_articles(main_block))
                if articles:
                    section = addr.string
                    feeds.append((section, articles))
        return feeds

    def find_articles(self, main_block):
        # Yield one {title, url} dict per article teaser on a category page.
        for a in main_block.findAll('div', attrs={'class': ['prev_doc_n1 prev_doc_img21']}):
            art = a.find('a')
            yield {
                'title': self.tag_to_string(art),
                'url': 'http://www.gosc.pl' + art['href']
            }

    def append_page(self, soup, appendtag):
        # Fetch the remaining pages of a multi-page article and append
        # their text to the first page.
        chpage = appendtag.find(attrs={'class': 'pgr_nrs'})
        if chpage:
            for page in chpage.findAll('a'):
                soup2 = self.index_to_soup('http://gosc.pl' + page['href'])
                pagetext = soup2.find(attrs={'class': 'intextAd'})
                if pagetext is not None:
                    pos = len(appendtag.contents)
                    appendtag.insert(pos, pagetext)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

    def postprocess_html(self, soup, first_fetch):
        # Drop the pager controls once all pages have been merged.
        for r in soup.findAll(attrs={'class': 'pgr'}):
            r.extract()
        return soup

    keep_only_tags = [
        dict(name='div', attrs={'class': 'cf txt'})
    ]

    remove_tags = [
        dict(name='p', attrs={'class': ['r tr', 'l l-2', 'wykop', 'l l-2 doc-source']}),
        dict(name='div', attrs={'class': ['doc_actions', 'cf', 'fr1_cl', 'txt__social-icons', 'txt__tags']}),
        dict(name='div', attrs={'id': 'vote'}),
        dict(name='a', attrs={'class': 'img_enlarge'})
    ]

    extra_css = '''
        h1 {font-size:150%}
        p.limiter {font-size:150%; font-weight: bold}
        span.cm-i-a {text-transform:uppercase;}
        span.cm-i-p {font-style:italic; font-size:70%}
    '''
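
# A quick local test run (a sketch of the standard calibre workflow; it
# assumes this file is saved as gosc_niedzielny.recipe — the filename is
# illustrative). The --test switch makes ebook-convert fetch only a couple
# of articles per feed, and -vv prints verbose progress while the recipe
# runs:
#
#   ebook-convert gosc_niedzielny.recipe output.epub --test -vv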