# -*- coding: utf-8 -*- from __future__ import unicode_literals __license__ = 'GPL v3' __copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \ 2013-2014, Tomasz Długosz, tomek3d@gmail.com' from calibre.web.feeds.news import BasicNewsRecipe import re, time from lxml import html class GN(BasicNewsRecipe): __author__ = 'Piotr Kontek, Tomasz Długosz' title = u'Gość Niedzielny - pełny numer' publisher = 'Wydawnictwo Kurii Metropolitalnej w Katowicach' description = 'Ogólnopolski tygodnik katolicki - pełny numer sprzed 4 tygodni' encoding = 'utf-8' no_stylesheets = True language = 'pl' remove_javascript = True masthead_url = 'http://m.gosc.pl/static/themes/czerwony_gosc-mobile/logo.png' def find_last_issue(self): raw = self.index_to_soup('http://m.gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True) doc = html.fromstring(raw) page = doc.xpath('//div[@class="search-result"]/div[1]/div[1]/a/@href') if time.strftime("%w") in ['3','4']: return page[5] else: return page[4] def parse_index(self): self.last_issue = self.find_last_issue() soup = self.index_to_soup('http://gosc.pl' + self.last_issue) self.cover_url = 'http://www.gosc.pl' + soup.find('div',attrs={'class':'fl-w100 release-wp'}).findAll('a')[-4].contents[0]['src'] feeds = [] enlisted = [] # editorial: a = soup.find('div',attrs={'class':'release-wp-b'}) art = a.find('a') articles = [ {'title' : self.tag_to_string(art), 'url' : 'http://www.gosc.pl' + art['href'], 'description' : self.tag_to_string(a.find('p',attrs={'class':'b lead'})) }] feeds.append((u'Na dobry początek',articles)) enlisted.append(articles[0].get('url')) # columns: for addr in soup.findAll('a',attrs={'href':re.compile('kategoria')}): if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb': main_block = self.index_to_soup('http://www.gosc.pl' + addr['href']) articles = list(self.find_articles(main_block)) if len(articles) > 0: section = addr.string feeds.append((section, articles)) enlisted.extend(list(article.get('url') for article in articles)) # not assigned content: page = 1 not_assigned = [] while True: soup = self.index_to_soup('http://gosc.pl' + self.last_issue.replace('przeglad','wszystko') + '/' + str(page)) articles = list(self.find_articles(soup)) not_assigned.extend([x for x in articles if x.get('url') not in enlisted]) page+=1 pages = soup.find('span', attrs={'class':'pgr_nrs'}) if str(page) not in [self.tag_to_string(x)[1] for x in pages.findAll('a')]: break feeds.insert(1,(u'Nieprzypisane', not_assigned)) return feeds def find_articles(self, main_block): for a in main_block.findAll('div', attrs={'class':['prev_doc2', 'sr-document']}): art = a.find('a') yield { 'title' : self.tag_to_string(art), 'url' : 'http://www.gosc.pl' + art['href'], 'date' : self.tag_to_string(a.find('p', attrs={'class':'sr-date'})), 'description' : self.tag_to_string(a.find('p', attrs={'class':'sr-lead'})) } def append_page(self, soup, appendtag): chpage= appendtag.find(attrs={'class':'pgr_nrs'}) if chpage: for page in chpage.findAll('a'): soup2 = self.index_to_soup('http://gosc.pl' + page['href']) pagetext = soup2.find(attrs={'class':'intextAd'}) pos = len(appendtag.contents) appendtag.insert(pos, pagetext) def preprocess_html(self, soup): self.append_page(soup, soup.body) r = soup.find(attrs={'class':'lightbox'}) if r: r.contents[0]['src'] = r['href'] return soup def postprocess_html(self, soup, first_fetch): for r in soup.findAll(attrs={'class':'pgr'}): r.extract() for r in soup.findAll(attrs={'class':['di_dr', 'doc_image']}): del r['style'] for r in soup.findAll(attrs={'class':'cm-i-a'}): r.replaceWith('
' + r.prettify() + '
') return soup keep_only_tags = [ dict(name='div', attrs={'class':'cf txt'}) ] remove_tags = [ dict(name='p', attrs={'class':['r tr', 'l l-2', 'wykop', 'tags']}), dict(name='div', attrs={'class':['doc_actions', 'cf', 'fr1_cl']}), dict(name='div', attrs={'id':'vote'}), dict(name='link'), dict(name='a', attrs={'class':'img_enlarge'}) ] extra_css = ''' h1 {font-size:150%} p.limiter {font-size:150%; font-weight: bold} span.cm-i-a {text-transform:uppercase;font-size:50%} span.cm-i-p {font-style:italic; font-size:70%;text-align:right} '''