# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment
import re


class Gazeta_Wyborcza(BasicNewsRecipe):
    """Calibre news recipe for the Polish daily Gazeta Wyborcza.

    Articles are gathered from the RSS feeds listed in ``feeds``.
    feedsportal.com redirect URLs are rewritten to direct ``INDEX`` URLs,
    multi-page articles are stitched into a single page, and the cover is
    scraped from a gallery page on wyborcza.pl.
    """

    title = u'Gazeta Wyborcza'
    __author__ = 'fenuks, Artur Stachecki'
    language = 'pl'
    description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
    category = 'newspaper'
    publication_type = 'newspaper'
    # encoding = 'iso-8859-2'
    masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
    # Prefix prepended to decoded feedsportal paths in print_version().
    INDEX = 'http://wyborcza.pl'
    remove_empty_feeds = True
    oldest_article = 3
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}

    # rules for gazeta.pl
    # Drops everything from the first "Czytaj więcej" to the end of the page.
    preprocess_regexps = [
        (re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: ''),
    ]
    keep_only_tags = [dict(id='gazeta_article')]
    remove_tags = [
        dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']),
        dict(attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']}),
    ]
    remove_tags_after = dict(id='gazeta_article_body')

    # rules for wyborcza.biz
    # NOTE(review): this pattern reached review with raw line breaks inside
    # the string literal, which is not valid Python. They are written here as
    # explicit "\n" escapes. The breaks may originally have been HTML markup
    # (e.g. <br> tags) lost in transit -- confirm against the upstream recipe.
    # The backslash before the dot is doubled so the literal stays a unicode
    # string on Python 2 and raises no invalid-escape warning on Python 3.
    preprocess_regexps.append((
        re.compile(u'(\n)?(\n)? Czytaj (także|też):.*?\\.?\n', re.DOTALL),
        lambda m: '',
    ))

    feeds = [
        (u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'),
        (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
        (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
        (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
        (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
        (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'),
        (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'),
        (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'),
        (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'),
        (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'),
        (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'),
        (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'),
        (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'),
        (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'),
        # The title below arrived split across two lines; it is rejoined to
        # follow the same "<day> - <name>" shape as the entry above.
        (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'),
        (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'),
        (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'),
        (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss'),
    ]

    # Ordered (token, replacement) rewrites applied to the tail of a
    # feedsportal URL. Order is significant: the blanket removal of 'A' must
    # run after '0C' is expanded and before the remaining '0x' tokens, exactly
    # as in the original chain of .replace() calls.
    _FEEDSPORTAL_REWRITES = (
        ('/l/', '/'),
        ('/ia1.htm', ''),
        ('/story01.htm', ''),
        ('0C', '/'),
        ('A', ''),
        ('0E', '-'),
        ('0H', ','),
        ('0I', '_'),
        ('0B', '.'),
    )

    def print_version(self, url):
        """Turn a feedsportal.com redirect URL into a direct article URL.

        URLs that do not contain ``feedsportal.com`` are returned unchanged.
        Otherwise the text after the last ``wyborcza0Bpl`` marker is taken,
        or after the last ``gazeta0Bpl`` marker when nothing precedes the
        first one. That text is decoded with ``_FEEDSPORTAL_REWRITES`` and
        prefixed with ``INDEX``.
        """
        if 'feedsportal.com' not in url:
            return url

        head, _sep, path = url.rpartition('wyborcza0Bpl')
        if not head:
            path = url.rpartition('gazeta0Bpl')[2]

        for token, replacement in self._FEEDSPORTAL_REWRITES:
            path = path.replace(token, replacement)
        return self.INDEX + path

    def preprocess_html(self, soup):
        """Reject or stitch an article page before conversion.

        Returns ``None`` when the page contains an element with class
        ``piano_btn_1`` (presumably a paywall prompt -- confirm). Otherwise
        returns ``soup``, after appending follow-up pages when the ``Str``
        element contains links.
        """
        pager = soup.find(id='Str')
        if soup.find(attrs={'class': 'piano_btn_1'}):
            return None
        if pager and pager.findAll('a'):
            self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        """Append the follow-up pages of a multi-page article to ``appendtag``.

        The last link inside the ``div#Str`` pager is followed repeatedly for
        as long as the newly fetched page's last pager link contains the text
        ``następne``. Each fetched page's ``artykul`` element is stripped of
        HTML comments and inserted at the end of ``appendtag``. The pager of
        the first page is removed once the loop ends.

        Returns ``1`` when the base URL or the pager cannot be determined,
        ``None`` otherwise.
        """
        pager = soup.find('div', attrs={'id': 'Str'})
        try:
            baseurl = soup.find(
                name='meta', attrs={'property': 'og:url'})['content']
        except (TypeError, KeyError):
            # No og:url meta tag, or it has no content attribute.
            return 1

        links = pager.findAll('a') if pager is not None else []
        if not links:
            return 1

        link = links[-1]
        while link is not None:
            href = link.get('href')
            if not href:
                break
            page = self.index_to_soup(baseurl + href)

            # Decide whether there is another page before consuming this one.
            next_pager = page.find('div', attrs={'id': 'Str'})
            next_links = next_pager.findAll('a') if next_pager is not None else []
            link = next_links[-1] if next_links else None
            # link.string is None for links with nested markup.
            if link is not None and u'następne' not in (link.string or u''):
                link = None

            pagetext = page.find(id='artykul')
            if pagetext is None:
                break
            for comment in pagetext.findAll(
                    text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

        pager.extract()

    def get_cover_url(self):
        """Scrape the cover image URL from the wyborcza.pl gallery page.

        The first image of the ``gallerycontent`` element is used, with
        ``P.jpg`` replaced by ``.jpg`` in its address. If the expected markup
        is missing, any previously set ``cover_url`` (or ``None``) is
        returned instead of raising.
        """
        soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
        gallery = soup.find(attrs={'class': 'gallerycontent'})
        try:
            src = gallery.ul.li.a.img['src']
        except (AttributeError, TypeError, KeyError):
            return getattr(self, 'cover_url', None)
        self.cover_url = src.replace('P.jpg', '.jpg')
        return self.cover_url

    def image_url_processor(self, baseurl, url):
        """Strip surrounding whitespace from image URLs that start with a space."""
        return url.strip() if url.startswith(' ') else url