calibre/recipes/gazeta_wyborcza.recipe
2019-04-13 09:17:31 +05:30

126 lines
5.2 KiB
Plaintext

# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment
import re
class Gazeta_Wyborcza(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the Gazeta Wyborcza RSS feeds.

    Article pages are reduced to the ``gazeta_article`` element, multi-page
    articles are stitched together by following the ``div#Str`` pager, and
    feed links that go through the feedsportal.com proxy are decoded back
    into direct article URLs under ``INDEX``.
    """

    title = u'Gazeta Wyborcza'
    __author__ = 'fenuks, Artur Stachecki'
    language = 'pl'
    description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
    category = 'newspaper'
    publication_type = 'newspaper'
    # encoding = 'iso-8859-2'
    masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
    # Prefix put in front of every decoded feedsportal article path.
    INDEX = 'http://wyborcza.pl'
    remove_empty_feeds = True
    oldest_article = 3
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}

    preprocess_regexps = [
        # rules for gazeta.pl: truncate the page at the "read more" marker
        (re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>'),
        # rules for wyborcza.biz: drop inline "read also: <link>" teasers
        (re.compile(
            u'(<br>)?(<br>)? Czytaj (także|też):.*?</a>\\.?<br>', re.DOTALL),
         lambda m: ''),
    ]
    # rules for gazeta.pl
    keep_only_tags = [dict(id='gazeta_article')]
    remove_tags = [
        dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']),
        dict(attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']}),
    ]
    remove_tags_after = dict(id='gazeta_article_body')

    # NOTE(review): most feeds below go through rss.feedsportal.com; confirm
    # that the proxy is still reachable before relying on them.
    feeds = [
        (u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'),
        (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
        (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
        (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
        (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
        (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'),
        (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'),
        (u'Gazeta \u015awi\u0105teczna',
         u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'),
        (u'Du\u017cy Format',
         u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'),
        (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'),
        (u'M\u0119ska Muzyka',
         u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'),
        (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'),
        (u'Solidarni z Tybetem',
         u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'),
        (u'W pon. - \u017bakowski',
         u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'),
        (u'We wt. - Kolenda-Zalewska',
         u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'),
        (u'\u015aroda w \u015brod\u0119',
         u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'),
        (u'W pi\u0105tek - Olejnik',
         u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'),
        (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss'),
    ]

    # Host markers (in feedsportal-escaped form) that precede the article
    # path inside a proxied link; tried in this order.
    _FEEDSPORTAL_HOSTS = ('wyborcza0Bpl', 'gazeta0Bpl')
    # feedsportal escapes are "0" + one letter; only the codes this recipe
    # has always handled are decoded, anything else is left untouched.
    _FEEDSPORTAL_CODES = {
        'A': '0', 'B': '.', 'C': '/', 'E': '-', 'H': ',', 'I': '_'}
    _FEEDSPORTAL_ESCAPE = re.compile(r'0([ABCEHI])')

    def print_version(self, url):
        """Turn a feedsportal.com proxy link into a direct article URL.

        URLs that do not contain ``feedsportal.com``, or that contain none of
        the known host markers, are returned unchanged. Otherwise the part
        after the last host marker is stripped of the proxy suffixes, decoded
        and appended to ``INDEX``.
        """
        if 'feedsportal.com' not in url:
            return url

        for marker in self._FEEDSPORTAL_HOSTS:
            _head, found, tail = url.rpartition(marker)
            if found:
                break
        else:
            # No known host marker: decoding would only produce a bogus
            # INDEX-prefixed URL, so hand the link back as it came.
            return url

        for proxy_part, replacement in (
                ('/l/', '/'), ('/ia1.htm', ''), ('/story01.htm', '')):
            tail = tail.replace(proxy_part, replacement)

        # Decode every escape in one left-to-right pass.  Chained
        # str.replace() calls could re-interpret the output of an earlier
        # replacement, and stripping every 'A' also removed genuine letters.
        codes = self._FEEDSPORTAL_CODES
        path = self._FEEDSPORTAL_ESCAPE.sub(
            lambda match: codes[match.group(1)], tail)
        return self.INDEX + path

    def preprocess_html(self, soup):
        """Reject flagged pages and merge the pages of multi-page articles.

        Returns ``None`` when the page contains an element with class
        ``piano_btn_1`` (presumably the paywall button -- verify against the
        site); otherwise returns *soup*, after appending follow-up pages
        when the ``Str`` pager element contains links.
        """
        if soup.find(attrs={'class': 'piano_btn_1'}):
            return None
        pager = soup.find(id='Str')
        if pager and pager.findAll('a'):
            self.append_page(soup, soup.body)
        return soup

    @staticmethod
    def _last_pager_link(container):
        """Return the last ``<a>`` inside ``div#Str`` of *container*, or None."""
        pager = container.find('div', attrs={'id': 'Str'})
        if pager is None:
            return None
        anchors = pager.findAll('a')
        return anchors[-1] if anchors else None

    def append_page(self, soup, appendtag):
        """Append the bodies of all follow-up pages of an article to *appendtag*.

        The base URL is read from the ``og:url`` meta tag of *soup*; each
        follow-up page is fetched from ``baseurl + href`` of the last pager
        link, and paging continues while that link's text contains
        ``następne``. The pager of the first page is removed at the end.

        Returns ``1`` when *soup* has no usable ``og:url`` meta tag (kept for
        backward compatibility), ``None`` otherwise.
        """
        pager = soup.find('div', attrs={'id': 'Str'})
        meta = soup.find(name='meta', attrs={'property': 'og:url'})
        try:
            baseurl = meta['content']
        except (TypeError, KeyError):
            # TypeError: no such meta tag; KeyError: tag without "content".
            return 1

        visited = set()
        link = self._last_pager_link(soup)
        while link is not None:
            href = link.get('href')
            if not href:
                break
            page_url = baseurl + href
            if page_url in visited:
                # A pager pointing back at an already fetched page would
                # otherwise keep this loop running forever.
                break
            visited.add(page_url)

            next_soup = self.index_to_soup(page_url)
            link = self._last_pager_link(next_soup)
            # Only keep paging while the last pager link is a "next" link;
            # link.string is None when the anchor has nested markup.
            if link is not None and u'następne' not in (link.string or u''):
                link = None

            pagetext = next_soup.find(id='artykul')
            if pagetext is None:
                continue
            for comment in pagetext.findAll(
                    text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

        if pager is not None:
            pager.extract()

    def get_cover_url(self):
        """Return the URL of the current cover image.

        The URL is scraped from the first image of the ``gallerycontent``
        element of the cover gallery page, with ``P.jpg`` replaced by
        ``.jpg``, and is also stored in ``self.cover_url``. If the expected
        markup is missing, the existing ``cover_url`` value (``None`` when
        the attribute is not set) is returned instead.
        """
        soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
        cover = soup.find(attrs={'class': 'gallerycontent'})
        try:
            src = cover.ul.li.a.img['src']
        except (AttributeError, TypeError, KeyError):
            # Gallery markup changed or is missing: keep whatever cover_url
            # is already set rather than failing.
            return getattr(self, 'cover_url', None)
        self.cover_url = src.replace('P.jpg', '.jpg')
        return self.cover_url

    def image_url_processor(self, baseurl, url):
        """Strip surrounding whitespace from image URLs that start with a space."""
        return url.strip() if url.startswith(' ') else url