# calibre/recipes/odkrywcy_pl.recipe

__license__ = 'GPL v3'
import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment
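
# Calibre news recipe for Odkrywcy.pl, a Polish popular-science site. The index
# is built by scraping the category listing pages on http://odkrywcy.pl, and
# multi-page articles are merged into a single document in append_page().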
class Odkrywcy(BasicNewsRecipe):
    title = u'Odkrywcy.pl'
    __author__ = 'fenuks'
    description = u''
    language = 'pl'
    extra_css = 'img {display: block;}'
    cover_url = ''
    INDEX = 'http://odkrywcy.pl'
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}
    keep_only_tags = [dict(attrs={'class': 'content'})]
    remove_tags = [
        dict(name='a', attrs={'href': ['#opOpinie', '#opinie']}),
        dict(attrs={'class': ['fr', 'clra', 'close', 'wpsocial-fbFanpageBox', 'tagi', 'test']}),
        dict(id=['rekSrd05', 'moreTopNews']),
        dict(name='img', attrs={'class': 'zr'}),
        dict(name='img', attrs={'alt': u'Następne'})
    ]
    remove_tags_after = dict(id='aTxt')
    feeds = [(u'', '')]
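
    # Collect article links from a single category listing page, skipping
    # entries older than ``oldest_article`` days. The publication date is read
    # from the 'dodano: YYYY-MM-DD' text inside each entry's <small> tag.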
    def find_articles(self, url):
        articles = []
        soup = self.index_to_soup(url)
        for i in soup.findAll(attrs={'class': 'katZj clra'}):
            tmp = i.find('small')
            datestring = re.search(
                r'dodano: (\d{4}-\d{2}-\d{2})', tmp.string).group(1)
            d = datetime.datetime.strptime(datestring, "%Y-%m-%d").date()
            if (datetime.datetime.now().date() - d).days > self.oldest_article:
                continue
            tmp = i.find('a')
            title = tmp.string
            url = self.INDEX + tmp['href']
            articles.append({'title': title,
                             'url': url,
                             'date': '',
                             'description': ''})
        return articles
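
    # Build the index by scraping each category's listing page directly
    # (parse_index takes precedence over the placeholder 'feeds' attribute
    # defined above).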
    def parse_index(self):
        feeds = []
        feeds.append((u'Człowiek', self.find_articles(
            'http://odkrywcy.pl/kat,111396,name,Czlowiek,kategoria.html')))
        feeds.append((u'Technologie', self.find_articles(
            'http://odkrywcy.pl/kat,111398,name,Technologie,kategoria.html')))
        feeds.append((u'Ekologia', self.find_articles(
            'http://odkrywcy.pl/kat,111400,name,Ekologia,kategoria.html')))
        feeds.append((u'Kosmos', self.find_articles(
            'http://odkrywcy.pl/kat,111402,name,Kosmos,kategoria.html')))
        feeds.append((u'Cywilizacja', self.find_articles(
            'http://odkrywcy.pl/kat,111404,name,Cywilizacja,kategoria.html')))
        feeds.append((u'Przyroda', self.find_articles(
            'http://odkrywcy.pl/kat,111406,name,Przyroda,kategoria.html')))
        feeds.append((u'Fizyka i chemia', self.find_articles(
            'http://odkrywcy.pl/kat,111408,name,Fizyka,kategoria.html')))
        feeds.append((u'Historia', self.find_articles(
            'http://odkrywcy.pl/kat,122994,name,Historia,kategoria.html')))
        feeds.append((u'Media', self.find_articles(
            'http://odkrywcy.pl/kat,116794,name,Media,media.html')))
        return feeds
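
    # Multi-page articles: follow the 'btnNext' pagination link, pull the
    # 'content' blocks from every continuation page and append them to the
    # first page so the whole article ends up in a single document.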
    def append_page(self, soup, appendtag):
        tag = soup.find('a', attrs={'class': 'btnNext'})
        urls = []
        while tag is not None:
            if tag['href'] in urls:
                # pagination looped back to a page we have already fetched
                break
            urls.append(tag['href'])
            soup2 = self.index_to_soup(self.INDEX + tag['href'])
            tag = soup2.find(name='a', attrs={'class': 'btnNext'})
            pagetext = soup2.findAll(attrs={'class': 'content'})
            for container in pagetext:
                # drop the repeated headline and any HTML comments from the
                # continuation page before it is merged in
                header = container.find(name='h1')
                if header:
                    header.extract()
                for comment in container.findAll(text=lambda text: isinstance(text, Comment)):
                    comment.extract()
            for container in pagetext:
                pos = len(appendtag.contents)
                appendtag.insert(pos, container)
        # remove gallery controls, 'Następne'/'Poprzednie' (next/previous)
        # pager images and other leftover widgets from the merged article
        for r in appendtag.findAll(attrs={'class': 'galStr'}):
            r.extract()
        for r in appendtag.findAll(attrs={'alt': u'Następne'}):
            r.extract()
        for r in appendtag.findAll(attrs={'alt': u'Poprzednie'}):
            r.extract()
        for r in appendtag.findAll(attrs={'class': 'clra'}):
            r.extract()
        for r in appendtag.findAll(attrs={'class': 'close'}):
            r.extract()
        for r in appendtag.findAll(attrs={'class': 'tagi'}):
            r.extract()
        for r in appendtag.findAll(attrs={'id': 'moreTopNews'}):
            r.extract()
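
    # Called by BasicNewsRecipe for every downloaded article page; the
    # remaining pages are stitched in here before further cleanup runs.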
    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup
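
# A quick way to exercise this recipe outside the calibre GUI is calibre's
# command line converter (recipe file name taken from the path above; the
# output name is arbitrary):
#
#   ebook-convert odkrywcy_pl.recipe odkrywcy.epub --test
#
# --test limits the run to at most two feeds with two articles each, which is
# enough to check that parse_index() and append_page() still match the site's
# markup.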