new Polish news sources
parent 1e5ce66ca3
commit b08854e60a
recipes/cdrinfo_pl.recipe (new file)
@@ -0,0 +1,65 @@
__license__ = 'GPL v3'

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class cdrinfo(BasicNewsRecipe):
    title = u'CDRinfo.pl'
    __author__ = 'fenuks'
    description = (u'Serwis poświęcony archiwizacji danych. Testy i recenzje nagrywarek. '
                   u'Programy do nagrywania płyt. Dyski twarde, dyski SSD i serwery sieciowe NAS. '
                   u'Rankingi dysków twardych, najszybsze dyski twarde, newsy, artykuły, testy, '
                   u'recenzje, porady, oprogramowanie. Zestawienie nagrywarek, najnowsze biosy '
                   u'do nagrywarek, programy dla dysków twardych.')
    category = 'it, hardware'
    language = 'pl'
    cover_url = 'http://www.cdrinfo.pl/gfx/graph3/top.jpg'
    use_embedded_content = False
    oldest_article = 777  # effectively no age cut-off
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style']
    # Strip the netiquette notice the site appends under each article.
    preprocess_regexps = [(re.compile(u'<p[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com</a>\.</p>', re.DOTALL), lambda match: '')]
    ignore_duplicate_articles = {'title', 'url'}

    # The hidden 'ref' input is kept because append_page() reads the
    # article's base path from it.
    keep_only_tags = [dict(name='input', attrs={'name': 'ref'}), dict(id='text')]
    remove_tags = [dict(attrs={'class': ['navigation', 'sociable']}), dict(name='hr'), dict(id='respond')]
    remove_tags_after = dict(id='artnawigacja')

    feeds = [
        (u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'),
        (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'),
        (u'Konsole', 'http://konsole.cdrinfo.pl/rss/rss_konsole_news.xml'),
        (u'Pliki', 'http://www.cdrinfo.pl/rss/rss_pliki.xml'),
    ]

    def preprocess_html(self, soup):
        if soup.find(id='artnawigacja'):
            self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        baseurl = 'http://cdrinfo.pl' + soup.find(name='input', attrs={'name': 'ref'})['value'] + '/'
        # Avoid a double slash when the 'ref' value already ends with one.
        if baseurl[-2] == '/':
            baseurl = baseurl[:-1]
        tag = soup.find(id='artnawigacja')
        div = tag.find('div', attrs={'align': 'right'})
        while div:
            soup2 = None
            # Retry the fetch up to five times before giving up.
            for counter in range(5):
                try:
                    soup2 = self.index_to_soup(baseurl + div.a['href'])
                    break
                except Exception:
                    continue
            if soup2 is None:
                break
            tag2 = soup2.find(id='artnawigacja')
            div = tag2.find('div', attrs={'align': 'right'})
            pagetext = soup2.find(attrs={'class': 'art'})
            # Drop HTML comments and rating widgets from the appended page.
            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()
            for r in soup2.findAll(attrs={'class': 'star-rating'}):
                r.extract()
            for r in soup2.findAll(attrs={'class': 'star-rating2'}):
                r.extract()
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        tag.extract()
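Note: the preprocess_regexps rule above strips the netiquette notice that cdrinfo.pl appends under each article; BasicNewsRecipe runs each (regex, callback) pair over the raw page source before parsing. A minimal standalone sketch of the substitution (the HTML fragment below is made up for illustration):

import re

pattern = re.compile(u'<p[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com</a>\.</p>', re.DOTALL)
# Hypothetical page fragment: article text followed by the notice.
html = (u'<div id="text">Treść artykułu</div>'
        u'<p>Uprzejmie prosimy o przestrzeganie netykiety. '
        u'<a href="http://www.gravatar.com">www.gravatar.com</a>.</p>')
print(pattern.sub('', html))
# -> <div id="text">Treść artykułu</div>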
recipes/gazeta_pl_bydgoszcz.recipe (new file)
@@ -0,0 +1,88 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class gw_bydgoszcz(BasicNewsRecipe):
    title = u'Gazeta Wyborcza Bydgoszcz'
    __author__ = 'fenuks'
    language = 'pl'
    description = 'Wiadomości z Bydgoszczy na portalu Gazeta.pl.'
    category = 'newspaper'
    publication_type = 'newspaper'
    masthead_url = 'http://bi.gazeta.pl/im/3/4089/m4089863.gif'
    INDEX = 'http://bydgoszcz.gazeta.pl'
    cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif'
    remove_empty_feeds = True
    oldest_article = 3
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}

    # Rules for gazeta.pl
    # Truncate the article at the 'Czytaj więcej' (read more) block.
    preprocess_regexps = [(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
    keep_only_tags = [dict(id='gazeta_article')]
    remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']),
                   dict(attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]
    remove_tags_after = dict(id='gazeta_article_body')

    feeds = [(u'Wiadomości', u'http://rss.feedsportal.com/c/32739/f/530239/index.rss')]

    def print_version(self, url):
        if 'feedsportal.com' in url:
            # Map the feedsportal-encoded URL back to a gazeta.pl address.
            s = url.rpartition('gazeta0Bpl')
            u = s[2]
            if not s[0]:
                u = url.rpartition('wyborcza0Bpl')[2]
            u = u.replace('/l/', '/')
            u = u.replace('/ia1.htm', '')
            u = u.replace('0Dbo0F1', '')
            u = u.replace('/story01.htm', '')
            u = u.replace('0C', '/')
            u = u.replace('A', '')
            u = u.replace('0E', '-')
            u = u.replace('0H', ',')
            u = u.replace('0I', '_')
            u = u.replace('0B', '.')
            return self.INDEX + u
        else:
            return url

    def preprocess_html(self, soup):
        tag = soup.find(id='Str')
        if soup.find(attrs={'class': 'piano_btn_1'}):
            # Paywalled article: skip it entirely.
            return None
        elif tag and tag.findAll('a'):
            self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        tag = soup.find('div', attrs={'id': 'Str'})
        try:
            baseurl = soup.find(name='meta', attrs={'property': 'og:url'})['content']
        except (TypeError, KeyError):
            return
        link = tag.findAll('a')[-1]
        while link:
            soup2 = self.index_to_soup(baseurl + link['href'])
            link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1]
            # Stop once there is no 'następne' (next) link on the page.
            if not (link.string and u'następne' in link.string):
                link = ''
            pagetext = soup2.find(id='artykul')
            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        tag.extract()

    def image_url_processor(self, baseurl, url):
        # Some image URLs arrive with a stray leading space.
        if url.startswith(' '):
            return url.strip()
        return url
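Note: the replace chain in print_version reverses feedsportal's URL escaping: 0B → '.', 0C → '/', 0E → '-', 0H → ',', 0I → '_', and deleting every 'A' turns 0A back into 0. A standalone sketch of the core decoding, with an invented feed URL and only the substitutions it needs:

def decode_feedsportal(url, index='http://bydgoszcz.gazeta.pl'):
    # Trimmed-down version of print_version above; the input URL is hypothetical.
    u = url.rpartition('gazeta0Bpl')[2]
    u = u.replace('/story01.htm', '')
    u = u.replace('0C', '/')
    u = u.replace('A', '')   # '0A' -> '0'
    u = u.replace('0E', '-')
    u = u.replace('0H', ',')
    u = u.replace('0I', '_')
    u = u.replace('0B', '.')
    return index + u

url = ('http://rss.feedsportal.com/c/32739/f/530239/s/1a2b3c4d/l/'
       '0Lbydgoszcz0Bgazeta0Bpl0Cbydgoszcz0C10H350A630H13450A0A0A0A0H'
       'Przykladowy0Iartykul0Bhtml/story01.htm')
print(decode_feedsportal(url))
# -> http://bydgoszcz.gazeta.pl/bydgoszcz/1,35063,13450000,Przykladowy_artykul.html

Both recipes can be smoke-tested without a full download using calibre's recipe test mode, e.g. ebook-convert gazeta_pl_bydgoszcz.recipe .epub --test -vv, which fetches only a couple of articles per feed.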