mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Updated various Polish recipes
This commit is contained in:
parent
8154607cc1
commit
02f823da68
@ -10,6 +10,7 @@ class Elektroda(BasicNewsRecipe):
|
|||||||
category = 'electronics'
|
category = 'electronics'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
no_stylesheets= True
|
||||||
remove_tags_before=dict(name='span', attrs={'class':'postbody'})
|
remove_tags_before=dict(name='span', attrs={'class':'postbody'})
|
||||||
remove_tags_after=dict(name='td', attrs={'class':'spaceRow'})
|
remove_tags_after=dict(name='td', attrs={'class':'spaceRow'})
|
||||||
remove_tags=[dict(name='a', attrs={'href':'#top'})]
|
remove_tags=[dict(name='a', attrs={'href':'#top'})]
|
||||||
|
@ -12,8 +12,8 @@ class Gameplay_pl(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
remove_javascript= True
|
remove_javascript= True
|
||||||
no_stylesheets= True
|
no_stylesheets= True
|
||||||
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
|
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news', 'news_container']})]
|
||||||
remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})]
|
remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi', 'news_tagi']}), dict(attrs={'usemap':'#map'}), dict(name='a', attrs={'class':['pin-it-button', 'twitter-share-button']})]
|
||||||
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
|
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
|
||||||
|
|
||||||
def image_url_processor(self, baseurl, url):
|
def image_url_processor(self, baseurl, url):
|
||||||
|
@ -12,10 +12,13 @@ class Gram_pl(BasicNewsRecipe):
|
|||||||
no_stylesheets= True
|
no_stylesheets= True
|
||||||
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
|
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
|
||||||
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
|
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
|
||||||
remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
|
remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info', 'entry-footer clearfix']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button', 'entry-comment-counter', 'snap_nopreview sharing robots-nocontent']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
|
||||||
keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})]
|
keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']}), dict(name='article')]
|
||||||
feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
|
feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
|
||||||
(u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
|
(u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'),
|
||||||
|
(u'Kolektyw- Indie Games', u'http://indie.gram.pl/feed/'),
|
||||||
|
#(u'Kolektyw- Moto Games', u'http://www.motogames.gram.pl/news.rss')
|
||||||
|
]
|
||||||
|
|
||||||
def parse_feeds (self):
|
def parse_feeds (self):
|
||||||
feeds = BasicNewsRecipe.parse_feeds(self)
|
feeds = BasicNewsRecipe.parse_feeds(self)
|
||||||
|
@ -8,15 +8,21 @@ class Historia_org_pl(BasicNewsRecipe):
|
|||||||
category = 'history'
|
category = 'history'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
remove_empty_feeds=True
|
remove_empty_feeds= True
|
||||||
|
no_stylesheets = True
|
||||||
|
use_embedded_content = True
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
|
||||||
feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=rss'),
|
feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=atom'),
|
||||||
(u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=rss'),
|
(u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=atom'),
|
||||||
(u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=rss'),
|
(u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=atom'),
|
||||||
(u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=rss'),
|
(u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=atom'),
|
||||||
(u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=rss'),
|
(u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=atom'),
|
||||||
(u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=rss'),
|
(u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=atom'),
|
||||||
(u'Rekonstykcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=rss'),
|
(u'Rekonstykcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=atom'),
|
||||||
(u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=rss'),
|
(u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=atom'),
|
||||||
(u'Konkursy'), (u'http://www.historia.org.pl/index.php/konkursy.feed?type=rss')]
|
(u'Konkursy'), (u'http://www.historia.org.pl/index.php/konkursy.feed?type=atom')]
|
||||||
|
|
||||||
|
|
||||||
|
def print_version(self, url):
|
||||||
|
return url + '?tmpl=component&print=1&layout=default&page='
|
@ -11,7 +11,7 @@ class OCLab(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
keep_only_tags=[dict(id='main')]
|
keep_only_tags=[dict(id='main')]
|
||||||
remove_tags_after= dict(attrs={'class':'single-postmetadata'})
|
remove_tags_after= dict(attrs={'class':'single-postmetadata'})
|
||||||
remove_tags=[dict(attrs={'class':['single-postmetadata', 'pagebar']})]
|
remove_tags=[dict(attrs={'class':['single-postmetadata', 'pagebar', 'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-enjoy']})]
|
||||||
feeds = [(u'Wpisy', u'http://oclab.pl/feed/')]
|
feeds = [(u'Wpisy', u'http://oclab.pl/feed/')]
|
||||||
|
|
||||||
|
|
||||||
|
@ -11,70 +11,19 @@ class Polska_times(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
remove_emty_feeds= True
|
remove_emty_feeds= True
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
|
#preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
|
||||||
keep_only_tags= [dict(id=['tytul-artykulu', 'kontent'])]
|
remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})
|
||||||
remove_tags_after= dict(id='material-tagi')
|
remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})]
|
||||||
remove_tags=[dict(attrs={'id':'reklama_srodtekst_0'}), dict(attrs={'id':'material-tagi'}), dict(name='div', attrs={'class':'zakladki'}), dict(attrs={'title':u'CZYTAJ TAKŻE'}), dict(attrs={'id':'podobne'}), dict(name='a', attrs={'href':'http://www.dzienniklodzki.pl/newsletter'})]
|
|
||||||
feeds = [(u'Fakty', u'http://polskatimes.feedsportal.com/c/32980/f/533648/index.rss'), (u'Opinie', u'http://www.polskatimes.pl/rss/opinie.xml'), (u'Sport', u'http://polskatimes.feedsportal.com/c/32980/f/533649/index.rss'), (u'Pieni\u0105dze', u'http://polskatimes.feedsportal.com/c/32980/f/533657/index.rss'), (u'Twoje finanse', u'http://www.polskatimes.pl/rss/twojefinanse.xml'), (u'Kultura', u'http://polskatimes.feedsportal.com/c/32980/f/533650/index.rss'), (u'Dodatki', u'http://www.polskatimes.pl/rss/dodatki.xml')]
|
feeds = [(u'Fakty', u'http://polskatimes.feedsportal.com/c/32980/f/533648/index.rss'), (u'Opinie', u'http://www.polskatimes.pl/rss/opinie.xml'), (u'Sport', u'http://polskatimes.feedsportal.com/c/32980/f/533649/index.rss'), (u'Pieni\u0105dze', u'http://polskatimes.feedsportal.com/c/32980/f/533657/index.rss'), (u'Twoje finanse', u'http://www.polskatimes.pl/rss/twojefinanse.xml'), (u'Kultura', u'http://polskatimes.feedsportal.com/c/32980/f/533650/index.rss'), (u'Dodatki', u'http://www.polskatimes.pl/rss/dodatki.xml')]
|
||||||
|
|
||||||
|
def print_version(self, url):
|
||||||
|
return url.replace('artykul', 'drukuj')
|
||||||
|
|
||||||
def skip_ad_pages(self, soup):
|
def skip_ad_pages(self, soup):
|
||||||
if 'Advertisement' in soup.title:
|
if 'Advertisement' in soup.title:
|
||||||
nexturl=soup.find('a')['href']
|
nexturl=soup.find('a')['href']
|
||||||
return self.index_to_soup(nexturl, raw=True)
|
return self.index_to_soup(nexturl, raw=True)
|
||||||
|
|
||||||
def append_page(self, soup, appendtag):
|
|
||||||
nexturl=soup.find(id='nastepna_strona')
|
|
||||||
while nexturl:
|
|
||||||
soup2= self.index_to_soup(nexturl['href'])
|
|
||||||
nexturl=soup2.find(id='nastepna_strona')
|
|
||||||
pagetext = soup2.find(id='tresc')
|
|
||||||
for dictionary in self.remove_tags:
|
|
||||||
v=pagetext.findAll(attrs=dictionary['attrs'])
|
|
||||||
for delete in v:
|
|
||||||
delete.extract()
|
|
||||||
for b in pagetext.findAll(name='b'):
|
|
||||||
if b.string:
|
|
||||||
if u'CZYTAJ TEŻ' in b.string or u'Czytaj także' in b.string or u'Czytaj też' in b.string or u'Zobacz także' in b.string:
|
|
||||||
b.extract()
|
|
||||||
for center in pagetext.findAll(name='center'):
|
|
||||||
if center.h4:
|
|
||||||
if center.h4.a:
|
|
||||||
center.extract()
|
|
||||||
pos = len(appendtag.contents)
|
|
||||||
appendtag.insert(pos, pagetext)
|
|
||||||
for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}):
|
|
||||||
paginator.extract()
|
|
||||||
|
|
||||||
def image_article(self, soup, appendtag):
|
|
||||||
nexturl=soup.find('a', attrs={'class':'nastepna'})
|
|
||||||
urls=[]
|
|
||||||
while nexturl:
|
|
||||||
if nexturl not in urls:
|
|
||||||
urls.append(nexturl)
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
soup2= self.index_to_soup('http://www.polskatimes.pl/artykul/' + nexturl['href'])
|
|
||||||
nexturl=soup2.find('a', attrs={'class':'nastepna'})
|
|
||||||
if nexturl in urls:
|
|
||||||
break;
|
|
||||||
pagetext = soup2.find(id='galeria-material')
|
|
||||||
pos = len(appendtag.contents)
|
|
||||||
appendtag.insert(pos, '<br />')
|
|
||||||
pos = len(appendtag.contents)
|
|
||||||
appendtag.insert(pos, pagetext)
|
|
||||||
for rem in appendtag.findAll(attrs={'class':['galeriaNawigator', 'miniaturyPojemnik']}):
|
|
||||||
rem.extract()
|
|
||||||
for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}):
|
|
||||||
paginator.extract()
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
if soup.find('a', attrs={'class':'nastepna'}):
|
|
||||||
self.image_article(soup, soup.body)
|
|
||||||
elif soup.find(id='nastepna_strona'):
|
|
||||||
self.append_page(soup, soup.body)
|
|
||||||
return soup
|
|
||||||
|
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/')
|
soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/')
|
||||||
self.cover_url=soup.find(id='pojemnik').img['src']
|
self.cover_url=soup.find(id='pojemnik').img['src']
|
||||||
|
@ -13,10 +13,11 @@ class tvn24(BasicNewsRecipe):
|
|||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
keep_only_tags=[dict(id='tvn24_wiadomosci_detal'), dict(name='h1', attrs={'class':'standardHeader1'}), dict(attrs={'class':['date60m rd5', 'imageBackground fl rd7', 'contentFromCMS']})]
|
keep_only_tags=[dict(name='h1', attrs={'class':'standardHeader1'}), dict(attrs={'class':['date60m rd5', 'imageBackground fl rd7', 'contentFromCMS']}), dict(attrs={'class':'mainLeftColumn'})]
|
||||||
remove_tags_after= dict(name='div', attrs={'class':'socialBoxesBottom'})
|
remove_tags=[dict(attrs={'class':['commentsInfo', 'textSize', 'related newsNews align-right', 'box', 'watchMaterial text']})]
|
||||||
remove_tags=[dict(attrs={'class':['tagi_detal', 'socialBoxesBottom', 'twitterBox', 'commentsInfo', 'textSize', 'obj_ukrytydruk obj_ramka1_r', 'related newsNews align-right', 'box', 'newsUserList', 'watchMaterial text']})]
|
#remove_tags_after= dict(attrs={'class':'articleAuthors mb30 mt5 grey_v6'})
|
||||||
feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), (u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]
|
feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), ]
|
||||||
|
#(u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for item in soup.findAll(style=True):
|
for item in soup.findAll(style=True):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user