From 632ae65855f1c14b75b74e0812c9a875d19af6c5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 21 Mar 2012 08:52:11 +0530
Subject: [PATCH] Updated various Polish recipes

---
 recipes/android_com_pl.recipe |  1 +
 recipes/cgm_pl.recipe         | 13 ++++++++-----
 recipes/elektroda_pl.recipe   | 16 ++++++++++++++++
 recipes/film_web.recipe       |  2 +-
 recipes/gram_pl.recipe        | 36 +++++++++++++++++++++++++++++++++---
 recipes/naczytniki.recipe     |  6 +++---
 recipes/overclock_pl.recipe   | 23 +++++------------------
 recipes/palmtop_pl.recipe     |  4 +++-
 recipes/pc_arena.recipe       | 36 +++++++++++++++---------------------
 recipes/pc_centre_pl.recipe   | 35 +++++++----------------------------
 recipes/tablety_pl.recipe     |  7 ++++---
 recipes/wnp.recipe            |  9 +++++++--
 12 files changed, 103 insertions(+), 85 deletions(-)

diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe
index a44d5e560a..c7a4a97d3c 100644
--- a/recipes/android_com_pl.recipe
+++ b/recipes/android_com_pl.recipe
@@ -6,6 +6,7 @@ class Android_com_pl(BasicNewsRecipe):
     description = 'Android.com.pl - biggest polish Android site'
     category = 'Android, mobile'
     language = 'pl'
+    use_embedded_content=True
     cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png'
     oldest_article = 8
     max_articles_per_feed = 100
diff --git a/recipes/cgm_pl.recipe b/recipes/cgm_pl.recipe
index 673a9f940b..4ab4402c3a 100644
--- a/recipes/cgm_pl.recipe
+++ b/recipes/cgm_pl.recipe
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 class CGM(BasicNewsRecipe):
     title = u'CGM'
@@ -17,9 +18,9 @@ class CGM(BasicNewsRecipe):
     remove_tags_before=dict(id='mainContent')
     remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
     remove_tags=[dict(name='div', attrs={'class':'fbContainer'}),
-        dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
-        dict(id=['movieShare', 'container'])]
-    feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
+        dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
+        dict(id=['movieShare', 'container'])]
+    feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
             (u'Recenzje', u'http://www.cgm.pl/rss,1,news.xml')]
 
 
@@ -33,10 +34,12 @@ class CGM(BasicNewsRecipe):
                 img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
                 gallery.contents[1].name='img'
                 gallery.contents[1]['src']=img
+                pos = len(gallery.contents)
+                gallery.insert(pos, BeautifulSoup('<br/>'))
         for item in soup.findAll(style=True):
             del item['style']
         ad=soup.findAll('a')
         for r in ad:
-            if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
+            if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
                 r.extract()
-        return soup
\ No newline at end of file
+        return soup
diff --git a/recipes/elektroda_pl.recipe b/recipes/elektroda_pl.recipe
index c2123cb8cf..55858020ad 100644
--- a/recipes/elektroda_pl.recipe
+++ b/recipes/elektroda_pl.recipe
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 class Elektroda(BasicNewsRecipe):
     title = u'Elektroda'
@@ -13,3 +14,18 @@ class Elektroda(BasicNewsRecipe):
     remove_tags_after=dict(name='td', attrs={'class':'spaceRow'})
     remove_tags=[dict(name='a', attrs={'href':'#top'})]
     feeds = [(u'Elektroda', u'http://www.elektroda.pl/rtvforum/rss.php')]
+
+
+    def preprocess_html(self, soup):
+        tag=soup.find('span', attrs={'class':'postbody'})
+        if tag:
+            pos = len(tag.contents)
+            tag.insert(pos, BeautifulSoup('<br/>'))
+        return soup
+
+    def parse_feeds (self):
+        feeds = BasicNewsRecipe.parse_feeds(self)
+        for feed in feeds:
+            for article in feed.articles[:]:
+                article.title=article.title[article.title.find("::")+3:]
+        return feeds
diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe
index 0671deec6c..877d4472bc 100644
--- a/recipes/film_web.recipe
+++ b/recipes/film_web.recipe
@@ -13,7 +13,7 @@ class Filmweb_pl(BasicNewsRecipe):
     remove_empty_feeds=True
     extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
     remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
-    keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})]
+    keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
     feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
            (u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
            (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe
index c8655dc9cd..07927796c0 100644
--- a/recipes/gram_pl.recipe
+++ b/recipes/gram_pl.recipe
@@ -9,12 +9,12 @@ class Gram_pl(BasicNewsRecipe):
     oldest_article = 8
     max_articles_per_feed = 100
     no_stylesheets= True
-    extra_css = 'h2 {font-style: italic; font-size:20px;}'
+    extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
     cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
     remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
     keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})]
-    feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'),
-           (u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
+    feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
+           (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
 
     def parse_feeds (self):
         feeds = BasicNewsRecipe.parse_feeds(self)
@@ -23,3 +23,33 @@ class Gram_pl(BasicNewsRecipe):
                 if 'REKLAMA SKLEP' in article.title.upper() or u'ARTYKUŁ:' in article.title.upper():
                     feed.articles.remove(article)
         return feeds
+
+    def append_page(self, soup, appendtag):
+        nexturl = appendtag.find('a', attrs={'class':'cpn'})
+        while nexturl:
+            soup2 = self.index_to_soup('http://www.gram.pl'+ nexturl['href'])
+            r=appendtag.find(id='pgbox')
+            if r:
+                r.extract()
+            pagetext = soup2.find(attrs={'class':'main'})
+            r=pagetext.find('h1')
+            if r:
+                r.extract()
+            r=pagetext.find('h2')
+            if r:
+                r.extract()
+            for r in pagetext.findAll('script'):
+                r.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+            nexturl = appendtag.find('a', attrs={'class':'cpn'})
+        r=appendtag.find(id='pgbox')
+        if r:
+            r.extract()
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        tag=soup.findAll(name='div', attrs={'class':'picbox'})
+        for t in tag:
+            t['style']='float: left;'
+        return soup
\ No newline at end of file
diff --git a/recipes/naczytniki.recipe b/recipes/naczytniki.recipe
index 2ae6bc391e..3d1a8b6095 100644
--- a/recipes/naczytniki.recipe
+++ b/recipes/naczytniki.recipe
@@ -7,12 +7,12 @@ class naczytniki(BasicNewsRecipe):
     cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
     language = 'pl'
     description ='everything about e-readers'
-    category='readers'
+    category='e-readers'
    no_stylesheets=True
+    use_embedded_content=False
     oldest_article = 7
     max_articles_per_feed = 100
     preprocess_regexps = [(re.compile(ur'<p><br><b>Zobacz także:</b></p>.*?</body>', re.DOTALL), lambda match: '</body>') ]
-    remove_tags_after= dict(name='div', attrs={'class':'sociable'})
     keep_only_tags=[dict(name='div', attrs={'class':'post'})]
     remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})]
-    feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
+    feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
\ No newline at end of file
diff --git a/recipes/overclock_pl.recipe b/recipes/overclock_pl.recipe
index d7f4c8093d..953dee67eb 100644
--- a/recipes/overclock_pl.recipe
+++ b/recipes/overclock_pl.recipe
@@ -17,21 +17,8 @@ class Overclock_pl(BasicNewsRecipe):
     remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})]
     feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')]
 
-
-    def append_page(self, soup, appendtag):
-        tag=soup.find(id='navigation')
-        if tag:
-            nexturl=tag.findAll('option')
-            tag.extract()
-            for nextpage in nexturl[2:]:
-                soup2 = self.index_to_soup(nextpage['value'])
-                pagetext = soup2.find(id='content')
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-            rem=appendtag.find(attrs={'alt':'Pierwsza'})
-            if rem:
-                rem.parent.extract()
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
\ No newline at end of file
+    def print_version(self, url):
+        if 'articles/show' in url:
+            return url.replace('show', 'showall')
+        else:
+            return url
\ No newline at end of file
diff --git a/recipes/palmtop_pl.recipe b/recipes/palmtop_pl.recipe
index ace772e7e7..87da5d0d1c 100644
--- a/recipes/palmtop_pl.recipe
+++ b/recipes/palmtop_pl.recipe
@@ -10,5 +10,7 @@ class palmtop_pl(BasicNewsRecipe):
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
-
+    use_embedded_content=True
+    #remove_tags_before=dict(name='h2')
+    #remove_tags_after=dict(attrs={'class':'entry clearfix'})
     feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')]
diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe
index faefeb25c0..952db30c3e 100644
--- a/recipes/pc_arena.recipe
+++ b/recipes/pc_arena.recipe
@@ -1,31 +1,25 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class PC_Arena(BasicNewsRecipe):
     title = u'PCArena'
-    oldest_article = 18300
+    oldest_article = 7
     max_articles_per_feed = 100
     __author__ = 'fenuks'
     description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
     category = 'IT'
     language = 'pl'
-    masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif'
-    cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif'
+    masthead_url='http://pcarena.pl/pcarena/img/logo.png'
+    cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
     no_stylesheets = True
-    keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
-    remove_tags=[dict(attrs={'class':'pages'})]
-    feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')]
+    remove_empty_feeds=True
+    #keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
+    #remove_tags=[dict(attrs={'class':'pages'})]
+    feeds = [(u'Aktualności', u'http://pcarena.pl/aktualnosci/feeds.rss'), (u'Testy', u'http://pcarena.pl/testy/feeds.rss'), (u'Software', u'http://pcarena.pl/oprogramowanie/feeds.rss'), (u'Poradniki', u'http://pcarena.pl/poradniki/feeds.rss'), (u'Mobile', u'http://pcarena.pl/mobile/feeds.rss')]
+
+    def print_version(self, url):
+        return url.replace('show', 'print')
 
-    def append_page(self, soup, appendtag):
-        tag=soup.find(name='div', attrs={'class':'pagNum'})
-        if tag:
-            nexturl=tag.findAll('a')
-            tag.extract()
-            for nextpage in nexturl[1:]:
-                nextpage= 'http://pcarena.pl' + nextpage['href']
-                soup2 = self.index_to_soup(nextpage)
-                pagetext = soup2.find(attrs={'class':'artBody'})
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
\ No newline at end of file
+    def image_url_processor(self, baseurl, url):
+        if 'http' not in url:
+            return 'http://pcarena.pl' + url
+        else:
+            return url
\ No newline at end of file
diff --git a/recipes/pc_centre_pl.recipe b/recipes/pc_centre_pl.recipe
index 68a17888ce..f4eccd70a0 100644
--- a/recipes/pc_centre_pl.recipe
+++ b/recipes/pc_centre_pl.recipe
@@ -10,32 +10,11 @@ class PC_Centre(BasicNewsRecipe):
     masthead_url= 'http://pccentre.pl/views/images/logo.gif'
     cover_url= 'http://pccentre.pl/views/images/logo.gif'
     no_stylesheets = True
-    keep_only_tags= [dict(id='content')]
-    remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
-    feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]
+    remove_empty_feeds = True
+    #keep_only_tags= [dict(id='content')]
+    #remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
+    remove_tags=[dict(attrs={'class':'logo_print'})]
+    feeds = [(u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]
 
-
-    def append_page(self, soup, appendtag):
-        tag=soup.find(name='div', attrs={'class':'pages'})
-        if tag:
-            nexturl=tag.findAll('a')
-            tag.extract()
-            for nextpage in nexturl[:-1]:
-                nextpage= 'http://pccentre.pl' + nextpage['href']
-                soup2 = self.index_to_soup(nextpage)
-                pagetext = soup2.find(id='content')
-                rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']})
-                for r in rem:
-                    r.extract()
-                rem=pagetext.findAll(id='comments')
-                for r in rem:
-                    r.extract()
-                rem=pagetext.findAll('h1')
-                for r in rem:
-                    r.extract()
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
\ No newline at end of file
+    def print_version(self, url):
+        return url.replace('show', 'print')
\ No newline at end of file
diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe
index f4c1efa9b8..1c3f46f967 100644
--- a/recipes/tablety_pl.recipe
+++ b/recipes/tablety_pl.recipe
@@ -8,10 +8,11 @@ class Tablety_pl(BasicNewsRecipe):
     cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
     category = 'IT'
     language = 'pl'
+    use_embedded_content=True
     oldest_article = 8
     max_articles_per_feed = 100
     preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</strong></p>', re.DOTALL), lambda match: '')]
-    remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
-    remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'})
-    remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})]
+    #remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
+    #remove_tags_after=dict(name="footer", attrs={'class':'entry-footer clearfix'})
+    #remove_tags=[dict(name='footer', attrs={'class':'entry-footer clearfix'}), dict(name='div', attrs={'class':'entry-comment-counter'})]
     feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]
diff --git a/recipes/wnp.recipe b/recipes/wnp.recipe
index e53e4cc66b..ee87112437 100644
--- a/recipes/wnp.recipe
+++ b/recipes/wnp.recipe
@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
+import re
 
 class AdvancedUserRecipe1312886443(BasicNewsRecipe):
     title = u'WNP'
@@ -8,10 +8,11 @@ class AdvancedUserRecipe1312886443(BasicNewsRecipe):
     description = u'Wirtualny Nowy Przemysł'
     category = 'economy'
     language = 'pl'
+    preprocess_regexps = [(re.compile(ur'Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Czytaj więcej:.*?', re.DOTALL), lambda match: '')]
     oldest_article = 8
     max_articles_per_feed = 100
     no_stylesheets= True
-    keep_only_tags = dict(name='div', attrs={'id':'contentText'})
+    remove_tags=[dict(attrs={'class':'printF'})]
     feeds = [(u'Wiadomości gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'),
         (u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'),
         (u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'),
@@ -19,3 +20,7 @@
         (u'Serwis Górnictwo', u'http://www.wnp.pl/rss/serwis_rss_4.xml'),
         (u'Serwis Logistyka', u'http://www.wnp.pl/rss/serwis_rss_5.xml'),
         (u'Serwis IT', u'http://www.wnp.pl/rss/serwis_rss_6.xml')]
+
+
+    def print_version(self, url):
+        return 'http://wnp.pl/drukuj/' +url[url.find(',')+1:]
\ No newline at end of file