From 5da024c674a495b9d40266ff2ecbb7bfa0b46bc9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 14 Dec 2012 15:02:01 +0530 Subject: [PATCH] Updated various Polish recipes --- recipes/adventure_zone_pl.recipe | 9 +++--- recipes/android_com_pl.recipe | 6 ++-- recipes/dzieje_pl.recipe | 50 ++++++++++++++++++++++++++++++-- recipes/film_web.recipe | 6 ++++ recipes/gildia_pl.recipe | 12 +++++--- recipes/gram_pl.recipe | 49 ++++++++++++------------------- recipes/historia_pl.recipe | 19 ++++++------ recipes/kosmonauta_pl.recipe | 17 ++++++++++- recipes/ksiazka_pl.recipe | 5 ++-- recipes/mlody_technik_pl.recipe | 2 +- 10 files changed, 117 insertions(+), 58 deletions(-) diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 485a2e0c5b..2224937f3c 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -9,11 +9,12 @@ class Adventure_zone(BasicNewsRecipe): no_stylesheets = True oldest_article = 20 max_articles_per_feed = 100 + cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png' index='http://www.adventure-zone.info/fusion/' use_embedded_content=False preprocess_regexps = [(re.compile(r"Komentarze", re.IGNORECASE), lambda m: ''), - (re.compile(r'\'), lambda match: ''), - (re.compile(r'\'), lambda match: '')] + (re.compile(r''), lambda match: ''), + (re.compile(r''), lambda match: '')] remove_tags_before= dict(name='td', attrs={'class':'main-bg'}) remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})] remove_tags_after= dict(id='comments') @@ -36,11 +37,11 @@ class Adventure_zone(BasicNewsRecipe): return feeds - def get_cover_url(self): + '''def get_cover_url(self): soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php') cover=soup.find(id='box_OstatninumerAZ') self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src'] - return getattr(self, 'cover_url', self.cover_url) + return getattr(self, 'cover_url', self.cover_url)''' def skip_ad_pages(self, soup): diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe index c7a4a97d3c..a4a387d414 100644 --- a/recipes/android_com_pl.recipe +++ b/recipes/android_com_pl.recipe @@ -3,11 +3,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class Android_com_pl(BasicNewsRecipe): title = u'Android.com.pl' __author__ = 'fenuks' - description = 'Android.com.pl - biggest polish Android site' + description = u'Android.com.pl - to największe w Polsce centrum Android OS. Znajdziesz tu: nowości, forum, pomoc, recenzje, gry, aplikacje.' 
category = 'Android, mobile' language = 'pl' use_embedded_content=True - cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png' + cover_url =u'http://android.com.pl/wp-content/themes/android/images/logo.png' oldest_article = 8 max_articles_per_feed = 100 - feeds = [(u'Android', u'http://android.com.pl/component/content/frontpage/frontpage.feed?type=rss')] + feeds = [(u'Android', u'http://android.com.pl/feed/')] diff --git a/recipes/dzieje_pl.recipe b/recipes/dzieje_pl.recipe index 0aafa5d2f4..603591e9f0 100644 --- a/recipes/dzieje_pl.recipe +++ b/recipes/dzieje_pl.recipe @@ -7,18 +7,64 @@ class Dzieje(BasicNewsRecipe): cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png' category = 'history' language = 'pl' - index='http://dzieje.pl' + ignore_duplicate_articles = {'title', 'url'} + index = 'http://dzieje.pl' oldest_article = 8 max_articles_per_feed = 100 remove_javascript=True no_stylesheets= True keep_only_tags = [dict(name='h1', attrs={'class':'title'}), dict(id='content-area')] remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory')] - feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')] + #feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')] + def append_page(self, soup, appendtag): + tag = appendtag.find('li', attrs={'class':'pager-next'}) + if tag: + while tag: + url = tag.a['href'] + if not url.startswith('http'): + url = 'http://dzieje.pl'+tag.a['href'] + soup2 = self.index_to_soup(url) + pagetext = soup2.find(id='content-area').find(attrs={'class':'content'}) + for r in pagetext.findAll(attrs={'class':['fieldgroup group-groupkul', 'fieldgroup group-zdjeciekult', 'fieldgroup group-zdjecieciekaw', 'fieldgroup group-zdjecieksiazka', 'fieldgroup group-zdjeciedu', 'field field-type-filefield field-field-zdjecieglownawyd']}): + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = soup2.find('li', attrs={'class':'pager-next'}) + for r in appendtag.findAll(attrs={'class':['item-list', 'field field-type-computed field-field-tagi', ]}): + r.extract() + + def find_articles(self, url): + articles = [] + soup=self.index_to_soup(url) + tag=soup.find(id='content-area').div.div + for i in tag.findAll('div', recursive=False): + temp = i.find(attrs={'class':'views-field-title'}).span.a + title = temp.string + url = self.index + temp['href'] + date = '' #i.find(attrs={'class':'views-field-created'}).span.string + articles.append({'title' : title, + 'url' : url, + 'date' : date, + 'description' : '' + }) + return articles + + def parse_index(self): + feeds = [] + feeds.append((u"Wiadomości", self.find_articles('http://dzieje.pl/wiadomosci'))) + feeds.append((u"Kultura i sztuka", self.find_articles('http://dzieje.pl/kulturaisztuka'))) + feeds.append((u"Film", self.find_articles('http://dzieje.pl/kino'))) + feeds.append((u"Rozmaitości historyczne", self.find_articles('http://dzieje.pl/rozmaitości'))) + feeds.append((u"Książka", self.find_articles('http://dzieje.pl/ksiazka'))) + feeds.append((u"Wystawa", self.find_articles('http://dzieje.pl/wystawa'))) + feeds.append((u"Edukacja", self.find_articles('http://dzieje.pl/edukacja'))) + feeds.append((u"Dzieje się", self.find_articles('http://dzieje.pl/wydarzenia'))) + return feeds def preprocess_html(self, soup): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] + self.append_page(soup, soup.body) return soup \ No 
newline at end of file diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 01d7514e0d..6b014e8f93 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -17,6 +17,7 @@ class FilmWebPl(BasicNewsRecipe): preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] + remove_attributes = ['style',] keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), @@ -50,4 +51,9 @@ class FilmWebPl(BasicNewsRecipe): for i in soup.findAll('sup'): if not i.string or i.string.startswith('(kliknij'): i.extract() + tag = soup.find(name='ul', attrs={'class':'inline sep-line'}) + if tag: + tag.name = 'div' + for t in tag.findAll('li'): + t.name = 'div' return soup diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe index def57203e4..525cf6c605 100644 --- a/recipes/gildia_pl.recipe +++ b/recipes/gildia_pl.recipe @@ -4,9 +4,10 @@ import re class Gildia(BasicNewsRecipe): title = u'Gildia.pl' __author__ = 'fenuks' - description = 'Gildia - cultural site' + description = u'Fantastyczny Portal Kulturalny - newsy, recenzje, galerie, wywiady. Literatura, film, gry komputerowe i planszowe, komiks, RPG, sklep. Nie lekceważ potęgi wyobraźni!' cover_url = 'http://www.film.gildia.pl/_n_/portal/redakcja/logo/logo-gildia.pl-500.jpg' category = 'culture' + cover_url = 'http://gildia.pl/images/logo-main.png' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 @@ -23,10 +24,13 @@ class Gildia(BasicNewsRecipe): content = soup.find('div', attrs={'class':'news'}) if 'recenzj' in soup.title.string.lower(): for link in content.findAll(name='a'): - if 'recenzj' in link['href']: - self.log.warn('odnosnik') - self.log.warn(link['href']) + if 'recenzj' in link['href'] or 'muzyka/plyty' in link['href']: return self.index_to_soup(link['href'], raw=True) + if 'fragmen' in soup.title.string.lower(): + for link in content.findAll(name='a'): + if 'fragment' in link['href']: + return self.index_to_soup(link['href'], raw=True) + def preprocess_html(self, soup): for a in soup('a'): diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe index 79157630f5..3852f65d32 100644 --- a/recipes/gram_pl.recipe +++ b/recipes/gram_pl.recipe @@ -1,19 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.ebooks.BeautifulSoup import BeautifulSoup class Gram_pl(BasicNewsRecipe): title = u'Gram.pl' __author__ = 'fenuks' - description = 'Gram.pl - site about computer games' + description = u'Serwis społecznościowy o grach: recenzje, newsy, zapowiedzi, encyklopedia gier, forum. Gry PC, PS3, X360, PS Vita, sprzęt dla graczy.' 
category = 'games' language = 'pl' oldest_article = 8 index='http://www.gram.pl' max_articles_per_feed = 100 + ignore_duplicate_articles = {'title', 'url'} no_stylesheets= True - extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' + #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' - remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info', 'entry-footer clearfix']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button', 'entry-comment-counter', 'snap_nopreview sharing robots-nocontent', 'sharedaddy sd-sharing-enabled']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])] - keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']}), dict(name='article')] + keep_only_tags= [dict(id='articleModule')] + remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter']})] feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'), (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'), (u'Kolektyw- Indie Games', u'http://indie.gram.pl/feed/'), @@ -28,35 +29,21 @@ class Gram_pl(BasicNewsRecipe): feed.articles.remove(article) return feeds - def append_page(self, soup, appendtag): - nexturl = appendtag.find('a', attrs={'class':'cpn'}) - while nexturl: - soup2 = self.index_to_soup('http://www.gram.pl'+ nexturl['href']) - r=appendtag.find(id='pgbox') - if r: - r.extract() - pagetext = soup2.find(attrs={'class':'main'}) - r=pagetext.find('h1') - if r: - r.extract() - r=pagetext.find('h2') - if r: - r.extract() - for r in pagetext.findAll('script'): - r.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - nexturl = appendtag.find('a', attrs={'class':'cpn'}) - r=appendtag.find(id='pgbox') - if r: - r.extract() def preprocess_html(self, soup): - self.append_page(soup, soup.body) - tag=soup.findAll(name='div', attrs={'class':'picbox'}) - for t in tag: - t['style']='float: left;' + tag=soup.find(name='div', attrs={'class':'summary'}) + if tag: + tag.find(attrs={'class':'pros'}).insert(0, BeautifulSoup('
<h2>Plusy:</h2>').h2) + tag.find(attrs={'class':'cons'}).insert(0, BeautifulSoup('<h2>Minusy:</h2>').h2) + tag = soup.find(name='section', attrs={'class':'cenzurka'}) + if tag: + rate = tag.p.img['data-ocena'] + tag.p.img.extract() + tag.p.insert(len(tag.p.contents)-2, BeautifulSoup('<h2>Ocena: {0}</h2>
'.format(rate)).h2) for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] + tag=soup.find(name='span', attrs={'class':'platforma'}) + if tag: + tag.name = 'p' return soup diff --git a/recipes/historia_pl.recipe b/recipes/historia_pl.recipe index f3353fe89f..60554c0924 100644 --- a/recipes/historia_pl.recipe +++ b/recipes/historia_pl.recipe @@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Historia_org_pl(BasicNewsRecipe): title = u'Historia.org.pl' __author__ = 'fenuks' - description = u'history site' + description = u'Artykuły dotyczące historii w układzie epok i tematów, forum. Najlepsza strona historii. Matura z historii i egzamin gimnazjalny z historii.' cover_url = 'http://lh3.googleusercontent.com/_QeRQus12wGg/TOvHsZ2GN7I/AAAAAAAAD_o/LY1JZDnq7ro/logo5.jpg' category = 'history' language = 'pl' @@ -12,16 +12,15 @@ class Historia_org_pl(BasicNewsRecipe): no_stylesheets = True use_embedded_content = True max_articles_per_feed = 100 + ignore_duplicate_articles = {'title', 'url'} - feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=atom'), - (u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=atom'), - (u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=atom'), - (u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=atom'), - (u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=atom'), - (u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=atom'), - (u'Rekonstykcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=atom'), - (u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=atom'), - (u'Konkursy'), (u'http://www.historia.org.pl/index.php/konkursy.feed?type=atom')] + + feeds = [(u'Wszystkie', u'http://historia.org.pl/feed/'), + (u'Wiadomości', u'http://historia.org.pl/Kategoria/wiadomosci/feed/'), + (u'Publikacje', u'http://historia.org.pl/Kategoria/artykuly/feed/'), + (u'Publicystyka', u'http://historia.org.pl/Kategoria/publicystyka/feed/'), + (u'Recenzje', u'http://historia.org.pl/Kategoria/recenzje/feed/'), + (u'Projekty', u'http://historia.org.pl/Kategoria/projekty/feed/'),] def print_version(self, url): diff --git a/recipes/kosmonauta_pl.recipe b/recipes/kosmonauta_pl.recipe index ddfa26df36..d1caa85950 100644 --- a/recipes/kosmonauta_pl.recipe +++ b/recipes/kosmonauta_pl.recipe @@ -9,6 +9,21 @@ class Kosmonauta(BasicNewsRecipe): language = 'pl' cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg' no_stylesheets = True + INDEX = 'http://www.kosmonauta.net' oldest_article = 7 + no_stylesheets = True max_articles_per_feed = 100 - feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/index.php/feed/rss.html')] + keep_only_tags = [dict(name='div', attrs={'class':'item-page'})] + remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']})] + remove_tags_after = dict(name='div', attrs={'class':'cedtag'}) + feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/?format=feed&type=atom')] + + def preprocess_html(self, soup): + for a in soup.findAll(name='a'): + if a.has_key('href'): + href = a['href'] + if not href.startswith('http'): + a['href'] = self.INDEX + href + print '%%%%%%%%%%%%%%%%%%%%%%%%%', a['href'] + return soup + \ No newline at end of file diff --git a/recipes/ksiazka_pl.recipe b/recipes/ksiazka_pl.recipe index 
7f9999f782..f91cb4f4f7 100644 --- a/recipes/ksiazka_pl.recipe +++ b/recipes/ksiazka_pl.recipe @@ -1,15 +1,16 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class Ksiazka_net_pl(BasicNewsRecipe): - title = u'ksiazka.net.pl' + title = u'książka.net.pl' __author__ = 'fenuks' - description = u'Ksiazka.net.pl - book vortal' + description = u'Portal Księgarski - tematyczny serwis o książkach. Wydarzenia z rynku księgarsko-wydawniczego, nowości, zapowiedzi, bestsellery, setki recenzji. Niezbędne informacje dla każdego miłośnika książek, księgarza, bibliotekarza i wydawcy.' cover_url = 'http://www.ksiazka.net.pl/fileadmin/templates/ksiazka.net.pl/images/1PortalKsiegarski-logo.jpg' category = 'books' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True + remove_empty_feeds = True #extra_css = 'img {float: right;}' preprocess_regexps = [(re.compile(ur'Podoba mi się, kupuję:'), lambda match: '
')] remove_tags_before= dict(name='div', attrs={'class':'m-body'}) diff --git a/recipes/mlody_technik_pl.recipe b/recipes/mlody_technik_pl.recipe index d019efb94c..741397d08a 100644 --- a/recipes/mlody_technik_pl.recipe +++ b/recipes/mlody_technik_pl.recipe @@ -2,7 +2,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Mlody_technik(BasicNewsRecipe): - title = u'Mlody technik' + title = u'Młody technik' __author__ = 'fenuks' description = u'Młody technik' category = 'science'