diff --git a/recipes/archeowiesci.recipe b/recipes/archeowiesci.recipe index 3c93d3644f..e121ba4d42 100644 --- a/recipes/archeowiesci.recipe +++ b/recipes/archeowiesci.recipe @@ -7,6 +7,7 @@ class Archeowiesci(BasicNewsRecipe): language = 'pl' cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg' oldest_article = 7 + needs_subscription='optional' max_articles_per_feed = 100 auto_cleanup = True remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})] @@ -16,6 +17,16 @@ class Archeowiesci(BasicNewsRecipe): feeds = BasicNewsRecipe.parse_feeds(self) for feed in feeds: for article in feed.articles[:]: - if 'subskrypcja' in article.title: + if self.username is None and 'subskrypcja' in article.title: feed.articles.remove(article) return feeds + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://archeowiesci.pl/wp-login.php') + br.select_form(name='loginform') + br['log'] = self.username + br['pwd'] = self.password + br.submit() + return br \ No newline at end of file diff --git a/recipes/astronomia_pl.recipe b/recipes/astronomia_pl.recipe index a142520ec5..89a0e4c889 100644 --- a/recipes/astronomia_pl.recipe +++ b/recipes/astronomia_pl.recipe @@ -1,15 +1,18 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Astronomia_pl(BasicNewsRecipe): title = u'Astronomia.pl' __author__ = 'fenuks' description = 'Astronomia - polish astronomy site' + masthead_url = 'http://www.astronomia.pl/grafika/logo.gif' cover_url = 'http://www.astronomia.pl/grafika/logo.gif' category = 'astronomy, science' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 - #no_stylesheets=True + extra_css='#h2 {font-size: 18px;}' + no_stylesheets=True + preprocess_regexps = [(re.compile(ur'Przeczytaj także:.*?', re.DOTALL), lambda match: '') ] remove_tags_before=dict(name='div', attrs={'id':'a1'}) keep_only_tags=[dict(name='div', attrs={'id':['a1', 'h2']})] feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')] diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index d5b4997aa7..cc74cc9128 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -4,16 +4,17 @@ class Benchmark_pl(BasicNewsRecipe): title = u'Benchmark.pl' __author__ = 'fenuks' description = u'benchmark.pl -IT site' + masthead_url = 'http://www.benchmark.pl/i/logo-footer.png' cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets=True - preprocess_regexps = [(re.compile(ur'\bWięcej o .*', re.DOTALL|re.IGNORECASE), lambda match: '')] + preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})] + remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] INDEX= 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] diff --git a/recipes/biolog_pl.recipe b/recipes/biolog_pl.recipe index af9ad77e44..b10bf0d925 100644 --- a/recipes/biolog_pl.recipe +++ b/recipes/biolog_pl.recipe @@ -10,10 +10,11 @@ class Biolog_pl(BasicNewsRecipe): description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.' category = 'biology' language = 'pl' + masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png' cover_url='http://www.biolog.pl/naukowy,portal,biolog.png' no_stylesheets = True #keeps_only_tags=[dict(id='main')] remove_tags_before=dict(id='main') remove_tags_after=dict(name='a', attrs={'name':'komentarze'}) - remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})] + remove_tags=[dict(name='img', attrs={'alt':'Komentarze'}), dict(name='span', attrs={'class':'menu_odsylacze'})] feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')] diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe index b4cf6b326c..ff46774dc9 100644 --- a/recipes/cd_action.recipe +++ b/recipes/cd_action.recipe @@ -1,16 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe - class CD_Action(BasicNewsRecipe): title = u'CD-Action' __author__ = 'fenuks' - description = 'cdaction.pl - polish magazine about games site' + description = 'cdaction.pl - polish games magazine site' category = 'games' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG' keep_only_tags= dict(id='news_content') remove_tags_after= dict(name='div', attrs={'class':'tresc'}) feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')] + + + def get_cover_url(self): + soup = self.index_to_soup('http://www.cdaction.pl/magazyn/') + self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/cgm_pl.recipe b/recipes/cgm_pl.recipe index 591155ff85..673a9f940b 100644 --- a/recipes/cgm_pl.recipe +++ b/recipes/cgm_pl.recipe @@ -5,6 +5,7 @@ class CGM(BasicNewsRecipe): oldest_article = 7 __author__ = 'fenuks' description = u'Codzienna Gazeta Muzyczna' + masthead_url='http://www.cgm.pl/img/header/logo.gif' cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg' category = 'music' language = 'pl' @@ -23,21 +24,19 @@ class CGM(BasicNewsRecipe): def preprocess_html(self, soup): + gallery=soup.find('div', attrs={'class':'galleryFlash'}) + if gallery: + img=gallery.div + gallery.img.extract() + if img: + img=img['style'] + img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')] + gallery.contents[1].name='img' + gallery.contents[1]['src']=img for item in soup.findAll(style=True): del item['style'] ad=soup.findAll('a') for r in ad: - if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']: + if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']: r.extract() - gallery=soup.find('div', attrs={'class':'galleryFlash'}) - if gallery: - img=gallery.find('embed') - if img: - img=img['src'][35:] - img='http://www.cgm.pl/_vault/_gallery/_photo/'+img - param=gallery.findAll(name='param') - for i in param: - i.extract() - gallery.contents[1].name='img' - gallery.contents[1]['src']=img return soup \ No newline at end of file diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index 90b7d63c56..2ec457e4de 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -7,10 +7,11 @@ class Computerworld_pl(BasicNewsRecipe): description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne' category = 'IT' language = 'pl' + masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif' no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 - keep_only_tags=[dict(name='div', attrs={'id':'s'})] + keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})] remove_tags_after=dict(name='div', attrs={'class':'rMobi'}) remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})] feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index 72f9c966bd..a27a9b0877 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -7,6 +7,7 @@ class Dobreprogramy_pl(BasicNewsRecipe): __licence__ ='GPL v3' category = 'IT' language = 'pl' + masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png' cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' description = u'Aktualności i blogi z dobreprogramy.pl' encoding = 'utf-8' @@ -16,7 +17,8 @@ class Dobreprogramy_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 preprocess_regexps = [(re.compile(ur'
Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...
'), lambda match: '') ] - remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] - keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})] + keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] + remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})] + #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/dziennik_pl.recipe b/recipes/dziennik_pl.recipe index b5453659ef..6da7e0240d 100644 --- a/recipes/dziennik_pl.recipe +++ b/recipes/dziennik_pl.recipe @@ -8,15 +8,17 @@ class Dziennik_pl(BasicNewsRecipe): description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.' category = 'newspaper' language = 'pl' - cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg' + masthead_url= 'http://5.s.dziennik.pl/images/logos.png' + cover_url= 'http://5.s.dziennik.pl/images/logos.png' no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100 remove_javascript=True remove_empty_feeds=True - preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')] + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' + preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('

>>> CZYTAJ TAKŻE: ".*?"

'), lambda m: '')] keep_only_tags=[dict(id='article')] - remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})] + remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})] feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'), (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'), (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'), @@ -30,6 +32,12 @@ class Dziennik_pl(BasicNewsRecipe): (u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'), (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')] + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + def append_page(self, soup, appendtag): tag=soup.find('a', attrs={'class':'page_next'}) if tag: @@ -56,3 +64,4 @@ class Dziennik_pl(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) return soup + diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 1c72e5704e..0671deec6c 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -10,7 +10,8 @@ class Filmweb_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - extra_css = '.hdrBig {font-size:22px;}' + remove_empty_feeds=True + extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})] feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 0959ff80a3..489caf231f 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -4,10 +4,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class Gazeta_Wyborcza(BasicNewsRecipe): title = u'Gazeta Wyborcza' __author__ = 'fenuks' - cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' language = 'pl' description ='news from gazeta.pl' category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' INDEX='http://wyborcza.pl' remove_empty_feeds= True oldest_article = 3 @@ -81,3 +82,10 @@ class Gazeta_Wyborcza(BasicNewsRecipe): return url else: return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + + def get_cover_url(self): + soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') + cover=soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) + self.cover_url='http://wyborcza.pl' + soup.img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index d9c461dc63..e188e4988c 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -8,29 +8,31 @@ class Gry_online_pl(BasicNewsRecipe): language = 'pl' oldest_article = 13 INDEX= 'http://www.gry-online.pl/' - cover_url='http://www.gry-online.pl/img/1st_10/1st-gol-logo.png' + masthead_url='http://www.gry-online.pl/im/gry-online-logo.png' + cover_url='http://www.gry-online.pl/im/gry-online-logo.png' max_articles_per_feed = 100 no_stylesheets= True - extra_css = 'p.wn1{font-size:22px;}' - remove_tags_after= [dict(name='div', attrs={'class':['tresc-newsa']})] - keep_only_tags = [dict(name='div', attrs={'class':['txthead']}), dict(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}), dict(name='a', attrs={'class':['num_str_nex']})] - #remove_tags= [dict(name='div', attrs={'class':['news_plat']})] + keep_only_tags=[dict(name='div', attrs={'class':'gc660'})] + remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})] feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')] def append_page(self, soup, appendtag): - nexturl = soup.find('a', attrs={'class':'num_str_nex'}) - if appendtag.find('a', attrs={'class':'num_str_nex'}) is not None: - appendtag.find('a', attrs={'class':'num_str_nex'}).replaceWith('\n') - if nexturl is not None: - if 'strona' in nexturl.div.string: - nexturl= self.INDEX + nexturl['href'] - soup2 = self.index_to_soup(nexturl) - pagetext = soup2.findAll(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}) - for tag in pagetext: - pos = len(appendtag.contents) - appendtag.insert(pos, tag) - self.append_page(soup2, appendtag) + tag = appendtag.find('div', attrs={'class':'n5p'}) + if tag: + nexturls=tag.findAll('a') + for nexturl in nexturls[1:]: + try: + soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href']) + except: + soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href']) + pagetext = soup2.find(attrs={'class':'gc660'}) + for r in pagetext.findAll(name='header'): + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}): + r.extract() def preprocess_html(self, soup): diff --git a/recipes/marketing_magazine.recipe b/recipes/marketing_magazine.recipe index 55b6ea2584..59a7c95843 100644 --- a/recipes/marketing_magazine.recipe +++ b/recipes/marketing_magazine.recipe @@ -1,4 +1,5 @@ __license__ = 'GPL v3' + from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1327062445(BasicNewsRecipe): @@ -7,10 +8,13 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe): max_articles_per_feed = 100 auto_cleanup = True remove_javascript = True - masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg' - feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] + no_stylesheets = True __author__ = 'faber1971' - description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)' + description = 'Collection of Italian marketing websites - v1.01 (19, February 2012)' language = 'it' - + remove_tags = [ + dict(name='ul', attrs={'id':'ads0'}) + ] + masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg' + feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] diff --git a/recipes/naczytniki.recipe b/recipes/naczytniki.recipe index e4769d58bc..2ae6bc391e 100644 --- a/recipes/naczytniki.recipe +++ b/recipes/naczytniki.recipe @@ -1,8 +1,9 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class naczytniki(BasicNewsRecipe): title = u'naczytniki.pl' __author__ = 'fenuks' + masthead_url= 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' language = 'pl' description ='everything about e-readers' @@ -10,6 +11,7 @@ class naczytniki(BasicNewsRecipe): no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'


Zobacz także:

.*?', re.DOTALL), lambda match: '') ] remove_tags_after= dict(name='div', attrs={'class':'sociable'}) keep_only_tags=[dict(name='div', attrs={'class':'post'})] remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})] diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe index 74534f3346..ec556da5fa 100644 --- a/recipes/nowa_fantastyka.recipe +++ b/recipes/nowa_fantastyka.recipe @@ -1,21 +1,33 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe +import re + class Nowa_Fantastyka(BasicNewsRecipe): title = u'Nowa Fantastyka' oldest_article = 7 __author__ = 'fenuks' + __modified_by__ = 'zaslav' language = 'pl' encoding='latin2' description ='site for fantasy readers' category='fantasy' + masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg' + #extra_css='.tytul {font-size: 20px;}' #not working max_articles_per_feed = 100 INDEX='http://www.fantastyka.pl/' no_stylesheets=True needs_subscription = 'optional' - remove_tags_before=dict(attrs={'class':'belka1-tlo-md'}) + remove_tags_before=dict(attrs={'class':'naglowek2'}) #remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'}) - remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'}) - remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})] + remove_tags_after=dict(name='form', attrs={'name':'form1'}) + remove_tags=[dict(attrs={'class':['avatar2', 'belka-margin', 'naglowek2']}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'}), dict(name='form')] + preprocess_regexps = [ + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: '')] + + + def find_articles(self, url): articles = [] @@ -41,10 +53,10 @@ class Nowa_Fantastyka(BasicNewsRecipe): return feeds + def get_cover_url(self): - soup = self.index_to_soup('http://www.fantastyka.pl/1.html') - cover=soup.find(name='img', attrs={'class':'okladka'}) - self.cover_url=self.INDEX+ cover['src'] + soup = self.index_to_soup('http://www.e-kiosk.pl/nowa_fantastyka') + self.cover_url='http://www.e-kiosk.pl' + soup.find(name='a', attrs={'class':'img'})['href'] return getattr(self, 'cover_url', self.cover_url) def get_browser(self): @@ -56,3 +68,18 @@ class Nowa_Fantastyka(BasicNewsRecipe): br['pass'] = self.password br.submit() return br + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(font=True): + del item['font'] + for item in soup.findAll(align=True): + del item['align'] + for item in soup.findAll(name='tr'): + item.name='div' + title=soup.find(attrs={'class':'tytul'}) + if title: + title['style']='font-size: 20px; font-weight: bold;' + self.log.warn(soup) + return soup diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index d06e32d9af..f4c1efa9b8 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -1,14 +1,16 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Tablety_pl(BasicNewsRecipe): title = u'Tablety.pl' __author__ = 'fenuks' description = u'tablety.pl - latest tablet news' + masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'

Przeczytaj także.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

Przeczytaj koniecznie.*?

', re.DOTALL), lambda match: '')] remove_tags_before=dict(name="h1", attrs={'class':'entry-title'}) remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'}) remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})] diff --git a/recipes/ubuntu_pl.recipe b/recipes/ubuntu_pl.recipe index 24212e8608..84912e44fa 100644 --- a/recipes/ubuntu_pl.recipe +++ b/recipes/ubuntu_pl.recipe @@ -4,10 +4,12 @@ class Ubuntu_pl(BasicNewsRecipe): title = u'UBUNTU.pl' __author__ = 'fenuks' description = 'UBUNTU.pl - polish ubuntu community site' + masthead_url= 'http://ubuntu.pl/img/logo.jpg' cover_url = 'http://ubuntu.pl/img/logo.jpg' category = 'linux, IT' language = 'pl' no_stylesheets = True + remove_empty_feeds = True oldest_article = 8 max_articles_per_feed = 100 extra_css = '#main {text-align:left;}'