diff --git a/recipes/archeowiesci.recipe b/recipes/archeowiesci.recipe index 3c93d3644f..e121ba4d42 100644 --- a/recipes/archeowiesci.recipe +++ b/recipes/archeowiesci.recipe @@ -7,6 +7,7 @@ class Archeowiesci(BasicNewsRecipe): language = 'pl' cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg' oldest_article = 7 + needs_subscription='optional' max_articles_per_feed = 100 auto_cleanup = True remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})] @@ -16,6 +17,16 @@ class Archeowiesci(BasicNewsRecipe): feeds = BasicNewsRecipe.parse_feeds(self) for feed in feeds: for article in feed.articles[:]: - if 'subskrypcja' in article.title: + if self.username is None and 'subskrypcja' in article.title: feed.articles.remove(article) return feeds + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://archeowiesci.pl/wp-login.php') + br.select_form(name='loginform') + br['log'] = self.username + br['pwd'] = self.password + br.submit() + return br \ No newline at end of file diff --git a/recipes/astronomia_pl.recipe b/recipes/astronomia_pl.recipe index a142520ec5..89a0e4c889 100644 --- a/recipes/astronomia_pl.recipe +++ b/recipes/astronomia_pl.recipe @@ -1,15 +1,18 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Astronomia_pl(BasicNewsRecipe): title = u'Astronomia.pl' __author__ = 'fenuks' description = 'Astronomia - polish astronomy site' + masthead_url = 'http://www.astronomia.pl/grafika/logo.gif' cover_url = 'http://www.astronomia.pl/grafika/logo.gif' category = 'astronomy, science' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 - #no_stylesheets=True + extra_css='#h2 {font-size: 18px;}' + no_stylesheets=True + preprocess_regexps = [(re.compile(ur'Przeczytaj także:.*?', re.DOTALL), lambda match: '') ] remove_tags_before=dict(name='div', attrs={'id':'a1'}) 
keep_only_tags=[dict(name='div', attrs={'id':['a1', 'h2']})] feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')] diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index d5b4997aa7..cc74cc9128 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -4,16 +4,17 @@ class Benchmark_pl(BasicNewsRecipe): title = u'Benchmark.pl' __author__ = 'fenuks' description = u'benchmark.pl -IT site' + masthead_url = 'http://www.benchmark.pl/i/logo-footer.png' cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets=True - preprocess_regexps = [(re.compile(ur'\bWięcej o .*', re.DOTALL|re.IGNORECASE), lambda match: '')] + preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})] + remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] INDEX= 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] diff --git a/recipes/biolog_pl.recipe b/recipes/biolog_pl.recipe index af9ad77e44..b10bf0d925 100644 --- a/recipes/biolog_pl.recipe +++ b/recipes/biolog_pl.recipe @@ -10,10 +10,11 @@ class Biolog_pl(BasicNewsRecipe): description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.' 
category = 'biology' language = 'pl' + masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png' cover_url='http://www.biolog.pl/naukowy,portal,biolog.png' no_stylesheets = True #keeps_only_tags=[dict(id='main')] remove_tags_before=dict(id='main') remove_tags_after=dict(name='a', attrs={'name':'komentarze'}) - remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})] + remove_tags=[dict(name='img', attrs={'alt':'Komentarze'}), dict(name='span', attrs={'class':'menu_odsylacze'})] feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')] diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe index b4cf6b326c..ff46774dc9 100644 --- a/recipes/cd_action.recipe +++ b/recipes/cd_action.recipe @@ -1,16 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe - class CD_Action(BasicNewsRecipe): title = u'CD-Action' __author__ = 'fenuks' - description = 'cdaction.pl - polish magazine about games site' + description = 'cdaction.pl - polish games magazine site' category = 'games' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG' keep_only_tags= dict(id='news_content') remove_tags_after= dict(name='div', attrs={'class':'tresc'}) feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')] + + + def get_cover_url(self): + soup = self.index_to_soup('http://www.cdaction.pl/magazyn/') + self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/cgm_pl.recipe 
b/recipes/cgm_pl.recipe index 591155ff85..673a9f940b 100644 --- a/recipes/cgm_pl.recipe +++ b/recipes/cgm_pl.recipe @@ -5,6 +5,7 @@ class CGM(BasicNewsRecipe): oldest_article = 7 __author__ = 'fenuks' description = u'Codzienna Gazeta Muzyczna' + masthead_url='http://www.cgm.pl/img/header/logo.gif' cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg' category = 'music' language = 'pl' @@ -23,21 +24,19 @@ class CGM(BasicNewsRecipe): def preprocess_html(self, soup): + gallery=soup.find('div', attrs={'class':'galleryFlash'}) + if gallery: + img=gallery.div + gallery.img.extract() + if img: + img=img['style'] + img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')] + gallery.contents[1].name='img' + gallery.contents[1]['src']=img for item in soup.findAll(style=True): del item['style'] ad=soup.findAll('a') for r in ad: - if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']: + if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']: r.extract() - gallery=soup.find('div', attrs={'class':'galleryFlash'}) - if gallery: - img=gallery.find('embed') - if img: - img=img['src'][35:] - img='http://www.cgm.pl/_vault/_gallery/_photo/'+img - param=gallery.findAll(name='param') - for i in param: - i.extract() - gallery.contents[1].name='img' - gallery.contents[1]['src']=img return soup \ No newline at end of file diff --git a/recipes/chr_mon.recipe b/recipes/chr_mon.recipe index 6f41b95763..50b626fcbf 100644 --- a/recipes/chr_mon.recipe +++ b/recipes/chr_mon.recipe @@ -33,6 +33,32 @@ class ChristianScienceMonitor(BasicNewsRecipe): remove_javascript = True no_stylesheets = True + requires_version = (0, 8, 39) + + def preprocess_raw_html(self, raw, url): + try: + from html5lib import parse + root = parse(raw, namespaceHTMLElements=False, + treebuilder='lxml').getroot() + from lxml import etree + for tag in root.xpath( + '//script|//style|//noscript|//meta|//link|//object'): + tag.getparent().remove(tag) + for elem in 
list(root.iterdescendants(tag=etree.Comment)): + elem.getparent().remove(elem) + ans = etree.tostring(root, encoding=unicode) + ans = re.sub('.*', lambda match : ''), - (r'
.*?
', lambda m: ''), - (r'Full HTML version of this story which may include photos, graphics, and related links.*', - lambda match : ''), - ]] extra_css = ''' h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large} .sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;} diff --git a/recipes/ciekawostki_historyczne.recipe b/recipes/ciekawostki_historyczne.recipe new file mode 100644 index 0000000000..7c5138196d --- /dev/null +++ b/recipes/ciekawostki_historyczne.recipe @@ -0,0 +1,48 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Ciekawostki_Historyczne(BasicNewsRecipe): + title = u'Ciekawostki Historyczne' + oldest_article = 7 + __author__ = 'fenuks' + description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.' + category = 'history' + language = 'pl' + masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:

.*?', re.DOTALL), lambda match: '')] + no_stylesheets=True + remove_empty_feeds=True + keep_only_tags=[dict(name='div', attrs={'class':'post'})] + remove_tags=[dict(id='singlepostinfo')] + feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='h7') + if tag: + if tag.br: + pass + elif tag.nextSibling.name=='p': + tag=tag.nextSibling + nexturl = tag.findAll('a') + for nextpage in nexturl: + tag.extract() + nextpage= nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(name='div', attrs={'class':'post'}) + for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}): + r.extract() + for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}): + r.extract() + for r in pagetext.findAll('h1'): + r.extract() + pagetext.find('h6').nextSibling.extract() + pagetext.find('h7').nextSibling.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup + + \ No newline at end of file diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index 90b7d63c56..2ec457e4de 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -7,10 +7,11 @@ class Computerworld_pl(BasicNewsRecipe): description = u'Serwis o IT w przemyśle, finansach, 
handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne' category = 'IT' language = 'pl' + masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif' no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 - keep_only_tags=[dict(name='div', attrs={'id':'s'})] + keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})] remove_tags_after=dict(name='div', attrs={'class':'rMobi'}) remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})] feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index 72f9c966bd..a27a9b0877 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -7,6 +7,7 @@ class Dobreprogramy_pl(BasicNewsRecipe): __licence__ ='GPL v3' category = 'IT' language = 'pl' + masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png' cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' description = u'Aktualności i blogi z dobreprogramy.pl' encoding = 'utf-8' @@ -16,7 +17,8 @@ class Dobreprogramy_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 preprocess_regexps = [(re.compile(ur'
Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...
'), lambda match: '') ] - remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] - keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})] + keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] + remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})] + #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/dziennik_pl.recipe b/recipes/dziennik_pl.recipe index b5453659ef..6da7e0240d 100644 --- a/recipes/dziennik_pl.recipe +++ b/recipes/dziennik_pl.recipe @@ -8,15 +8,17 @@ class Dziennik_pl(BasicNewsRecipe): description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.' category = 'newspaper' language = 'pl' - cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg' + masthead_url= 'http://5.s.dziennik.pl/images/logos.png' + cover_url= 'http://5.s.dziennik.pl/images/logos.png' no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100 remove_javascript=True remove_empty_feeds=True - preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')] + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' + preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('

>>> CZYTAJ TAKŻE: ".*?"

'), lambda m: '')] keep_only_tags=[dict(id='article')] - remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})] + remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})] feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'), (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'), (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'), @@ -30,6 +32,12 @@ class Dziennik_pl(BasicNewsRecipe): (u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'), (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')] + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + def append_page(self, soup, appendtag): tag=soup.find('a', attrs={'class':'page_next'}) if tag: @@ -56,3 +64,4 @@ class Dziennik_pl(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) return soup + diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 1c72e5704e..0671deec6c 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -10,7 +10,8 @@ class Filmweb_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - extra_css = '.hdrBig {font-size:22px;}' + remove_empty_feeds=True + extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 
'reviewContent fontSizeCont description']})] feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe new file mode 100644 index 0000000000..f3384263d6 --- /dev/null +++ b/recipes/gameplay_pl.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Gameplay_pl(BasicNewsRecipe): + title = u'Gameplay.pl' + oldest_article = 7 + __author__ = 'fenuks' + description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.' + category = 'games, movies, books, music' + language = 'pl' + masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png' + cover_url= 'http://gameplay.pl/img/gpy_top_logo.png' + max_articles_per_feed = 100 + no_stylesheets= True + keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})] + remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})] + feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')] + + def image_url_processor(self, baseurl, url): + if 'http' not in url: + return 'http://gameplay.pl'+ url[2:] + else: + return url diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 0959ff80a3..489caf231f 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -4,10 +4,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class Gazeta_Wyborcza(BasicNewsRecipe): title = u'Gazeta Wyborcza' __author__ = 'fenuks' - cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' language = 'pl' description ='news from gazeta.pl' category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' INDEX='http://wyborcza.pl' remove_empty_feeds= True oldest_article = 3 @@ -81,3 +82,10 @@ class Gazeta_Wyborcza(BasicNewsRecipe): return url else: return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + + def 
get_cover_url(self): + soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') + cover=soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) + self.cover_url='http://wyborcza.pl' + soup.img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index d9c461dc63..e188e4988c 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -8,29 +8,31 @@ class Gry_online_pl(BasicNewsRecipe): language = 'pl' oldest_article = 13 INDEX= 'http://www.gry-online.pl/' - cover_url='http://www.gry-online.pl/img/1st_10/1st-gol-logo.png' + masthead_url='http://www.gry-online.pl/im/gry-online-logo.png' + cover_url='http://www.gry-online.pl/im/gry-online-logo.png' max_articles_per_feed = 100 no_stylesheets= True - extra_css = 'p.wn1{font-size:22px;}' - remove_tags_after= [dict(name='div', attrs={'class':['tresc-newsa']})] - keep_only_tags = [dict(name='div', attrs={'class':['txthead']}), dict(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}), dict(name='a', attrs={'class':['num_str_nex']})] - #remove_tags= [dict(name='div', attrs={'class':['news_plat']})] + keep_only_tags=[dict(name='div', attrs={'class':'gc660'})] + remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})] feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')] def append_page(self, soup, appendtag): - nexturl = soup.find('a', attrs={'class':'num_str_nex'}) - if appendtag.find('a', attrs={'class':'num_str_nex'}) is not None: - appendtag.find('a', attrs={'class':'num_str_nex'}).replaceWith('\n') - if nexturl is not None: - if 'strona' in nexturl.div.string: - nexturl= self.INDEX + nexturl['href'] - soup2 = self.index_to_soup(nexturl) - pagetext = soup2.findAll(name='p', attrs={'class':['wtx1', 
'wn1', 'wob']}) - for tag in pagetext: - pos = len(appendtag.contents) - appendtag.insert(pos, tag) - self.append_page(soup2, appendtag) + tag = appendtag.find('div', attrs={'class':'n5p'}) + if tag: + nexturls=tag.findAll('a') + for nexturl in nexturls[1:]: + try: + soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href']) + except: + soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href']) + pagetext = soup2.find(attrs={'class':'gc660'}) + for r in pagetext.findAll(name='header'): + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}): + r.extract() def preprocess_html(self, soup): diff --git a/recipes/icons/ciekawostki_historyczne.png b/recipes/icons/ciekawostki_historyczne.png new file mode 100644 index 0000000000..fa0e2c0591 Binary files /dev/null and b/recipes/icons/ciekawostki_historyczne.png differ diff --git a/recipes/icons/gameplay_pl.png b/recipes/icons/gameplay_pl.png new file mode 100644 index 0000000000..1b7081f393 Binary files /dev/null and b/recipes/icons/gameplay_pl.png differ diff --git a/recipes/icons/in4_pl.png b/recipes/icons/in4_pl.png new file mode 100644 index 0000000000..b3351629f0 Binary files /dev/null and b/recipes/icons/in4_pl.png differ diff --git a/recipes/icons/informacje_usa.png b/recipes/icons/informacje_usa.png new file mode 100644 index 0000000000..4c30e3bcbc Binary files /dev/null and b/recipes/icons/informacje_usa.png differ diff --git a/recipes/icons/kresy_pl.png b/recipes/icons/kresy_pl.png new file mode 100644 index 0000000000..db8ef4efec Binary files /dev/null and b/recipes/icons/kresy_pl.png differ diff --git a/recipes/icons/oclab_pl.png b/recipes/icons/oclab_pl.png new file mode 100644 index 0000000000..45ecd2533e Binary files /dev/null and b/recipes/icons/oclab_pl.png differ diff --git a/recipes/icons/overclock_pl.png b/recipes/icons/overclock_pl.png new file 
mode 100644 index 0000000000..38c0b13bfe Binary files /dev/null and b/recipes/icons/overclock_pl.png differ diff --git a/recipes/icons/palmtop_pl.png b/recipes/icons/palmtop_pl.png new file mode 100644 index 0000000000..d711a41682 Binary files /dev/null and b/recipes/icons/palmtop_pl.png differ diff --git a/recipes/icons/pc_arena.png b/recipes/icons/pc_arena.png new file mode 100644 index 0000000000..10be204b36 Binary files /dev/null and b/recipes/icons/pc_arena.png differ diff --git a/recipes/icons/pc_centre_pl.png b/recipes/icons/pc_centre_pl.png new file mode 100644 index 0000000000..e2fbf1eefb Binary files /dev/null and b/recipes/icons/pc_centre_pl.png differ diff --git a/recipes/icons/pc_foster.png b/recipes/icons/pc_foster.png new file mode 100644 index 0000000000..433970bcc1 Binary files /dev/null and b/recipes/icons/pc_foster.png differ diff --git a/recipes/icons/polska_times.png b/recipes/icons/polska_times.png new file mode 100644 index 0000000000..f233f45518 Binary files /dev/null and b/recipes/icons/polska_times.png differ diff --git a/recipes/icons/pure_pc.png b/recipes/icons/pure_pc.png new file mode 100644 index 0000000000..e5e102eee7 Binary files /dev/null and b/recipes/icons/pure_pc.png differ diff --git a/recipes/icons/tanuki.png b/recipes/icons/tanuki.png new file mode 100644 index 0000000000..fe46d7e8dc Binary files /dev/null and b/recipes/icons/tanuki.png differ diff --git a/recipes/icons/tvn24.png b/recipes/icons/tvn24.png new file mode 100644 index 0000000000..864a6624ac Binary files /dev/null and b/recipes/icons/tvn24.png differ diff --git a/recipes/icons/webhosting_pl.png b/recipes/icons/webhosting_pl.png new file mode 100644 index 0000000000..0e11a3065e Binary files /dev/null and b/recipes/icons/webhosting_pl.png differ diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe new file mode 100644 index 0000000000..16ad622b46 --- /dev/null +++ b/recipes/in4_pl.recipe @@ -0,0 +1,44 @@ +from calibre.web.feeds.news import BasicNewsRecipe 
+import re +class in4(BasicNewsRecipe): + title = u'IN4.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Serwis Informacyjny - Aktualnosci, recenzje' + category = 'IT' + language = 'pl' + #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:.*?

', re.DOTALL), lambda match: '')] + keep_only_tags=[dict(name='div', attrs={'class':'box box-single'})] + remove_tags_after= dict(attrs={'class':'tags'}) + remove_tags= [dict(attrs={'class':['postmetadata', 'tags', 'banner']}), dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})] + feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')] diff --git a/recipes/kresy_pl.recipe b/recipes/kresy_pl.recipe new file mode 100644 index 0000000000..3dfc2c057c --- /dev/null +++ b/recipes/kresy_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Kresy(BasicNewsRecipe): + title = u'Kresy' + __author__ = 'fenuks' + description = u'portal społeczności kresowej' + language = 'pl' + masthead_url= 'http://www.kresy.pl/public/img/logo.png' + cover_url= 'http://www.kresy.pl/public/img/logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + keep_only_tags= [dict(id='artykul')] + remove_tags= [dict(attrs={'class':['twitter-share-button', 'likefbborder', 'tagi']})] + feeds = [(u'Wszystkie', u'http://www.kresy.pl/rss')] diff --git a/recipes/la_pausa_caffe.recipe b/recipes/la_pausa_caffe.recipe new file mode 100644 index 0000000000..1a87d33dcf --- /dev/null +++ b/recipes/la_pausa_caffe.recipe @@ -0,0 +1,17 @@ +__version__ = 'v1.0' +__date__ = '13, February 2011' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1329125921(BasicNewsRecipe): + title = u'La pausa caff\xe8' + __author__ = 'faber1971' + description = 'An Italian satirical blog' + language = 'it' + + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + no_stylesheets = True + feeds = [(u'La pausa caff\xe8', u'http://feeds.feedburner.com/LapausaCaffe')] + diff --git a/recipes/marketing_magazine.recipe b/recipes/marketing_magazine.recipe index 55b6ea2584..0c14939cd8 100644 --- a/recipes/marketing_magazine.recipe +++ b/recipes/marketing_magazine.recipe @@ -1,4 +1,5 @@ __license__ = 'GPL v3' + from 
calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1327062445(BasicNewsRecipe): @@ -7,10 +8,13 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe): max_articles_per_feed = 100 auto_cleanup = True remove_javascript = True + no_stylesheets = True + remove_tags = [ + dict(name='ul', attrs={'id':'ads0'}) + ] masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg' - feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] __author__ = 'faber1971' - description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)' + description = 'Collection of Italian marketing websites - v1.03 (20, February 2012)' language = 'it' - + feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index 4540879f72..a5bc4e96f9 100644 --- 
a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -1,16 +1,17 @@ __license__ = 'GPL v3' -__copyright__ = '2009, Mathieu Godlewski ; 2010, Louis Gesbert ' +__copyright__ = '2009, Mathieu Godlewski ; 2010, 2011, Louis Gesbert ' ''' Mediapart ''' -from calibre.ebooks.BeautifulSoup import Tag +import re +from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe class Mediapart(BasicNewsRecipe): title = 'Mediapart' - __author__ = 'Mathieu Godlewski' - description = 'Global news in french from online newspapers' + __author__ = 'Mathieu Godlewski, Louis Gesbert' + description = 'Global news in french from news site Mediapart' oldest_article = 7 language = 'fr' needs_subscription = True @@ -18,52 +19,30 @@ class Mediapart(BasicNewsRecipe): max_articles_per_feed = 50 no_stylesheets = True - cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg' + cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg' feeds = [ ('Les articles', 'http://www.mediapart.fr/articles/feed'), ] -# -- print-version has poor quality on this website, better do the conversion ourselves -# -# preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in -# [ -# (r'', lambda match : '

'+match.group(1)+'

'), -# (r'[^>]+]*>([^<]*)[^<]*', -# lambda match : ''+match.group(1)+''), -# (r'\'', lambda match: '’'), -# ] -# ] -# -# remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}), -# dict(name='div', attrs={'class':'print-links'}), -# dict(name='img', attrs={'src':'entete_article.png'}), -# dict(name='br') ] -# -# def print_version(self, url): -# raw = self.browser.open(url).read() -# soup = BeautifulSoup(raw.decode('utf8', 'replace')) -# div = soup.find('div', {'id':re.compile('node-\d+')}) -# if div is None: -# return None -# article_id = string.replace(div['id'], 'node-', '') -# if article_id is None: -# return None -# return 'http://www.mediapart.fr/print/'+article_id +# -- print-version -# -- Non-print version [dict(name='div', attrs={'class':'advert'})] - - keep_only_tags = [ - dict(name='h1', attrs={'class':'title'}), - dict(name='div', attrs={'class':'page_papier_detail'}), + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in + [ + (r'', lambda match : '

'+match.group(1)+'

'), + (r'\'', lambda match: '’') ] + ] - def preprocess_html(self,soup): - for title in soup.findAll('div', {'class':'titre'}): - tag = Tag(soup, 'h3') - title.replaceWith(tag) - tag.insert(0,title) - return soup + remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ] + + def print_version(self, url): + raw = self.browser.open(url).read() + soup = BeautifulSoup(raw.decode('utf8', 'replace')) + link = soup.find('a', {'title':'Imprimer'}) + if link is None: + return None + return link['href'] # -- Handle login @@ -76,4 +55,3 @@ class Mediapart(BasicNewsRecipe): br['pass'] = self.password br.submit() return br - diff --git a/recipes/naczytniki.recipe b/recipes/naczytniki.recipe index e4769d58bc..2ae6bc391e 100644 --- a/recipes/naczytniki.recipe +++ b/recipes/naczytniki.recipe @@ -1,8 +1,9 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class naczytniki(BasicNewsRecipe): title = u'naczytniki.pl' __author__ = 'fenuks' + masthead_url= 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' language = 'pl' description ='everything about e-readers' @@ -10,6 +11,7 @@ class naczytniki(BasicNewsRecipe): no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'


Zobacz także:

.*?', re.DOTALL), lambda match: '') ] remove_tags_after= dict(name='div', attrs={'class':'sociable'}) keep_only_tags=[dict(name='div', attrs={'class':'post'})] remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})] diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe index 74534f3346..ec556da5fa 100644 --- a/recipes/nowa_fantastyka.recipe +++ b/recipes/nowa_fantastyka.recipe @@ -1,21 +1,33 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe +import re + class Nowa_Fantastyka(BasicNewsRecipe): title = u'Nowa Fantastyka' oldest_article = 7 __author__ = 'fenuks' + __modified_by__ = 'zaslav' language = 'pl' encoding='latin2' description ='site for fantasy readers' category='fantasy' + masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg' + #extra_css='.tytul {font-size: 20px;}' #not working max_articles_per_feed = 100 INDEX='http://www.fantastyka.pl/' no_stylesheets=True needs_subscription = 'optional' - remove_tags_before=dict(attrs={'class':'belka1-tlo-md'}) + remove_tags_before=dict(attrs={'class':'naglowek2'}) #remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'}) - remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'}) - remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})] + remove_tags_after=dict(name='form', attrs={'name':'form1'}) + remove_tags=[dict(attrs={'class':['avatar2', 'belka-margin', 'naglowek2']}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'}), 
dict(name='form')] + preprocess_regexps = [ + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: '')] + + + def find_articles(self, url): articles = [] @@ -41,10 +53,10 @@ class Nowa_Fantastyka(BasicNewsRecipe): return feeds + def get_cover_url(self): - soup = self.index_to_soup('http://www.fantastyka.pl/1.html') - cover=soup.find(name='img', attrs={'class':'okladka'}) - self.cover_url=self.INDEX+ cover['src'] + soup = self.index_to_soup('http://www.e-kiosk.pl/nowa_fantastyka') + self.cover_url='http://www.e-kiosk.pl' + soup.find(name='a', attrs={'class':'img'})['href'] return getattr(self, 'cover_url', self.cover_url) def get_browser(self): @@ -56,3 +68,18 @@ class Nowa_Fantastyka(BasicNewsRecipe): br['pass'] = self.password br.submit() return br + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(font=True): + del item['font'] + for item in soup.findAll(align=True): + del item['align'] + for item in soup.findAll(name='tr'): + item.name='div' + title=soup.find(attrs={'class':'tytul'}) + if title: + title['style']='font-size: 20px; font-weight: bold;' + self.log.warn(soup) + return soup diff --git a/recipes/oclab_pl.recipe b/recipes/oclab_pl.recipe new file mode 100644 index 0000000000..b0df89ba72 --- /dev/null +++ b/recipes/oclab_pl.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OCLab(BasicNewsRecipe): + title = u'OCLab.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Portal OCLab.pl jest miejscem przyjaznym pasjonatom sprzętu komputerowego, w szczególności overclockerom, które będzie służyć im za aktualną bazę wiedzy o podkręcaniu komputera, źródło aktualnych informacji z rynku oraz opinii na temat sprzętu komputerowego.' 
+ category = 'IT' + language = 'pl' + cover_url= 'http://www.idealforum.ru/attachment.php?attachmentid=7963&d=1316008118' + no_stylesheets = True + keep_only_tags=[dict(id='main')] + remove_tags_after= dict(attrs={'class':'single-postmetadata'}) + remove_tags=[dict(attrs={'class':['single-postmetadata', 'pagebar']})] + feeds = [(u'Wpisy', u'http://oclab.pl/feed/')] + + + def append_page(self, soup, appendtag): + tag=soup.find(attrs={'class':'contentjumpddl'}) + if tag: + nexturl=tag.findAll('option') + for nextpage in nexturl[1:-1]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(attrs={'class':'single-entry'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}): + r.extract() + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/overclock_pl.recipe b/recipes/overclock_pl.recipe new file mode 100644 index 0000000000..d7f4c8093d --- /dev/null +++ b/recipes/overclock_pl.recipe @@ -0,0 +1,37 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +class Overclock_pl(BasicNewsRecipe): + title = u'Overclock.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal poświęcony tematyce hardware, kładący największy nacisk na podkręcanie / overclocking (włącznie z extreme) i chłodzenie / cooling (air cooling, water cooling, freon cooling, dry ice, liquid nitrogen).' + category = 'IT' + language = 'pl' + masthead_url='http://www.overclock.pl/gfx/logo_m.png' + cover_url='http://www.overclock.pl/gfx/logo_m.png' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'Komentarze do aktualności:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Nawigacja

', re.DOTALL), lambda match: '') ] + keep_only_tags=[dict(name='div', attrs={'class':'news'}), dict(id='articleContent')] + remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})] + feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')] + + + def append_page(self, soup, appendtag): + tag=soup.find(id='navigation') + if tag: + nexturl=tag.findAll('option') + tag.extract() + for nextpage in nexturl[2:]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(id='content') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(attrs={'alt':'Pierwsza'}) + if rem: + rem.parent.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/palmtop_pl.recipe b/recipes/palmtop_pl.recipe new file mode 100644 index 0000000000..ace772e7e7 --- /dev/null +++ b/recipes/palmtop_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class palmtop_pl(BasicNewsRecipe): + title = u'Palmtop.pl' + __author__ = 'fenuks' + description = 'wortal technologii mobilnych' + category = 'mobile' + language = 'pl' + cover_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + masthead_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + + feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')] diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe new file mode 100644 index 0000000000..faefeb25c0 --- /dev/null +++ b/recipes/pc_arena.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Arena(BasicNewsRecipe): + title = u'PCArena' + oldest_article = 18300 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Najnowsze 
informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.' + category = 'IT' + language = 'pl' + masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif' + cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif' + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})] + remove_tags=[dict(attrs={'class':'pages'})] + feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pagNum'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[1:]: + nextpage= 'http://pcarena.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(attrs={'class':'artBody'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_centre_pl.recipe b/recipes/pc_centre_pl.recipe new file mode 100644 index 0000000000..68a17888ce --- /dev/null +++ b/recipes/pc_centre_pl.recipe @@ -0,0 +1,41 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Centre(BasicNewsRecipe): + title = u'PC Centre' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. a także opisy produktów związanych z komputerami.' 
+ category = 'IT' + language = 'pl' + masthead_url= 'http://pccentre.pl/views/images/logo.gif' + cover_url= 'http://pccentre.pl/views/images/logo.gif' + no_stylesheets = True + keep_only_tags= [dict(id='content')] + remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')] + feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n§ion=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n§ion=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n§ion=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n§ion=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n§ion=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n§ion=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n§ion=9')] + + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pages'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[:-1]: + nextpage= 'http://pccentre.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(id='content') + rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']}) + for r in rem: + r.extract() + rem=pagetext.findAll(id='comments') + for r in rem: + r.extract() + rem=pagetext.findAll('h1') + for r in rem: + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_foster.recipe b/recipes/pc_foster.recipe new file mode 100644 index 0000000000..ab8c2b66b1 --- /dev/null +++ b/recipes/pc_foster.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Foster(BasicNewsRecipe): + title = 
u'PC Foster' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.' + category = 'IT' + language = 'pl' + masthead_url='http://pcfoster.pl/public/images/logo.png' + cover_url= 'http://pcfoster.pl/public/images/logo.png' + no_stylesheets= True + remove_empty_feeds= True + keep_only_tags= [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})] + remove_tags=[dict(name='p', attrs={'class':'right'})] + feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'alt':u'Następna strona'}) + if nexturl: + appendtag.find(attrs={'class':'pager more_top'}).extract() + while nexturl: + nexturl='http://pcfoster.pl' + nexturl.parent['href'] + soup2 = self.index_to_soup(nexturl) + nexturl=soup2.find(attrs={'alt':u'Następna strona'}) + pagetext = soup2.find(attrs={'class':'content'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'review_content double'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/polska_times.recipe b/recipes/polska_times.recipe new file mode 100644 index 0000000000..4126576fe2 --- /dev/null +++ b/recipes/polska_times.recipe @@ -0,0 +1,81 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Polska_times(BasicNewsRecipe): + title = u'Polska Times' + __author__ = 'fenuks' + description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.' 
+ category = 'newspaper' + language = 'pl' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds= True + no_stylesheets = True + preprocess_regexps = [(re.compile(ur'Czytaj także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur',Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Zobacz także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TEŻ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ WIĘCEJ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TAKŻE:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: ''), (re.compile(ur'Nasze serwisy:.*', re.DOTALL), lambda match: '') ] + keep_only_tags= [dict(id=['tytul-artykulu', 'kontent'])] + remove_tags_after= dict(id='material-tagi') + remove_tags=[dict(attrs={'id':'reklama_srodtekst_0'}), dict(attrs={'id':'material-tagi'}), dict(name='div', attrs={'class':'zakladki'}), dict(attrs={'title':u'CZYTAJ TAKŻE'}), dict(attrs={'id':'podobne'}), dict(name='a', attrs={'href':'http://www.dzienniklodzki.pl/newsletter'})] + feeds = [(u'Fakty', u'http://polskatimes.feedsportal.com/c/32980/f/533648/index.rss'), (u'Opinie', u'http://www.polskatimes.pl/rss/opinie.xml'), (u'Sport', u'http://polskatimes.feedsportal.com/c/32980/f/533649/index.rss'), (u'Pieni\u0105dze', u'http://polskatimes.feedsportal.com/c/32980/f/533657/index.rss'), (u'Twoje finanse', u'http://www.polskatimes.pl/rss/twojefinanse.xml'), (u'Kultura', u'http://polskatimes.feedsportal.com/c/32980/f/533650/index.rss'), (u'Dodatki', u'http://www.polskatimes.pl/rss/dodatki.xml')] + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def append_page(self, soup, appendtag): + nexturl=soup.find(id='nastepna_strona') + while nexturl: + soup2= self.index_to_soup(nexturl['href']) + nexturl=soup2.find(id='nastepna_strona') + pagetext = soup2.find(id='tresc') + for dictionary in self.remove_tags: + v=pagetext.findAll(attrs=dictionary['attrs']) + for delete in v: + delete.extract() + for b in pagetext.findAll(name='b'): + if b.string: + if u'CZYTAJ TEŻ' in b.string or u'Czytaj także' in b.string or u'Czytaj też' in b.string or u'Zobacz także' in b.string: + b.extract() + for center in 
pagetext.findAll(name='center'): + if center.h4: + if center.h4.a: + center.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}): + paginator.extract() + + def image_article(self, soup, appendtag): + nexturl=soup.find('a', attrs={'class':'nastepna'}) + urls=[] + while nexturl: + if nexturl not in urls: + urls.append(nexturl) + else: + break + soup2= self.index_to_soup('http://www.polskatimes.pl/artykul/' + nexturl['href']) + nexturl=soup2.find('a', attrs={'class':'nastepna'}) + if nexturl in urls: + break; + pagetext = soup2.find(id='galeria-material') + pos = len(appendtag.contents) + appendtag.insert(pos, '
') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for rem in appendtag.findAll(attrs={'class':['galeriaNawigator', 'miniaturyPojemnik']}): + rem.extract() + for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}): + paginator.extract() + + def preprocess_html(self, soup): + if soup.find('a', attrs={'class':'nastepna'}): + self.image_article(soup, soup.body) + elif soup.find(id='nastepna_strona'): + self.append_page(soup, soup.body) + return soup + + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/pure_pc.recipe b/recipes/pure_pc.recipe new file mode 100644 index 0000000000..7a6c43bb7e --- /dev/null +++ b/recipes/pure_pc.recipe @@ -0,0 +1,33 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PurePC(BasicNewsRecipe): + title = u'PurePC' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Artykuły, aktualności, sprzęt, forum, chłodzenie, modding, urządzenia mobilne - wszystko w jednym miejscu.' 
+ category = 'IT' + language = 'pl' + masthead_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' + cover_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' + no_stylesheets = True + keep_only_tags= [dict(id='content')] + remove_tags_after= dict(attrs={'class':'fivestar-widget'}) + remove_tags= [dict(id='navigator'), dict(attrs={'class':['box-tools', 'fivestar-widget', 'PageMenuList']})] + feeds = [(u'Wiadomo\u015bci', u'http://www.purepc.pl/node/feed')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'class':'pager-next'}) + if nexturl: + while nexturl: + soup2 = self.index_to_soup('http://www.purepc.pl'+ nexturl.a['href']) + nexturl=soup2.find(attrs={'class':'pager-next'}) + pagetext = soup2.find(attrs={'class':'article'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['PageMenuList', 'pager', 'fivestar-widget']}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index d06e32d9af..f4c1efa9b8 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -1,14 +1,16 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Tablety_pl(BasicNewsRecipe): title = u'Tablety.pl' __author__ = 'fenuks' description = u'tablety.pl - latest tablet news' + masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'

Przeczytaj także.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

Przeczytaj koniecznie.*?

', re.DOTALL), lambda match: '')] remove_tags_before=dict(name="h1", attrs={'class':'entry-title'}) remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'}) remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})] diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe new file mode 100644 index 0000000000..666cb8aa77 --- /dev/null +++ b/recipes/tanuki.recipe @@ -0,0 +1,37 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class tanuki(BasicNewsRecipe): + title = u'Tanuki' + oldest_article = 7 + __author__ = 'fenuks' + category = 'anime, manga' + language = 'pl' + max_articles_per_feed = 100 + encoding='utf-8' + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}' + preprocess_regexps = [(re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'', re.DOTALL), lambda match: '')] + remove_empty_feeds= True + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['animename', 'storyname', 'nextarrow','sideinfov', 'sidelinfov', 'sideinfo', 'sidelinfo']}), dict(name='table', attrs={'summary':'Technikalia'}), dict(attrs={'class':['chaptername','copycat']}), dict(id='rightcolumn'), dict(attrs={'class':['headn_tt', 'subtable']})] + remove_tags=[dict(name='div', attrs={'class':'screen'}), dict(id='randomtoplist'), dict(attrs={'class':'note'})] + feeds = [(u'Anime', u'http://anime.tanuki.pl/rss_anime.xml'), (u'Manga', u'http://manga.tanuki.pl/rss_manga.xml'), (u'Tomiki', u'http://manga.tanuki.pl/rss_mangabooks.xml'), (u'Artyku\u0142y', u'http://czytelnia.tanuki.pl/rss_czytelnia_artykuly.xml'), (u'Opowiadania', u'http://czytelnia.tanuki.pl/rss_czytelnia.xml')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'class':'nextarrow'}) + if nexturl: + while nexturl: + soup2 = self.index_to_soup('http://czytelnia.tanuki.pl'+ nexturl['href']) + nexturl=soup2.find(attrs={'class':'nextarrow'}) + pagetext = soup2.find(attrs={'class':['chaptername', 'copycat']}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'copycat'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'nextarrow'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe index 5699ec106c..80b37f329a 100644 --- a/recipes/the_sun.recipe +++ b/recipes/the_sun.recipe @@ -1,49 +1,57 @@ import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag +from calibre.web.feeds.recipes import BasicNewsRecipe -class AdvancedUserRecipe1268409464(BasicNewsRecipe): - title = u'The Sun' - __author__ = 
'Chaz Ralph' - description = 'News from The Sun' +class AdvancedUserRecipe1325006965(BasicNewsRecipe): + + title = u'The Sun UK' + cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png' + + description = 'A Recipe for The Sun tabloid UK - uses feed43' + __author__ = 'Dave Asbury' + # last updated 20/2/12 + language = 'en_GB' oldest_article = 1 - max_articles_per_feed = 100 - language = 'en' + max_articles_per_feed = 15 + remove_empty_feeds = True no_stylesheets = True - extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' - encoding= 'iso-8859-1' - remove_javascript = True + + masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif' + encoding = 'cp1251' + + encoding = 'cp1252' + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + extra_css = ''' + body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} + ''' + + preprocess_regexps = [ + (re.compile(r'