From 672b991af464caba79693b9990238a84d6aed8c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?=
Date: Thu, 21 Mar 2013 00:27:17 +0100
Subject: [PATCH 1/7] update esensja and add rss version

---
 recipes/esenja.recipe           | 210 +++++++++++++++++++++-----------
 recipes/esensja_(rss).recipe    | 109 +++++++++++++++++
 recipes/icons/esenja.png        | Bin 0 -> 329 bytes
 recipes/icons/esensja_(rss).png | Bin 0 -> 329 bytes
 4 files changed, 248 insertions(+), 71 deletions(-)
 create mode 100644 recipes/esensja_(rss).recipe
 create mode 100644 recipes/icons/esenja.png
 create mode 100644 recipes/icons/esensja_(rss).png

diff --git a/recipes/esenja.recipe b/recipes/esenja.recipe
index b8b94ad66e..503b27b9fa 100644
--- a/recipes/esenja.recipe
+++ b/recipes/esenja.recipe
@@ -3,85 +3,153 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010, matek09, matek09@gmail.com'
 
-from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment
 
 class Esensja(BasicNewsRecipe):
 
-    title = u'Esensja'
-    __author__ = 'matek09'
-    description = 'Monthly magazine'
-    encoding = 'utf-8'
-    no_stylesheets = True
-    language = 'pl'
-    remove_javascript = True
-    HREF = '0'
+    title = u'Esensja'
+    __author__ = 'matek09 & fenuks'
+    description = 'Magazyn kultury popularnej'
+    encoding = 'utf-8'
+    no_stylesheets = True
+    language = 'pl'
+    remove_javascript = True
+    masthead_url = 'http://esensja.pl/img/wrss.gif'
+    oldest_article = 1
+    URL = 'http://esensja.pl'
+    HREF = '0'
+    remove_attributes = ['style', 'bgcolor', 'alt', 'color']
+    keep_only_tags = [dict(attrs={'class':'sekcja'}), ]
+    #keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'})
+    #remove_tags_before = dict(dict(name = 'div', attrs = {'class' : 't-title'}))
+    remove_tags_after = dict(id='tekst')
 
-    #keep_only_tags =[]
-    #keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'})
-    remove_tags_before = dict(dict(name = 'div', attrs = {'class' : 't-title'}))
-    remove_tags_after = dict(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
+    remove_tags = [dict(name = 'img', attrs = {'src' : ['../../../2000/01/img/tab_top.gif', '../../../2000/01/img/tab_bot.gif']}),
+                   dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}),
+                   #dict(attrs={'rel':'lightbox[galeria]'})
+                   dict(attrs={'class':['tekst_koniec', 'ref', 'wykop']}),
+                   dict(attrs={'itemprop':['copyrightHolder', 'publisher']}),
+                   dict(id='komentarze')
+
+                   ]
 
-    remove_tags =[]
-    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_top.gif'}))
-    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}))
 
+    extra_css = '''
+                .t-title {font-size: x-large; font-weight: bold; text-align: left}
+                .t-author {font-size: x-small; text-align: left}
+                .t-title2 {font-size: x-small; font-style: italic; text-align: left}
+                .text {font-size: small; text-align: left}
+                .annot-ref {font-style: italic; text-align: left}
+                '''
 
-    extra_css = '''
-        .t-title {font-size: x-large; font-weight: bold; text-align: left}
-        .t-author {font-size: x-small; text-align: left}
-        .t-title2 {font-size: x-small; font-style: italic; text-align: left}
-        .text {font-size: small; text-align: left}
-        .annot-ref {font-style: italic; text-align: left}
-        '''
 
+    preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
+                          (re.compile(ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
+                          ]
 
-    preprocess_regexps = [(re.compile(r'alt="[^"]*"'),
-        lambda match: '')]
 
+    def parse_index(self):
+        soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
+        a = soup.find('a', attrs={'href' : re.compile('.*/index.html')})
+        year = a['href'].split('/')[0]
+        month = a['href'].split('/')[1]
+        self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
+        soup = self.index_to_soup(self.HREF + '01.html')
+        self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg'
+        feeds = []
+        chapter = ''
+        subchapter = ''
+        articles = []
+        intro = soup.find('div', attrs={'class' : 'n-title'})
+        '''
+        introduction = {'title' : self.tag_to_string(intro.a),
+                        'url' : self.HREF + intro.a['href'],
+                        'date' : '',
+                        'description' : ''}
+        chapter = 'Wprowadzenie'
+        articles.append(introduction)
+        '''
+
+        for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
+            if tag.name in 'td':
+                if len(articles) > 0:
+                    section = chapter
+                    if len(subchapter) > 0:
+                        section += ' - ' + subchapter
+                    feeds.append((section, articles))
+                    articles = []
+                if tag['class'] == 'chapter':
+                    chapter = self.tag_to_string(tag).capitalize()
+                    subchapter = ''
+                else:
+                    subchapter = self.tag_to_string(tag)
+                    subchapter = self.tag_to_string(tag)
+                continue
+
+            finalurl = tag.a['href']
+            if not finalurl.startswith('http'):
+                finalurl = self.HREF + finalurl
+            articles.append({'title' : self.tag_to_string(tag.a), 'url' : finalurl, 'date' : '', 'description' : ''})
+
+            a = self.index_to_soup(finalurl)
+            i = 1
+
+            while True:
+                div = a.find('div', attrs={'class' : 't-title2 nextpage'})
+                if div is not None:
+                    link = div.a['href']
+                    if not link.startswith('http'):
+                        link = self.HREF + link
+                    a = self.index_to_soup(link)
+                    articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. ' + str(i), 'url' : link, 'date' : '', 'description' : ''})
+                    i = i + 1
+                else:
+                    break
 
-    def parse_index(self):
-        soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
-        a = soup.find('a', attrs={'href' : re.compile('.*/index.html')})
-        year = a['href'].split('/')[0]
-        month = a['href'].split('/')[1]
-        self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
-        soup = self.index_to_soup(self.HREF + '01.html')
-        self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg'
-        feeds = []
-        intro = soup.find('div', attrs={'class' : 'n-title'})
-        introduction = {'title' : self.tag_to_string(intro.a),
-                        'url' : self.HREF + intro.a['href'],
-                        'date' : '',
-                        'description' : ''}
-        chapter = 'Wprowadzenie'
-        subchapter = ''
-        articles = []
-        articles.append(introduction)
-        for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
-            if tag.name in 'td':
-                if len(articles) > 0:
-                    section = chapter
-                    if len(subchapter) > 0:
-                        section += ' - ' + subchapter
-                    feeds.append((section, articles))
-                    articles = []
-                if tag['class'] == 'chapter':
-                    chapter = self.tag_to_string(tag).capitalize()
-                    subchapter = ''
-                else:
-                    subchapter = self.tag_to_string(tag)
-                    subchapter = self.tag_to_string(tag)
-                continue
-            articles.append({'title' : self.tag_to_string(tag.a), 'url' : self.HREF + tag.a['href'], 'date' : '', 'description' : ''})
 
+        return feeds
 
-            a = self.index_to_soup(self.HREF + tag.a['href'])
-            i = 1
-            while True:
-                div = a.find('div', attrs={'class' : 't-title2 nextpage'})
-                if div is not None:
-                    a = self.index_to_soup(self.HREF + div.a['href'])
-                    articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. ' + str(i), 'url' : self.HREF + div.a['href'], 'date' : '', 'description' : ''})
-                    i = i + 1
-                else:
-                    break
 
+    def append_page(self, soup, appendtag):
+        r = appendtag.find(attrs={'class':'wiecej_xxx'})
+        if r:
+            nr = r.findAll(attrs={'class':'tn-link'})[-1]
+            try:
+                nr = int(nr.a.string)
+            except:
+                return
+            baseurl = soup.find(attrs={'property':'og:url'})['content'] + '&strona={0}'
+            for number in range(2, nr+1):
+                soup2 = self.index_to_soup(baseurl.format(number))
+                pagetext = soup2.find(attrs={'class':'tresc'})
+                pos = len(appendtag.contents)
+                appendtag.insert(pos, pagetext)
+            for r in appendtag.findAll(attrs={'class':['wiecej_xxx', 'tekst_koniec']}):
+                r.extract()
+            for r in appendtag.findAll('script'):
+                r.extract()
+
+            comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        for tag in soup.findAll(attrs={'class':'img_box_right'}):
+            temp = tag.find('img')
+            src = ''
+            if temp:
+                src = temp.get('src', '')
+            for r in tag.findAll('a', recursive=False):
+                r.extract()
+            info = tag.find(attrs={'class':'img_info'})
+            text = str(tag)
+            if not src:
+                src = re.search('src="[^"]*?"', text)
+                if src:
+                    src = src.group(0)
+                    src = src[5:].replace('//', '/')
+            if src:
+                tag.contents = []
+                tag.insert(0, BeautifulSoup('<img src="{0}{1}" />'.format(self.URL, src)))
+            if info:
+                tag.insert(len(tag.contents), info)
+        return soup
 
-    return feeds
diff --git a/recipes/esensja_(rss).recipe b/recipes/esensja_(rss).recipe
new file mode 100644
index 0000000000..af23ea58a9
--- /dev/null
+++ b/recipes/esensja_(rss).recipe
@@ -0,0 +1,109 @@
+__license__ = 'GPL v3'
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment
+
+class EsensjaRSS(BasicNewsRecipe):
+    title = u'Esensja (RSS)'
+    __author__ = 'fenuks'
+    description = u'Magazyn kultury popularnej'
+    category = 'reading, fantasy, reviews, boardgames, culture'
+    #publication_type = ''
+    language = 'pl'
+    encoding = 'utf-8'
+    INDEX = 'http://www.esensja.pl'
+    extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
+                   .t-author {font-size: x-small; text-align: left}
+                   .t-title2 {font-size: x-small; font-style: italic; text-align: left}
+                   .text {font-size: small; text-align: left}
+                   .annot-ref {font-style: italic; text-align: left}
+                   '''
+    cover_url = ''
+    masthead_url = 'http://esensja.pl/img/wrss.gif'
+    use_embedded_content = False
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    remove_javascript = True
+    ignore_duplicate_articles = {'title', 'url'}
+    preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
+                          (re.compile(ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
+                          ]
+    remove_attributes = ['style', 'bgcolor', 'alt', 'color']
+    keep_only_tags = [dict(attrs={'class':'sekcja'}), ]
+    remove_tags_after = dict(id='tekst')
+
+    remove_tags = [dict(name = 'img', attrs = {'src' : ['../../../2000/01/img/tab_top.gif', '../../../2000/01/img/tab_bot.gif']}),
+                   dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}),
+                   #dict(attrs={'rel':'lightbox[galeria]'})
+                   dict(attrs={'class':['tekst_koniec', 'ref', 'wykop']}),
+                   dict(attrs={'itemprop':['copyrightHolder', 'publisher']}),
+                   dict(id='komentarze')
+                   ]
+
+    feeds = [(u'Książka', u'http://esensja.pl/rss/ksiazka.rss'),
+             (u'Film', u'http://esensja.pl/rss/film.rss'),
+             (u'Komiks', u'http://esensja.pl/rss/komiks.rss'),
+             (u'Gry', u'http://esensja.pl/rss/gry.rss'),
+             (u'Muzyka', u'http://esensja.pl/rss/muzyka.rss'),
+             (u'Twórczość', u'http://esensja.pl/rss/tworczosc.rss'),
+             (u'Varia', u'http://esensja.pl/rss/varia.rss'),
+             (u'Zgryźliwi Tetrycy', u'http://esensja.pl/rss/tetrycy.rss'),
+             (u'Nowe książki', u'http://esensja.pl/rss/xnowosci.rss'),
+             (u'Ostatnio dodane książki', u'http://esensja.pl/rss/xdodane.rss'),
+             ]
+
+    def get_cover_url(self):
+        soup = self.index_to_soup(self.INDEX)
+        cover = soup.find(id='panel_1')
+        self.cover_url = self.INDEX + cover.find('a')['href'].replace('index.html', '') + 'img/ilustr/cover_b.jpg'
+        return getattr(self, 'cover_url', self.cover_url)
+
+
+    def append_page(self, soup, appendtag):
+        r = appendtag.find(attrs={'class':'wiecej_xxx'})
+        if r:
+            nr = r.findAll(attrs={'class':'tn-link'})[-1]
+            try:
+                nr = int(nr.a.string)
+            except:
+                return
+            baseurl = soup.find(attrs={'property':'og:url'})['content'] + '&strona={0}'
+            for number in range(2, nr+1):
+                soup2 = self.index_to_soup(baseurl.format(number))
+                pagetext = soup2.find(attrs={'class':'tresc'})
+                pos = len(appendtag.contents)
+                appendtag.insert(pos, pagetext)
+            for r in appendtag.findAll(attrs={'class':['wiecej_xxx', 'tekst_koniec']}):
+                r.extract()
+            for r in appendtag.findAll('script'):
+                r.extract()
+
+            comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
+
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        for tag in soup.findAll(attrs={'class':'img_box_right'}):
+            temp = tag.find('img')
+            src = ''
+            if temp:
+                src = temp.get('src', '')
+            for r in tag.findAll('a', recursive=False):
+                r.extract()
+            info = tag.find(attrs={'class':'img_info'})
+            text = str(tag)
+            if not src:
+                src = re.search('src="[^"]*?"', text)
+                if src:
+                    src = src.group(0)
+                    src = src[5:].replace('//', '/')
+            if src:
+                tag.contents = []
+                tag.insert(0, BeautifulSoup('<img src="{0}{1}" />'.format(self.INDEX, src)))
+            if info:
+                tag.insert(len(tag.contents), info)
+        return soup
diff --git a/recipes/icons/esenja.png b/recipes/icons/esenja.png
new file mode 100644
index 0000000000000000000000000000000000000000..185e46ea95f03b9908aed609d485469399e8e7fa
GIT binary patch
literal 329
zcmV-P0k-~$P){kjl(e?Ql$NYiT6~zoJ0$=B
z00DGTPE!Ct=GbNc005CmL_t&-l~vE%4#FT1MNuv>M4+Wu0dctX|9`Rv;|m)|CX1(G
zEG#h=`chy)E&)_hDdp;`_vui8Qi}4QUl;*Gi2R;9T6!x}7-A
zIRMefvn^-h1hCfTXSDwSt+mo65pj+Yu{kjl(e?Ql$NYiT6~zoJ0$=B
z00DGTPE!Ct=GbNc005CmL_t&-l~vE%4#FT1MNuv>M4+Wu0dctX|9`Rv;|m)|CX1(G
zEG#h=`chy)E&)_hDdp;`_vui8Qi}4QUl;*Gi2R;9T6!x}7-A
zIRMefvn^-h1hCfTXSDwSt+mo65pj+Yu
Date: Thu, 21 Mar 2013 00:36:09 +0100
Subject: [PATCH 2/7] add blog_biszopa

---
 recipes/blog_biszopa.recipe    |  30 ++++++++++++++++++++++++++++++
 recipes/icons/blog_biszopa.png | Bin 0 -> 755 bytes
 2 files changed, 30 insertions(+)
 create mode 100644 recipes/blog_biszopa.recipe
 create mode 100644 recipes/icons/blog_biszopa.png

diff --git a/recipes/blog_biszopa.recipe b/recipes/blog_biszopa.recipe
new file mode 100644
index 0000000000..7393f23f3b
--- /dev/null
+++ b/recipes/blog_biszopa.recipe
@@ -0,0 +1,30 @@
+__license__ = 'GPL v3'
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BlogBiszopa(BasicNewsRecipe):
+    title = u'Blog Biszopa'
+    __author__ = 'fenuks'
+    description = u'Zapiski z Granitowego Miasta'
+    category = 'history'
+    #publication_type = ''
+    language = 'pl'
+    #encoding = ''
+    #extra_css = ''
+    cover_url = 'http://blogbiszopa.pl/wp-content/themes/biszop/images/logo.png'
+    masthead_url = ''
+    use_embedded_content = False
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    remove_javascript = True
+    remove_attributes = ['style', 'font']
+    ignore_duplicate_articles = {'title', 'url'}
+
+    keep_only_tags = [dict(id='main-content')]
+    remove_tags = [dict(name='footer')]
+    #remove_tags_after = {}
+    #remove_tags_before = {}
+
+    feeds = [(u'Artyku\u0142y', u'http://blogbiszopa.pl/feed/')]
+
diff --git a/recipes/icons/blog_biszopa.png b/recipes/icons/blog_biszopa.png
new file mode 100644
index 0000000000000000000000000000000000000000..eaba074cde3292f57b7f885fe740b86b0c5a8ef9
GIT binary patch
literal 755
zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbKJOS+@4BLl<6e(pbstUx|vage(c
z!@6@aFM%9|WRD45bDP46hOx7_4S6Fo+k-*%fF5l=v3l
6XN>+|Np-qK75)o^-XH}hnCit?jG+-%YRt*JX;ZFkhif2)+-{kik5_xA0wvbi&7?xVHq9
Date: Thu, 21 Mar 2013 00:36:54 +0100
Subject: [PATCH 3/7] add websecurity

---
 recipes/icons/websecurity_pl.png | Bin 0 -> 863 bytes
 recipes/websecurity_pl.recipe    |  28 ++++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 recipes/icons/websecurity_pl.png
 create mode 100644 recipes/websecurity_pl.recipe

diff --git a/recipes/icons/websecurity_pl.png b/recipes/icons/websecurity_pl.png
new file mode 100644
index 0000000000000000000000000000000000000000..32eff82072ebbbfefb6f6f964a0be47f8614ff07
GIT binary patch
literal 863
zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbKJOS+@4BLl<6e(pbstUx|vage(c
z!@6@aFM%9|WRD45bDP46hOx7_4S6Fo+k-*%fF5)aMo8
6XN>+|Npo0DsO;DPUWql>X*`rKUc54v#@z@Zu7Oa;jMxiL>5Rwk%HPAIpw!N#So;R
zijb05hMFS>WT=7+1nYXEp!$9O!ndkgKv}3_pb7FSe;+@2XK03`9Vn~%!Q20>f+|qU
z2Ooc+)Eh;$ch(N?Y#iPys=asf`nF)vpJ&hiJb(Uu;bNe&uRZ<$zI^=$1Yf`Y{qp5&
z?}Q(lw*r0i@AKy`^^G5b!@nRNVLFD}QX?@i997y}S3f8MFR={R#||A8XhD`|;z~@e}`k
{rdm!-;Z@0fZ_ACyYJ7#M_+sT-WgkbsjYu!V)4jKs0J9^CM7|B!Hh}g>hzcx+1QzM
zYtJU9Jx~eRx-CTce){*zKYw1fJoe+56_cRfrf-}48QE=Depq44qNBsIZso@epnk?A
zZ+91l4pvzYAcwQSBeIx*fm;}a85w5HkpK!F_H=O!k+__kkdTs;mYACSJmK+^N6#Ln
zCL}yCHhd^9AR;6tC~9md*v{eM;WcxH$7zoPtUf+IXV3ULnuHuUbm`QsW7jS*sU@Vp
zfAHeTlNWE2*o6y9i@$&P^6A^g2F@S9ezLQ$v9=!wnbA^J)%vsD-R02Pj;_AW-tO{Z
z-v&+YU1
znp#Pl@|nLDoBRt(mj|;^
zy(WbqO+j|%>v!|Dr7
Date: Thu, 21 Mar 2013 00:56:45 +0100
Subject: [PATCH 4/7] various fixes

---
 recipes/adventure_zone_pl.recipe       | 25 ++++--------
 recipes/benchmark_pl.recipe            | 53 ++++++++++++++----------
 recipes/biweekly.recipe                |  2 +-
 recipes/ciekawostki_historyczne.recipe | 54 ++++++++++---------------
 recipes/computerworld_pl.recipe        | 16 ++++++--
 recipes/conowego_pl.recipe             |  8 +++-
 recipes/dzieje_pl.recipe               | 18 +++++----
 recipes/dziennik_pl.recipe             | 29 +++++++------
 recipes/dziennik_wschodni.recipe       |  6 +++
 recipes/echo_dnia.recipe               |  5 +++
 recipes/film_web.recipe                | 18 +++++++--
 recipes/gazeta_lubuska.recipe          |  5 +++
 recipes/gazeta_pomorska.recipe         |  5 +++
 recipes/gazeta_wspolczesna.recipe      |  5 +++
 recipes/gazeta_wyborcza.recipe         | 13 +++++-
 recipes/gcn.recipe                     |  7 +++-
 recipes/gildia_pl.recipe               | 11 ++---
 recipes/gram_pl.recipe                 |  1 +
 recipes/gry_online_pl.recipe           | 13 +++++-
 recipes/icons/historia_pl.png          | Bin 806 -> 869 bytes
 recipes/icons/nowy_obywatel.png        | Bin
 recipes/in4_pl.recipe                  | 17 +++++---
 recipes/infra_pl.recipe                |  4 +-
 recipes/kosmonauta_pl.recipe           |  2 +
 recipes/kurier_galicyjski.recipe       |  6 ++-
 recipes/kurier_poranny.recipe          |  6 +++
 recipes/linux_journal.recipe           |  6 ++-
 recipes/nto.recipe                     |  5 +++
 recipes/oclab_pl.recipe                |  6 +++
 recipes/overclock_pl.recipe            |  3 +-
 recipes/pc_foster.recipe               |  7 ++++
 recipes/pure_pc.recipe                 |  7 +++-
 recipes/ubuntu_pomoc_org.recipe        |  9 +++--
 33 files changed, 248 insertions(+), 124 deletions(-)
 mode change 100755 => 100644 recipes/icons/nowy_obywatel.png

diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe
index 2a6cf9957d..dd47af946a 100644
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@@ -21,35 +21,24 @@ class Adventure_zone(BasicNewsRecipe):
     extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
     feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
 
-    '''def parse_feeds (self):
-        feeds = BasicNewsRecipe.parse_feeds(self)
-        soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
-        tag=soup.find(name='channel')
-        titles=[]
-        for r in tag.findAll(name='image'):
-            r.extract()
-        art=tag.findAll(name='item')
-        for i in art:
-            titles.append(i.title.string)
-        for feed in feeds:
-            for article in feed.articles[:]:
-                article.title=titles[feed.articles.index(article)]
-        return feeds'''
-
-
     '''def get_cover_url(self):
         soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
         cover=soup.find(id='box_OstatninumerAZ')
        self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src']
        return getattr(self, 'cover_url', self.cover_url)'''
 
+
     def populate_article_metadata(self, article, soup, first):
         result = re.search('(.+) - Adventure Zone', soup.title.string)
         if result:
-            article.title = result.group(1)
+            result = result.group(1)
         else:
             result = soup.body.find('strong')
             if result:
-                article.title = result.string
+                result = result.string
+        if result:
+            result = result.replace('&amp;', '&')
+            result = result.replace('&#39;', '’')
+            article.title = result
 
     def skip_ad_pages(self, soup):
         skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe
index 95c5488a24..c934cc4ac4 100644
--- a/recipes/benchmark_pl.recipe
+++ b/recipes/benchmark_pl.recipe
@@ -1,5 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.ebooks.BeautifulSoup import Comment
+
 class BenchmarkPl(BasicNewsRecipe):
     title = u'Benchmark.pl'
     __author__ = 'fenuks'
@@ -13,10 +15,10 @@ class BenchmarkPl(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style']
     preprocess_regexps = [(re.compile(ur' Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')]
-    remove_tags_after=dict(name='div', attrs={'class':'body'})
-    remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb', 'footer', 'moreTopics']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
-    INDEX= 'http://www.benchmark.pl'
+    keep_only_tags = [dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')]
+    remove_tags_after = dict(id='article')
+    remove_tags = [dict(name='div', attrs={'class':['comments', 'body', 'kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb', 'footer', 'moreTopics']}), dict(name='table', attrs = {'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
+    INDEX = 'http://www.benchmark.pl'
 
     feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
              (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]
@@ -27,7 +29,12 @@ class BenchmarkPl(BasicNewsRecipe):
             soup2 = self.index_to_soup(nexturl['href'])
             nexturl = soup2.find(attrs={'class':'next'})
             pagetext = soup2.find(name='div', attrs={'class':'body'})
-            appendtag.find('div', attrs={'class':'k_ster'}).extract()
+            tag = appendtag.find('div', attrs={'class':'k_ster'})
+            if tag:
+                tag.extract()
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
         if appendtag.find('div', attrs={'class':'k_ster'}):
@@ -37,40 +44,44 @@ class BenchmarkPl(BasicNewsRecipe):
 
     def image_article(self, soup, appendtag):
-        nexturl=soup.find('div', attrs={'class':'preview'})
-        if nexturl is not None:
-            nexturl=nexturl.find('a', attrs={'class':'move_next'})
-            image=appendtag.find('div', attrs={'class':'preview'}).div['style'][16:]
-            image=self.INDEX + image[:image.find("')")]
+        nexturl = soup.find('div', attrs={'class':'preview'})
+        if nexturl:
+            nexturl = nexturl.find('a', attrs={'class':'move_next'})
+            image = appendtag.find('div', attrs={'class':'preview'}).div['style'][16:]
+            image = self.INDEX + image[:image.find("')")]
             appendtag.find(attrs={'class':'preview'}).name='img'
             appendtag.find(attrs={'class':'preview'})['src']=image
             appendtag.find('a', attrs={'class':'move_next'}).extract()
-            while nexturl is not None:
-                nexturl= self.INDEX + nexturl['href']
+            while nexturl:
+                nexturl = self.INDEX + nexturl['href']
                 soup2 = self.index_to_soup(nexturl)
-                nexturl=soup2.find('a', attrs={'class':'move_next'})
-                image=soup2.find('div', attrs={'class':'preview'}).div['style'][16:]
-                image=self.INDEX + image[:image.find("')")]
+                nexturl = soup2.find('a', attrs={'class':'move_next'})
+                image = soup2.find('div', attrs={'class':'preview'}).div['style'][16:]
+                image = self.INDEX + image[:image.find("')")]
                 soup2.find(attrs={'class':'preview'}).name='img'
                 soup2.find(attrs={'class':'preview'})['src']=image
-                pagetext=soup2.find('div', attrs={'class':'gallery'})
+                pagetext = soup2.find('div', attrs={'class':'gallery'})
                 pagetext.find('div', attrs={'class':'title'}).extract()
                 pagetext.find('div', attrs={'class':'thumb'}).extract()
                 pagetext.find('div', attrs={'class':'panelOcenaObserwowane'}).extract()
-                if nexturl is not None:
+                if nexturl:
                     pagetext.find('a', attrs={'class':'move_next'}).extract()
                 pagetext.find('a', attrs={'class':'move_back'}).extract()
+                comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+                for comment in comments:
+                    comment.extract()
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
 
-
     def preprocess_html(self, soup):
-        if soup.find('div', attrs={'class':'preview'}) is not None:
+        if soup.find('div', attrs={'class':'preview'}):
             self.image_article(soup, soup.body)
         else:
             self.append_page(soup, soup.body)
         for a in soup('a'):
-            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
-                a['href']=self.INDEX + a['href']
+            if a.has_key('href') and not a['href'].startswith('http'):
+                a['href'] = self.INDEX + a['href']
+        for r in soup.findAll(attrs={'class':['comments', 'body']}):
+            r.extract()
         return soup
diff --git a/recipes/biweekly.recipe b/recipes/biweekly.recipe
index e4df1919a1..a1bf41f621 100644
--- a/recipes/biweekly.recipe
+++ b/recipes/biweekly.recipe
@@ -14,7 +14,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class biweekly(BasicNewsRecipe):
     __author__ = u'Łukasz Grąbczewski'
     title = 'Biweekly'
-    language = 'en'
+    language = 'en_PL'
     publisher = 'National Audiovisual Institute'
     publication_type = 'magazine'
     description = u'link with culture [English edition of Polish magazine]: literature, theatre, film, art, music, views, talks'
diff --git a/recipes/ciekawostki_historyczne.recipe b/recipes/ciekawostki_historyczne.recipe
index 7c5138196d..2b20ee9e42 100644
--- a/recipes/ciekawostki_historyczne.recipe
+++ b/recipes/ciekawostki_historyczne.recipe
@@ -1,5 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.ebooks.BeautifulSoup import Comment
+
 class Ciekawostki_Historyczne(BasicNewsRecipe):
     title = u'Ciekawostki Historyczne'
     oldest_article = 7
@@ -7,42 +9,30 @@ class Ciekawostki_Historyczne(BasicNewsRecipe):
     description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.'
     category = 'history'
     language = 'pl'
-    masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
-    cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
+    masthead_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
+    cover_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
     max_articles_per_feed = 100
+    oldest_article = 140000
     preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Zobacz też:.*?', re.DOTALL), lambda match: '')]
-    no_stylesheets=True
-    remove_empty_feeds=True
-    keep_only_tags=[dict(name='div', attrs={'class':'post'})]
-    remove_tags=[dict(id='singlepostinfo')]
+    no_stylesheets = True
+    remove_empty_feeds = True
+    keep_only_tags = [dict(name='div', attrs={'class':'post'})]
+    recursions = 5
+    remove_tags = [dict(id='singlepostinfo')]
+
     feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')]
 
-    def append_page(self, soup, appendtag):
-        tag=soup.find(name='h7')
-        if tag:
-            if tag.br:
-                pass
-            elif tag.nextSibling.name=='p':
-                tag=tag.nextSibling
-            nexturl = tag.findAll('a')
-            for nextpage in nexturl:
-                tag.extract()
-                nextpage= nextpage['href']
-                soup2 = self.index_to_soup(nextpage)
-                pagetext = soup2.find(name='div', attrs={'class':'post'})
-                for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}):
-                    r.extract()
-                for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}):
-                    r.extract()
-                for r in pagetext.findAll('h1'):
-                    r.extract()
-                pagetext.find('h6').nextSibling.extract()
-                pagetext.find('h7').nextSibling.extract()
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
+    def is_link_wanted(self, url, tag):
+        return 'ciekawostkihistoryczne' in url and url[-2] in {'2', '3', '4', '5', '6'}
 
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
+    def postprocess_html(self, soup, first_fetch):
+        tag = soup.find('h7')
+        if tag:
+            tag.nextSibling.extract()
+        if not first_fetch:
+            for r in soup.findAll(['h1']):
+                r.extract()
+            soup.find('h6').nextSibling.extract()
         return soup
-
\ No newline at end of file
+
diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe
index e9aab68226..6d4e2a179f 100644
--- a/recipes/computerworld_pl.recipe
+++ b/recipes/computerworld_pl.recipe
@@ -1,5 +1,5 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Computerworld_pl(BasicNewsRecipe):
     title = u'Computerworld.pl'
@@ -12,8 +12,16 @@ class Computerworld_pl(BasicNewsRecipe):
     no_stylesheets = True
     oldest_article = 7
     max_articles_per_feed = 100
-    keep_only_tags = [dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})]
-    remove_tags_after = dict(name='div', attrs={'class':'rMobi'})
-    remove_tags = [dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
+    remove_attributes = ['style',]
+    preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''), (re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''),]
+    keep_only_tags = [dict(id=['szpaltaL', 's2011'])]
+    remove_tags_after = dict(name='div', attrs={'class':'tresc'})
+    remove_tags = [dict(attrs={'class':['nnav', 'rMobi', 'tagi', 'rec']}),]
 
     feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
 
+    def skip_ad_pages(self, soup):
+        if soup.title.string.lower() == 'advertisement':
+            tag = soup.find(name='a')
+            if tag:
+                new_soup = self.index_to_soup(tag['href'], raw=True)
+                return new_soup
\ No newline at end of file
diff --git a/recipes/conowego_pl.recipe b/recipes/conowego_pl.recipe
index f180a756b2..9b2f6e8200 100644
--- a/recipes/conowego_pl.recipe
+++ b/recipes/conowego_pl.recipe
@@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment
+
 class CoNowegoPl(BasicNewsRecipe):
     title = u'conowego.pl'
     __author__ = 'fenuks'
@@ -34,7 +35,10 @@ class CoNowegoPl(BasicNewsRecipe):
             pagetext = soup2.find(attrs={'class':'ni_content'})
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
-
+
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
         for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}):
             r.extract()
diff --git a/recipes/dzieje_pl.recipe b/recipes/dzieje_pl.recipe
index b74f18c006..57774698bb 100644
--- a/recipes/dzieje_pl.recipe
+++ b/recipes/dzieje_pl.recipe
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class Dzieje(BasicNewsRecipe):
     title = u'dzieje.pl'
@@ -11,8 +12,8 @@ class Dzieje(BasicNewsRecipe):
     index = 'http://dzieje.pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    remove_javascript=True
-    no_stylesheets= True
+    remove_javascript = True
+    no_stylesheets = True
     keep_only_tags = [dict(name='h1', attrs={'class':'title'}), dict(id='content-area')]
     remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory')]
     #feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
@@ -28,16 +29,19 @@ class Dzieje(BasicNewsRecipe):
             pagetext = soup2.find(id='content-area').find(attrs={'class':'content'})
             for r in pagetext.findAll(attrs={'class':['fieldgroup group-groupkul', 'fieldgroup group-zdjeciekult', 'fieldgroup group-zdjecieciekaw', 'fieldgroup group-zdjecieksiazka', 'fieldgroup group-zdjeciedu', 'field field-type-filefield field-field-zdjecieglownawyd']}):
                 r.extract()
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
             tag = soup2.find('li', attrs={'class':'pager-next'})
         for r in appendtag.findAll(attrs={'class':['item-list', 'field field-type-computed field-field-tagi', ]}):
             r.extract()
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
 
     def find_articles(self, url):
         articles = []
-        soup=self.index_to_soup(url)
-        tag=soup.find(id='content-area').div.div
+        soup = self.index_to_soup(url)
+        tag = soup.find(id='content-area').div.div
         for i in tag.findAll('div', recursive=False):
             temp = i.find(attrs={'class':'views-field-title'}).span.a
             title = temp.string
@@ -64,7 +68,7 @@ class Dzieje(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
         for a in soup('a'):
-            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
-                a['href']=self.index + a['href']
+            if a.has_key('href') and not a['href'].startswith('http'):
+                a['href'] = self.index + a['href']
         self.append_page(soup, soup.body)
         return soup
diff --git a/recipes/dziennik_pl.recipe b/recipes/dziennik_pl.recipe
index 5b9cc457f4..44dd596324 100644
--- a/recipes/dziennik_pl.recipe
+++ b/recipes/dziennik_pl.recipe
@@ -2,6 +2,8 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.ebooks.BeautifulSoup import Comment
+
 class Dziennik_pl(BasicNewsRecipe):
     title = u'Dziennik.pl'
     __author__ = 'fenuks'
@@ -9,17 +11,17 @@ class Dziennik_pl(BasicNewsRecipe):
     category = 'newspaper'
     language = 'pl'
     masthead_url= 'http://5.s.dziennik.pl/images/logos.png'
-    cover_url= 'http://5.s.dziennik.pl/images/logos.png'
+    cover_url = 'http://5.s.dziennik.pl/images/logos.png'
     no_stylesheets = True
     oldest_article = 7
     max_articles_per_feed = 100
-    remove_javascript=True
-    remove_empty_feeds=True
+    remove_javascript = True
+    remove_empty_feeds = True
     ignore_duplicate_articles = {'title', 'url'}
-    extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
+    extra_css = 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
     preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('>>> CZYTAJ TAKŻE: ".*?"'), lambda m: '')]
-    keep_only_tags=[dict(id='article')]
-    remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
+    keep_only_tags = [dict(id='article')]
+    remove_tags = [dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
     feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
              (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
              (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
@@ -34,26 +36,29 @@ class Dziennik_pl(BasicNewsRecipe):
              (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
 
     def skip_ad_pages(self, soup):
-        tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'})
+        tag = soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'})
         if tag:
-            new_soup=self.index_to_soup(tag['href'], raw=True)
+            new_soup = self.index_to_soup(tag['href'], raw=True)
             return new_soup
 
     def append_page(self, soup, appendtag):
-        tag=soup.find('a', attrs={'class':'page_next'})
+        tag = soup.find('a', attrs={'class':'page_next'})
         if tag:
             appendtag.find('div', attrs={'class':'article_paginator'}).extract()
             while tag:
-                soup2= self.index_to_soup(tag['href'])
-                tag=soup2.find('a', attrs={'class':'page_next'})
+                soup2 = self.index_to_soup(tag['href'])
+                tag = soup2.find('a', attrs={'class':'page_next'})
                 if not tag:
                     for r in appendtag.findAll('div', attrs={'class':'art_src'}):
                         r.extract()
                 pagetext = soup2.find(name='div', attrs={'class':'article_body'})
                 for dictionary in self.remove_tags:
-                    v=pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
+                    v = pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
                     for delete in v:
                         delete.extract()
+                comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+                for comment in comments:
+                    comment.extract()
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
             if appendtag.find('div', attrs={'class':'article_paginator'}):
diff --git a/recipes/dziennik_wschodni.recipe b/recipes/dziennik_wschodni.recipe
index b44bc3f639..da5d3bb1d9 100644
--- a/recipes/dziennik_wschodni.recipe
+++ b/recipes/dziennik_wschodni.recipe
@@ -1,5 +1,7 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
+
 class DziennikWschodni(BasicNewsRecipe):
     title = u'Dziennik Wschodni'
     __author__ = 'fenuks'
@@ -72,6 +74,10 @@ class DziennikWschodni(BasicNewsRecipe):
             if pagetext:
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
+
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
diff --git a/recipes/echo_dnia.recipe b/recipes/echo_dnia.recipe
index c84ef1d21f..def87ce0e1 100644
--- a/recipes/echo_dnia.recipe
+++ b/recipes/echo_dnia.recipe
@@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class EchoDnia(BasicNewsRecipe):
     title = u'Echo Dnia'
@@ -68,6 +69,10 @@ class EchoDnia(BasicNewsRecipe):
             if pagetext:
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
+
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe
index b1d7f5c578..3a86438d1c 100644
--- a/recipes/film_web.recipe
+++ b/recipes/film_web.recipe
@@ -1,6 +1,7 @@
-from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
 class FilmWebPl(BasicNewsRecipe):
     title = u'FilmWeb'
     __author__ = 'fenuks'
@@ -9,12 +10,14 @@ class FilmWebPl(BasicNewsRecipe):
     category = 'movies'
     language = 'pl'
     index = 'http://www.filmweb.pl'
+    #extra_css = '.MarkupPhotoHTML-7 {float:left; margin-right: 10px;}'
     oldest_article = 8
     max_articles_per_feed = 100
     no_stylesheets = True
     remove_empty_feeds = True
     ignore_duplicate_articles = {'title', 'url'}
-    preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
+    remove_javascript = True
+    preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), (re.compile(ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />')]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
     extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
     remove_tags = [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
     remove_attributes = ['style',]
@@ -41,7 +44,12 @@ class FilmWebPl(BasicNewsRecipe):
         skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'})
         if skip_tag is not None:
             return self.index_to_soup(skip_tag['href'], raw=True)
-
+
+    def postprocess_html(self, soup, first_fetch):
+        for r in soup.findAll(attrs={'class':'singlephoto'}):
+            r['style'] = 'float:left; margin-right: 10px;'
+        return soup
+
     def preprocess_html(self, soup):
         for a in soup('a'):
             if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
@@ -56,4 +64,8 @@ class FilmWebPl(BasicNewsRecipe):
             tag.name = 'div'
             for t in tag.findAll('li'):
                 t.name = 'div'
+        for r in soup.findAll(id=re.compile('photo-\d+')):
+            r.extract()
+        for r in soup.findAll(style=re.compile('float: ?left')):
+            r['class'] = 'singlephoto'
         return soup
diff --git a/recipes/gazeta_lubuska.recipe b/recipes/gazeta_lubuska.recipe
index f14c0fcce2..f2a42b63b8 100644
--- a/recipes/gazeta_lubuska.recipe
+++ b/recipes/gazeta_lubuska.recipe
@@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GazetaLubuska(BasicNewsRecipe):
     title = u'Gazeta Lubuska'
@@ -58,6 +59,10 @@ class GazetaLubuska(BasicNewsRecipe):
             if pagetext:
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
+
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
diff --git a/recipes/gazeta_pomorska.recipe b/recipes/gazeta_pomorska.recipe
index 557fcb726c..a4dc8ed1ea 100644
--- a/recipes/gazeta_pomorska.recipe
+++ b/recipes/gazeta_pomorska.recipe
@@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GazetaPomorska(BasicNewsRecipe):
     title = u'Gazeta Pomorska'
@@ -85,6 +86,10 @@ class GazetaPomorska(BasicNewsRecipe):
             if pagetext:
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
+
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
diff --git a/recipes/gazeta_wspolczesna.recipe b/recipes/gazeta_wspolczesna.recipe
index cfa70d4e2b..6648d8eb1a 100644
--- a/recipes/gazeta_wspolczesna.recipe
+++ b/recipes/gazeta_wspolczesna.recipe
@@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GazetaWspolczesna(BasicNewsRecipe):
     title = u'Gazeta Wsp\xf3\u0142czesna'
@@ -57,6 +58,10 @@ class GazetaWspolczesna(BasicNewsRecipe):
             if pagetext:
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
+
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe
index 475a259215..c415edc9d0 100644
--- a/recipes/gazeta_wyborcza.recipe
+++ b/recipes/gazeta_wyborcza.recipe
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 from calibre.web.feeds.news import BasicNewsRecipe
-
+from calibre.ebooks.BeautifulSoup import Comment
 
 class Gazeta_Wyborcza(BasicNewsRecipe):
     title = u'Gazeta.pl'
@@ -16,6 +16,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
     max_articles_per_feed = 100
     remove_javascript = True
     no_stylesheets = True
+    ignore_duplicate_articles = {'title', 'url'}
     remove_tags_before = dict(id='k0')
     remove_tags_after = dict(id='banP4')
     remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})]
@@ -48,6 +49,9 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
             url = self.INDEX + link['href']
             soup2 = self.index_to_soup(url)
             pagetext = soup2.find(id='artykul')
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
             tag = soup2.find('div', attrs={'id': 'Str'})
@@ -65,6 +69,9 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
             nexturl = pagetext.find(id='gal_btn_next')
             if nexturl:
                 nexturl = nexturl.a['href']
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
             rem = appendtag.find(id='gal_navi')
@@ -105,3 +112,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
         soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href'])
         self.cover_url = 'http://wyborcza.pl' + soup.img['src']
         return getattr(self, 'cover_url', self.cover_url)
+
+    '''def image_url_processor(self, baseurl, url):
+        print "@@@@@@@@", url
+        return url.replace('http://wyborcza.pl/ ', '')'''
diff --git a/recipes/gcn.recipe b/recipes/gcn.recipe
index 3e4a3f365f..9b3e272b2e 100644
--- a/recipes/gcn.recipe
+++ b/recipes/gcn.recipe
@@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GCN(BasicNewsRecipe):
     title = u'Gazeta Codziennej Nowiny'
@@ -16,7 +17,7 @@ class GCN(BasicNewsRecipe):
     remove_empty_feeds = True
     no_stylesheets = True
     ignore_duplicate_articles = {'title', 'url'}
-
+    remove_attributes = ['style']
     preprocess_regexps = [(re.compile(ur'Czytaj:.*?', re.DOTALL), lambda match: ''),
                           (re.compile(ur'Przeczytaj także:.*?', re.DOTALL|re.IGNORECASE), lambda match: ''),
                           (re.compile(ur'Przeczytaj również:.*?', re.DOTALL|re.IGNORECASE), lambda match: ''),
                           (re.compile(ur'Zobacz też:.*?', re.DOTALL|re.IGNORECASE), lambda match: '')]
@@ -77,6 +78,10 @@ class GCN(BasicNewsRecipe):
             if pagetext:
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
+
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe
index 525cf6c605..e14085b5ba 100644
--- a/recipes/gildia_pl.recipe
+++ b/recipes/gildia_pl.recipe
@@ -11,12 +11,13 @@ class Gildia(BasicNewsRecipe):
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    remove_empty_feeds=True
-    no_stylesheets=True
+    remove_empty_feeds = True
+    no_stylesheets = True
     ignore_duplicate_articles = {'title', 'url'}
     preprocess_regexps = [(re.compile(ur''), lambda match: '') ]
-    remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
-    keep_only_tags=dict(name='div', attrs={'class':'widetext'})
+    ignore_duplicate_articles = {'title', 'url'}
+    remove_tags = [dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
+    keep_only_tags = dict(name='div', attrs={'class':'widetext'})
     feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'), (u'Literatura', u'http://www.literatura.gildia.pl/rss'), (u'Film', u'http://www.film.gildia.pl/rss'), (u'Horror', u'http://www.horror.gildia.pl/rss'), (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'), (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'), (u'Manga i anime', u'http://www.manga.gildia.pl/rss'), (u'Star Wars', u'http://www.starwars.gildia.pl/rss'), (u'Techno', u'http://www.techno.gildia.pl/rss'), (u'Historia', u'http://www.historia.gildia.pl/rss'), (u'Magia', u'http://www.magia.gildia.pl/rss'), (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'), (u'RPG', u'http://www.rpg.gildia.pl/rss'), (u'LARP', u'http://www.larp.gildia.pl/rss'), (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'), (u'Nauka', u'http://www.nauka.gildia.pl/rss')]
@@ -34,7 +35,7 @@ class Gildia(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
         for a in soup('a'):
-            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+            if a.has_key('href') and not a['href'].startswith('http'):
                 if '/gry/' in a['href']:
                     a['href']='http://www.gry.gildia.pl' + a['href']
                 elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe
index 753e4a71d3..baaac85492 100644
--- a/recipes/gram_pl.recipe
+++ b/recipes/gram_pl.recipe
@@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
 class Gram_pl(BasicNewsRecipe):
     title = u'Gram.pl'
     __author__ = 'fenuks'
diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe
index 2993cb0043..2876a9b4e8 100644
--- a/recipes/gry_online_pl.recipe
+++ b/recipes/gry_online_pl.recipe
@@ -1,5 +1,6 @@
 import time
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GryOnlinePl(BasicNewsRecipe):
     title = u'Gry-Online.pl'
@@ -40,10 +41,14 @@ class GryOnlinePl(BasicNewsRecipe):
                     r.extract()
                 for r in pagetext.findAll(attrs={'itemprop':'description'}):
                     r.extract()
+
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
             for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}):
                 r.extract()
+            comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
         else:
             tag = appendtag.find('div', attrs={'class':'S018stronyr'})
             if tag:
@@ -70,16 +75,22 @@ class GryOnlinePl(BasicNewsRecipe):
                     r.extract()
                 for r in pagetext.findAll(attrs={'itemprop':'description'}):
                     r.extract()
+
+                comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+                [comment.extract() for comment in comments]
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
             for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony']}):
                 r.extract()
+            comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
 
     def image_url_processor(self, baseurl, url):
         if url.startswith('..'):
             return url[2:]
         else:
-            return url
+            return url
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
diff --git a/recipes/icons/historia_pl.png b/recipes/icons/historia_pl.png
index f7774e31390f85db1bf29729448dced58e4e1614..e035cadc3806565809afd43363b69b3969acf1a3 100644
GIT binary patch
literal 869
zcmZ{ieM}p57{`CI83zlx#0{5;lSDBjgzT`XY<1BT+jV3EDzHqznK6cqWJD)y4rPSQ
zYJ6E(G#XzvNt|RFWi4+jumT&Q?c^nGZ5a&4($XHi>mAqY^?KL#UV0vX`iFl!&o|%a
zo6jFlp65$_g)Td5PZj_myX>&0O4c2lwQY;s3yPkqmSw9!U9JXT;L3KP;c?kN)>@@2
z1tw>1ST=c$hpWl~`1mCNx{LsD@~8_1;2Z=1b`k*9j{rRL!L{3O9+bI@`lB`4bUH0^
zeAMNQ&bg9EI7LQ2yN*y!IdWqY(;HI`(&9i!IlIx~n#ve?CQ2dT9G%yk#nGiTGzUmA0TVjjLYyb|KEn$iOJ$3lNSu
z#AHD&x#O){-1y2<4oKaST|eb
z-Qd1Kj;48*MTf1i#T9yGHQ3w71YqCyJ&7>JQ4~5biVTgV1rZ&*&8!8M~LaxhJBV^en>cGkz3aI
zT9Ac9*j>9Qiq!oFsR&8Kuk~zR@YE5V=82q!FL2r5Wg2zOZm+VtsXn7&ht^y$K2(73
z(x^8|ZHMfU9IMryV-<->E7tKU;%qf^NoB3QhW#h7D_fz}pKA9i>bEH23z=CRW1V^X
zUeBA#sQg23(Ax70iVDq;zV_I2i&o_~bsGFSlWlB1)BO3cZKv@}%k~U{dQxXNUUKtk
zQ_nktW?$~^+&z}hE?@q(@oabH(Te3u-pqVO=C7Zej~t)hm(2O)$lTktCx(15cc|-X
z{ljL%N3Q-0l3(Bcg6`hI1Lbuc=R41~pS#%lm%p-h*SLawes6W<4D!QLNxkt6Obk}T
zh3w5|JAvBqV#}w;Ha5RFzfJpA+v$e3CRO8yO|pQOp<+c5q$nybsVP37QYux@{z9l&
k1wq&D1itzY(DMH2_ug&)e}D+oy&?mkOk1Hbm7cuxFA$IUssI20

literal 806
zcmeAS@N?(olHy`uVBq!ia0vp^3LwnE3?yBabR7dy4+DHcT>t(1_xtC!r)xG}t!{fb
zW$x>}hkv|y{`>QncV{m?UcCD8w1p4*X5Q(Za(C*yw?|L?c=q)7+fV1?3qRj`@a5K>
zGcj3@7A$?V=g6&|8DF2j`1b1c>%&LSrj^_*Zg{+M{h7p^i#cUyBhqfQPB;}5e>tb>
zba3+3qPk0|#aA2J?@eELwXE@MPT8Z`OK#S6eY$kz(}f%FPM!Vv`t9xJo)=qoU2Ez+
zsAzJ3&XNzuFTA>V^}LGdv)tGrd6$;gT4m3?t*c*}%7!I6EGUWa-hN!^ka2V4M4!{|q(vHx54gyd
zzYw)uEWOU4<76+AbyuW1DHr7jrANO(I
zRR0k=Y0a!HEz8sU95>j!iO}iP+;qcLB=)n$dv+$fB<4fRZnM^I`1h%*kbASa(h=W3
zUg8`F%8FPP^l%<`S#aKLt>5v`EJnkX?|C{eA8N0$U`yL~?O(f}{Mss&S^JM25Q-Ph
zZ1C;rv~XJRs&t!+OIiAnA1}6?S9>bz?`>_%v98d{r$0r`dj2cLIW@KE^#}bA9p5|U
zr#O@0vM^P)&pOL@Yw*<7%WIvP@wsVBBS(TA*S%TBvJAqj7wcZ^z4Vv8>X(dV87B`Y
zovN0&MwFx^mZVxG7o`Fz1|tJQLtO()T_eK~Ljx-l11n=wT>~>K1B26|3%{ah$jwj5
YOsmALVV8%bCr|@}r>mdKI;Vst0F}Rmw*UYD

diff --git a/recipes/icons/nowy_obywatel.png b/recipes/icons/nowy_obywatel.png
old mode 100755
new mode 100644
diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe
index e385522714..f115014b5d 100644
--- a/recipes/in4_pl.recipe
+++ b/recipes/in4_pl.recipe
@@ -1,5 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.ebooks.BeautifulSoup import Comment
+
 class in4(BasicNewsRecipe):
     title = u'IN4.pl'
     oldest_article = 7
@@ -8,14 +10,14 @@ class in4(BasicNewsRecipe):
     description = u'Serwis Informacyjny - Aktualnosci, recenzje'
     category = 'IT'
     language = 'pl'
-    index='http://www.in4.pl/'
+    index = 'http://www.in4.pl/'
     #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
     no_stylesheets = True
     remove_empty_feeds = True
     preprocess_regexps = [(re.compile(ur'