From a1142026489dbb9a162dd3a863258a0538e81de9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Thu, 25 Oct 2012 23:36:09 +0200 Subject: [PATCH 1/9] embed github.com/t3d/kalibrator git repo in calibre's recipes directory --- .bzrignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.bzrignore b/.bzrignore index aaacc9f58a..01782b1b39 100644 --- a/.bzrignore +++ b/.bzrignore @@ -35,3 +35,8 @@ nbproject/ .settings/ *.DS_Store calibre_plugins/ +recipes/.git +recipes/.gitignore +recipes/tv_*.recipe +recipes/README +recipes/katalog_egazeciarz.recipe From ace07d1cc6c9f523737a5f241ff6b0a62f8b4a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Thu, 25 Oct 2012 23:47:19 +0200 Subject: [PATCH 2/9] align to kalibrator - focus_pl.recipe --- recipes/focus_pl.recipe | 65 +++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/recipes/focus_pl.recipe b/recipes/focus_pl.recipe index 342aa0d2db..1954fd7803 100644 --- a/recipes/focus_pl.recipe +++ b/recipes/focus_pl.recipe @@ -2,7 +2,9 @@ import re from calibre.web.feeds.news import BasicNewsRecipe + class FocusRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' __author__ = u'intromatyk ' language = 'pl' @@ -12,10 +14,10 @@ class FocusRecipe(BasicNewsRecipe): publisher = u'Gruner + Jahr Polska' category = u'News' description = u'Newspaper' - category='magazine' - cover_url='' - remove_empty_feeds= True - no_stylesheets=True + category = 'magazine' + cover_url = '' + remove_empty_feeds = True + no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100000 recursions = 0 @@ -27,15 +29,15 @@ class FocusRecipe(BasicNewsRecipe): simultaneous_downloads = 5 r = re.compile('.*(?Phttp:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*') - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'})) - - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'})) - remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'})) + keep_only_tags = [] + keep_only_tags.append(dict(name='div', attrs={'id': 'cll'})) + + remove_tags = [] + remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'})) + remove_tags.append(dict(name='div', attrs={'class': 'txb'})) + remove_tags.append(dict(name='div', attrs={'class': 'h2'})) + remove_tags.append(dict(name='ul', attrs={'class': 'txu'})) + remove_tags.append(dict(name='div', attrs={'class': 'ulc'})) extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} @@ -44,18 +46,17 @@ class FocusRecipe(BasicNewsRecipe): p.lead {font-weight: bold; text-align: left;} .authordate {font-size: small; color: #696969;} .fot{font-size: x-small; color: #666666;} - ''' + ''' - - feeds = [ - ('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'), - ('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'), - ('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'), - ('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'), - ('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'), - ('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'), - ('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'), - ] + feeds = [ + ('Nauka', 'http://www.focus.pl/nauka/rss/'), + ('Historia', 'http://www.focus.pl/historia/rss/'), + ('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'), + ('Sport', 'http://www.focus.pl/sport/rss/'), + ('Technika', 'http://www.focus.pl/technika/rss/'), + ('Przyroda', 'http://www.focus.pl/przyroda/rss/'), + ('Technologie', 'http://www.focus.pl/gadzety/rss/') + ] def skip_ad_pages(self, soup): if ('advertisement' in soup.find('title').string.lower()): @@ -65,20 +66,20 @@ class FocusRecipe(BasicNewsRecipe): return None def get_cover_url(self): - soup=self.index_to_soup('http://www.focus.pl/magazyn/') - tag=soup.find(name='div', attrs={'class':'clr fl'}) + soup = self.index_to_soup('http://www.focus.pl/magazyn/') + tag = soup.find(name='div', attrs={'class': 'clr fl'}) if tag: - self.cover_url='http://www.focus.pl/' + tag.a['href'] + self.cover_url = 'http://www.focus.pl/' + tag.a['href'] return getattr(self, 'cover_url', self.cover_url) def print_version(self, url): - if url.count ('focus.pl.feedsportal.com'): + if url.count('focus.pl.feedsportal.com'): u = url.find('focus0Bpl') u = 'http://www.focus.pl/' + url[u + 11:] u = u.replace('0C', '/') u = u.replace('A', '') - u = u.replace ('0E','-') + u = u.replace('0E', '-') u = u.replace('/nc/1//story01.htm', '/do-druku/1') - else: - u = url.replace('/nc/1','/do-druku/1') - return u \ No newline at end of file + else: + u = url.replace('/nc/1', '/do-druku/1') + return u From 34314817149cbfc6b0b7076eb59d2bba99be8606 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Thu, 25 Oct 2012 23:55:08 +0200 Subject: [PATCH 3/9] align to kalibrator - swiatkindle.recipe --- recipes/swiatkindle.recipe | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes/swiatkindle.recipe b/recipes/swiatkindle.recipe index 9847d1359e..a6bf225294 100644 --- a/recipes/swiatkindle.recipe +++ b/recipes/swiatkindle.recipe @@ -7,7 +7,6 @@ swiatczytnikow.pl ''' import re -from calibre.web.feeds.news import BasicNewsRecipe class swiatczytnikow(BasicNewsRecipe): title = u'Swiat Czytnikow' From 027c020b0a2b01ca06175828133b4590280a22ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Fri, 26 Oct 2012 20:47:55 +0200 Subject: [PATCH 4/9] align to kalibrator - update Artur's email --- recipes/money_pl.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/money_pl.recipe b/recipes/money_pl.recipe index 075264f8f7..475c2059ff 100644 --- a/recipes/money_pl.recipe +++ b/recipes/money_pl.recipe @@ -4,7 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class FocusRecipe(BasicNewsRecipe): __license__ = 'GPL v3' - __author__ = u'intromatyk ' + __author__ = u'Artur Stachecki ' language = 'pl' version = 1 From 39fc5eb4ac523df772fe95f1539971a993d39d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Fri, 26 Oct 2012 21:29:22 +0200 Subject: [PATCH 5/9] align to kalibrator - gazeta_wyborcza --- recipes/gazeta_wyborcza.recipe | 131 +++++++++++++++++---------------- 1 file changed, 67 insertions(+), 64 deletions(-) diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 5c034b10ab..3d416e444f 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -1,104 +1,107 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe + class Gazeta_Wyborcza(BasicNewsRecipe): - title = u'Gazeta Wyborcza' - __author__ = 'fenuks' - language = 'pl' - description ='news from gazeta.pl' - category='newspaper' + title = u'Gazeta Wyborcza' + __author__ = 'fenuks, Artur Stachecki' + language = 'pl' + description = 'news from gazeta.pl' + category = 'newspaper' publication_type = 'newspaper' - masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' - INDEX='http://wyborcza.pl' - remove_empty_feeds= True + masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' + INDEX = 'http://wyborcza.pl' + remove_empty_feeds = True oldest_article = 3 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True - ignore_duplicate_articles = {'title', 'url'} - keep_only_tags = dict(id=['gazeta_article', 'article']) - remove_tags_after = dict(id='gazeta_article_share') - remove_tags = [dict(attrs={'class':['artReadMore', 'gazeta_article_related_new', 'txt_upl']}), dict(id=['gazeta_article_likes', 'gazeta_article_tools', 'rel', 'gazeta_article_tags', 'gazeta_article_share', 'gazeta_article_brand', 'gazeta_article_miniatures'])] - - feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), - (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), - (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), - (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), - (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), - (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), - (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), - #(u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), - (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), - (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), - (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), - (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), - (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), - (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), - (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), - (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss') - ] + remove_javascript = True + no_stylesheets = True + remove_tags_before = dict(id='k0') + remove_tags_after = dict(id='banP4') + remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})] + feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), + (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), + (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), + (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), + (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') + ] def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) + tag = soup.find(name='a', attrs={'class': 'btn'}) + if tag: + new_soup = self.index_to_soup(tag['href'], raw=True) return new_soup - def append_page(self, soup, appendtag): - loop=False - tag = soup.find('div', attrs={'id':'Str'}) - if appendtag.find('div', attrs={'id':'Str'}): - nexturl=tag.findAll('a') - appendtag.find('div', attrs={'id':'Str'}).extract() - loop=True + loop = False + tag = soup.find('div', attrs={'id': 'Str'}) + if appendtag.find('div', attrs={'id': 'Str'}): + nexturl = tag.findAll('a') + appendtag.find('div', attrs={'id': 'Str'}).extract() + loop = True if appendtag.find(id='source'): appendtag.find(id='source').extract() while loop: - loop=False + loop = False for link in nexturl: if u'następne' in link.string: - url= self.INDEX + link['href'] + url = self.INDEX + link['href'] soup2 = self.index_to_soup(url) pagetext = soup2.find(id='artykul') pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id':'Str'}) - nexturl=tag.findAll('a') - loop=True + tag = soup2.find('div', attrs={'id': 'Str'}) + nexturl = tag.findAll('a') + loop = True def gallery_article(self, appendtag): - tag=appendtag.find(id='container_gal') + tag = appendtag.find(id='container_gal') if tag: - nexturl=appendtag.find(id='gal_btn_next').a['href'] + nexturl = appendtag.find(id='gal_btn_next').a['href'] appendtag.find(id='gal_navi').extract() while nexturl: - soup2=self.index_to_soup(nexturl) - pagetext=soup2.find(id='container_gal') - nexturl=pagetext.find(id='gal_btn_next') + soup2 = self.index_to_soup(nexturl) + pagetext = soup2.find(id='container_gal') + nexturl = pagetext.find(id='gal_btn_next') if nexturl: - nexturl=nexturl.a['href'] + nexturl = nexturl.a['href'] pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - rem=appendtag.find(id='gal_navi') + rem = appendtag.find(id='gal_navi') if rem: rem.extract() def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) - return soup + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + else: + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup def print_version(self, url): - if 'http://wyborcza.biz/biznes/' not in url: - return url + if url.count('rss.feedsportal.com'): + u = url.find('wyborcza0Bpl') + u = 'http://www.wyborcza.pl/' + url[u + 11:] + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = u.replace('/1,', '/2029020,') + u = u.replace('/story01.htm', '') + print(u) + return u + elif 'http://wyborcza.pl/1' in url: + return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020') else: - return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') def get_cover_url(self): soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') - cover=soup.find(id='GWmini2') - soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) - self.cover_url='http://wyborcza.pl' + soup.img['src'] + cover = soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href']) + self.cover_url = 'http://wyborcza.pl' + soup.img['src'] return getattr(self, 'cover_url', self.cover_url) From 5a26d7ceb3899e48765d4a95c1cf3dac452b7ae2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Fri, 26 Oct 2012 21:35:28 +0200 Subject: [PATCH 6/9] align to kalibrator - rzeczpospolita.recipe --- recipes/rzeczpospolita.recipe | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/recipes/rzeczpospolita.recipe b/recipes/rzeczpospolita.recipe index 4ab27d8437..40cb4db3ac 100644 --- a/recipes/rzeczpospolita.recipe +++ b/recipes/rzeczpospolita.recipe @@ -34,16 +34,20 @@ class RzeczpospolitaRecipe(BasicNewsRecipe): keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'story'})) remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleLeftBox'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'socialNewTools'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'socialTools'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleToolBoxTop'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'clr'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'recommendations'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'editorPicks'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'editorPicks'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'editorPicks editorPicksFirst'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'articleCopyrightText'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'articleCopyrightButton'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleToolBoxBottom'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'more'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'addRecommendation'})) + remove_tags.append(dict(name = 'h3', attrs = {'id' : 'tags'})) extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} @@ -67,3 +71,4 @@ class RzeczpospolitaRecipe(BasicNewsRecipe): return start + '/' + index + '?print=tak' + From ac47781c7c183cfe7192b63886495da89c9e2cab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Fri, 26 Oct 2012 21:36:57 +0200 Subject: [PATCH 7/9] align to kalibrator - wprost.recipe --- recipes/wprost.recipe | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/recipes/wprost.recipe b/recipes/wprost.recipe index b271665125..2adac1e113 100644 --- a/recipes/wprost.recipe +++ b/recipes/wprost.recipe @@ -3,6 +3,8 @@ __license__ = 'GPL v3' __copyright__ = '2010, matek09, matek09@gmail.com' __copyright__ = 'Modified 2011, Mariusz Wolek ' +__copyright__ = 'Modified 2012, Artur Stachecki ' + from calibre.web.feeds.news import BasicNewsRecipe import re @@ -11,7 +13,7 @@ class Wprost(BasicNewsRecipe): EDITION = 0 FIND_LAST_FULL_ISSUE = True EXCLUDE_LOCKED = True - ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif' + ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' title = u'Wprost' __author__ = 'matek09' @@ -20,6 +22,7 @@ class Wprost(BasicNewsRecipe): no_stylesheets = True language = 'pl' remove_javascript = True + recursions = 0 remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) @@ -35,13 +38,15 @@ class Wprost(BasicNewsRecipe): (re.compile(r'\\\<\/table\>'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\'), lambda match: ''), - (re.compile(r'\
'), lambda match: '')] + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\