From 3e84887382171a3b3e4095e0412b80f04e63de0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Thu, 23 May 2013 23:47:43 +0200 Subject: [PATCH] updates from kalibrator project --- recipes/adventure_zone_pl.recipe | 56 +++----------- recipes/astroflesz.recipe | 1 + recipes/gosc_niedzielny.recipe | 122 ++++++++++++------------------- recipes/histmag.recipe | 3 +- recipes/icons/geopolityka.png | Bin 0 -> 1497 bytes recipes/icons/gs24_pl.png | Bin 0 -> 428 bytes recipes/icons/homopedia_pl.png | Bin 0 -> 541 bytes recipes/icons/pc_lab.png | Bin 0 -> 697 bytes recipes/icons/polityka.png | Bin 0 -> 346 bytes recipes/icons/rynek_zdrowia.png | Bin 0 -> 418 bytes recipes/osnews_pl.recipe | 20 ++--- 11 files changed, 68 insertions(+), 134 deletions(-) create mode 100644 recipes/icons/geopolityka.png create mode 100644 recipes/icons/gs24_pl.png create mode 100644 recipes/icons/homopedia_pl.png create mode 100644 recipes/icons/pc_lab.png create mode 100644 recipes/icons/polityka.png create mode 100644 recipes/icons/rynek_zdrowia.png diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 50a980dc92..43342a9b28 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -6,42 +6,20 @@ class Adventure_zone(BasicNewsRecipe): description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.' category = 'games' language = 'pl' + BASEURL = 'http://www.adventure-zone.info/fusion/' no_stylesheets = True + extra_css = '.image {float: left; margin-right: 5px;}' oldest_article = 20 max_articles_per_feed = 100 cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png' - index = 'http://www.adventure-zone.info/fusion/' + remove_attributes = ['style'] use_embedded_content = False - preprocess_regexps = [(re.compile(r"Komentarze", re.IGNORECASE), lambda m: ''), - (re.compile(r''), lambda match: ''), - (re.compile(r''), lambda match: '')] - remove_tags_before = dict(name='td', attrs={'class':'main-bg'}) - remove_tags = [dict(name='img', attrs={'alt':'Drukuj'})] - remove_tags_after = dict(id='comments') - extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; } img.news-category {float: left; margin-right: 5px;}' - feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')] - - '''def get_cover_url(self): - soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php') - cover=soup.find(id='box_OstatninumerAZ') - self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src'] - return getattr(self, 'cover_url', self.cover_url)''' - - def populate_article_metadata(self, article, soup, first): - result = re.search('(.+) - Adventure Zone', soup.title.string) - if result: - result = result.group(1) - else: - result = soup.body.find('strong') - if result: - result = result.string - if result: - result = result.replace('&', '&') - result = result.replace(''', '’') - article.title = result + keep_only_tags = [dict(attrs={'class':'content'})] + remove_tags = [dict(attrs={'class':'footer'})] + feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')] def skip_ad_pages(self, soup): - skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'}) + skip_tag = soup.body.find(attrs={'class':'content'}) skip_tag = skip_tag.findAll(name='a') title = soup.title.string.lower() if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): @@ -49,20 +27,10 @@ class Adventure_zone(BasicNewsRecipe): if r.strong and r.strong.string: word=r.strong.string.lower() if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): - return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) + return self.index_to_soup(self.BASEURL+r['href'], raw=True) def preprocess_html(self, soup): - footer=soup.find(attrs={'class':'news-footer middle-border'}) - r = soup.find(name='td', attrs={'class':'capmain'}) - if r: - r.name='h1' - for item in soup.findAll(name=['tr', 'td']): - item.name='div' - if footer and len(footer('a'))>=2: - footer('a')[1].extract() - for item in soup.findAll(style=True): - del item['style'] - for a in soup('a'): - if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: - a['href']=self.index + a['href'] - return soup + for link in soup.findAll('a', href=True): + if not link['href'].startswith('http'): + link['href'] = self.BASEURL + link['href'] + return soup \ No newline at end of file diff --git a/recipes/astroflesz.recipe b/recipes/astroflesz.recipe index 676aedfd3a..902f99c2c8 100644 --- a/recipes/astroflesz.recipe +++ b/recipes/astroflesz.recipe @@ -13,6 +13,7 @@ class Astroflesz(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + remove_empty_feeds = True remove_attributes = ['style'] keep_only_tags = [dict(id="k2Container")] remove_tags_after = dict(name='div', attrs={'class':'itemLinks'}) diff --git a/recipes/gosc_niedzielny.recipe b/recipes/gosc_niedzielny.recipe index 11beb076f5..65e6e1704c 100644 --- a/recipes/gosc_niedzielny.recipe +++ b/recipes/gosc_niedzielny.recipe @@ -6,12 +6,10 @@ __copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \ 2013, Tomasz Długosz, tomek3d@gmail.com' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ptempfile import PersistentTemporaryFile -from datetime import date import re +from lxml import html class GN(BasicNewsRecipe): - EDITION = 0 __author__ = 'Piotr Kontek, Tomasz Długosz' title = u'Gość Niedzielny' @@ -20,83 +18,23 @@ class GN(BasicNewsRecipe): no_stylesheets = True language = 'pl' remove_javascript = True - temp_files = [] - articles_are_obfuscated = True + def find_last_issue(self): + raw = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True) + doc = html.fromstring(raw) + page = doc.xpath('//div[@class="c"]//div[@class="search-result"]/div[1]/div[2]/h1//a/@href') - def get_obfuscated_article(self, url): - br = self.get_browser() - br.open(url) - source = br.response().read() - page = self.index_to_soup(source) - - main_section = page.find('div',attrs={'class':'txt doc_prnt_prv'}) - - title = main_section.find('h2') - info = main_section.find('div', attrs={'class' : 'cf doc_info'}) - authors = info.find(attrs={'class':'l'}) - article = str(main_section.find('p', attrs={'class' : 'doc_lead'})) - first = True - for p in main_section.findAll('p', attrs={'class':None}, recursive=False): - if first and p.find('img') != None: - article += '

' - article += str(p.find('img')).replace('src="/files/','src="http://www.gosc.pl/files/') - article += '' - for s in p.findAll('span'): - article += self.tag_to_string(s) - article += '

' - else: - article += str(p).replace('src="/files/','src="http://www.gosc.pl/files/') - first = False - limiter = main_section.find('p', attrs={'class' : 'limiter'}) - if limiter: - article += str(limiter) - - html = unicode(title) - #sometimes authors are not filled in: - if authors: - html += unicode(authors) + unicode(article) - else: - html += unicode(article) - - self.temp_files.append(PersistentTemporaryFile('_temparse.html')) - self.temp_files[-1].write(html) - self.temp_files[-1].close() - return self.temp_files[-1].name - - def find_last_issue(self, year): - soup = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/rok/' + str(year)) - - #szukam zdjęcia i linka do poprzedniego pełnego numeru - first = True - for d in soup.findAll('div', attrs={'class':'l release_preview_l'}): - img = d.find('img') - if img != None: - a = img.parent - self.EDITION = a['href'] - #this was preventing kindles from moving old issues to 'Back Issues' category: - #self.title = img['alt'] - self.cover_url = 'http://www.gosc.pl' + img['src'] - if year != date.today().year or not first: - break - first = False + return page[1] def parse_index(self): - year = date.today().year - self.find_last_issue(year) - ##jeśli to pierwszy numer w roku trzeba pobrać poprzedni rok - if self.EDITION == 0: - self.find_last_issue(year-1) - soup = self.index_to_soup('http://www.gosc.pl' + self.EDITION) + soup = self.index_to_soup('http://gosc.pl' + self.find_last_issue()) feeds = [] #wstepniak a = soup.find('div',attrs={'class':'release-wp-b'}).find('a') articles = [ {'title' : self.tag_to_string(a), - 'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/'), - 'date' : '', - 'description' : ''} - ] + 'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/') + }] feeds.append((u'Wstępniak',articles)) #kategorie for addr in soup.findAll('a',attrs={'href':re.compile('kategoria')}): @@ -113,16 +51,46 @@ class GN(BasicNewsRecipe): art = a.find('a') yield { 'title' : self.tag_to_string(art), - 'url' : 'http://www.gosc.pl' + art['href'].replace('/doc/','/doc_pr/'), - 'date' : '', - 'description' : '' + 'url' : 'http://www.gosc.pl' + art['href'] } for a in main_block.findAll('div', attrs={'class':'sr-document'}): art = a.find('a') yield { 'title' : self.tag_to_string(art), - 'url' : 'http://www.gosc.pl' + art['href'].replace('/doc/','/doc_pr/'), - 'date' : '', - 'description' : '' + 'url' : 'http://www.gosc.pl' + art['href'] } + def append_page(self, soup, appendtag): + chpage= appendtag.find(attrs={'class':'pgr_nrs'}) + if chpage: + for page in chpage.findAll('a'): + soup2 = self.index_to_soup('http://gosc.pl' + page['href']) + pagetext = soup2.find(attrs={'class':'intextAd'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + ''' + for image_div in soup.findAll(attrs={'class':'doc_image'}): + link = + if 'm.jpg' in image['src']: + image['src'] = image['src'].replace('m.jpg', '.jpg') + ''' + return soup + + keep_only_tags = [ + dict(name='div', attrs={'class':'cf txt'}) + ] + + remove_tags = [ + dict(name='p', attrs={'class':['r tr', 'l l-2', 'wykop']}), + dict(name='div', attrs={'class':['doc_actions', 'pgr', 'fr1_cl']}), + dict(name='div', attrs={'id':'vote'}) + ] + + extra_css = ''' + h1 {font-size:150%} + div#doc_image {font-style:italic; font-size:70%} + p.limiter {font-size:150%; font-weight: bold} + ''' diff --git a/recipes/histmag.recipe b/recipes/histmag.recipe index 0009580e49..9e6ca111a7 100644 --- a/recipes/histmag.recipe +++ b/recipes/histmag.recipe @@ -13,11 +13,12 @@ class Histmag(BasicNewsRecipe): __author__ = 'matek09' description = u"Artykuly historyczne i publicystyczne" encoding = 'utf-8' + extra_css = '''.center img {display: block;}''' #preprocess_regexps = [(re.compile(r''), lambda match: '

'),(re.compile(r''), lambda match: '

')] no_stylesheets = True language = 'pl' remove_javascript = True keep_only_tags=[dict(id='article')] - remove_tags=[dict(name = 'p', attrs = {'class' : 'article-tags'})] + remove_tags=[dict(name = 'p', attrs = {'class' : 'article-tags'}), dict(attrs={'class':'twitter-share-button'})] feeds = [(u'Wszystkie', u'http://histmag.org/rss/wszystkie.xml'), (u'Wydarzenia', u'http://histmag.org/rss/wydarzenia.xml'), (u'Recenzje', u'http://histmag.org/rss/recenzje.xml'), (u'Artykuły historyczne', u'http://histmag.org/rss/historia.xml'), (u'Publicystyka', u'http://histmag.org/rss/publicystyka.xml')] diff --git a/recipes/icons/geopolityka.png b/recipes/icons/geopolityka.png new file mode 100644 index 0000000000000000000000000000000000000000..7bef643edb6f3efcd6c354ac62d8a2da95b41627 GIT binary patch literal 1497 zcmZ`(c|6ox6u(L@@v@XkdWs&Bt+7->dA5wivxUTDH`W+4W(;P=FvA#QY%>@$jD6|Z zLY9z<)~7;I_R7+$P))Kt%=`7<`=|Rk=X~$EpL@>dbI$#qOS8AL+AS_G4glC~4YzQF zQ2K|8ZHKIymXs3&qJgHirT{cDB{sZwK)#|c+|d>Qyea^sBmmZ-Bhq^SFfaf{Zij<00NWPrFFISQsv8vZ=w7})0La=qIOVfadpc`so2#1|o5qIjrlh1X(->{7 zbxrjhBy6092gmS8A7$~Ud^Y|s%nasKIC#C#~z!1#pxQ=TqU*BEzLK2I@|Nvxmav`_%&=m5GpVTMIqrUdGzkqvijPFwwoRCsw5=aTCNOU40md~OO^fn8HYl~l&#zzO1zs$S1homuBoZRem zI-QzGp;0M#TnrwIrBg^vRh-#Z4~0VE>e}k~*udz>bB9Yotju&Kot8u<2O=Y5usC`W zabt61Wp#OVd1kP`PhAJj%3_d-glG&dnL?o@5}K>H_ixn;zJ6Z(_`d&6b1^Rmhfj#Y z#F64-vgnC5#cW5bb1gR-rY9!ao14yCS_S%C2@UXa^9TtGM=?_qGibzuoaB;%G%rsC zlabR}-*NaTOi>x;;bIr;?Hq>2Ik@<+nB=;OEDuivpHm7wx~zh>zn=%f!`02z$=niN z#-&yk(Za%zQ9-^{TvkH`|9WLfI*k;M3vH~*sISamGZW^fUqWSG@jyg``4i)DdF-qj zezs7!vLY0A)Mr2LDlKBu`r0er4&T9|F%(j45i9jUN6o_9=Ltk2Itsh8FtNG1P?(?H z)mZxP(j=5BpT*3}VW!cjnT%u(D~(B~k_d4z*k~3L>M${t63=5(^0Ua?oWvM3wwh0? zE=nn4QKtrbCMI5O@+9UV`o2Ir+WnZGpDa-YbGWU40f~;Z2nvF0DY}GQi0kWT=OVL0 zq!hI@F`n`jE2;MiG`49JUm(;O5qJ!J{APnAdY3fEBz(C`;Jvd>UQ#@SOP?GQy^7J4 z*BdF~wQOy=G2~UXB`X_TKCOO!v-vNlz-0ez0UZ8pYrQHsd*2ZO$4@Xi^KIR$9Om|J z>WN&#$S%3?OhnK3g&t~D?J;x15dV`F7Ogx`*wh4zQ6Eli!|hD4-AmbL(pKlGcz;UkJlM}JpVQ9Y&xj;m{E zYH6Q<>FDa|pFDNi0Gu&2GCq6GkBp)ci7s!{PUNcK+N93 z(a9N_L$}MrBd^?dfh!O{u6lX<_`V(;8=vs=2LXY|Ao1Xk$*It=@N3f%kpLBi#zbQw z^2f%-6W$O>Ac34Hk@S{AO-`Y`o0(0egEU4ub8h~9MrKxa4vU?e2Y$`x6#VnyC*B(B5&Yv#a~|^7o!Q zcklH|%YZ$5W%u>nfAH|p14Ba#1H&(%P{RubhEf9thF1v;3|2E37{m+a>+;fg% z%?dm$7Yx~!v^gwM+WX&A`t>Tkv!C|QtYrL`e@J-2OiPzThvvlbtzN>jVUkEj;figQ zn^L!aa4R;ryju3m484f&66H^nj_vpQ$a%T-!cm!79CHhJ^5=4HnV7U{@AVIoKOep} z|G~t&f<^j3z%)*v&8j7?5hW>!C8<`)MX5lF!N|bSSl7T**T^`;(9p`zz{c>LaiJj&qd>gTe~DWM4f14Ba#1H&(%P{RubhEf9thF1v;3|2E37{m+a>KbBzV)s7&zIJ4NNJQ+8jq@1-`v3oWvE$NxS#vGVk@LvxZD z7}9_J|KDQBP`~!V$rUyCfBgUd?~M@yL-oqj4=)|wfBxRIC-(0Mg0S5^Zp%=!s*!JCUycis5^`(g8ji#K*nJ^$)|2m?d+?)$s@ zY#1VXn@t#It=Y6<*Xa*Wu5H|~cJ0nHmYLTafLa-oyxm@BjZoOgrtQpJ$#=DOYE%*POUz3RioH+G?36p$GKXVzeHw zV7eM*dn?BzbEWU`x5t+6wQE-FF)_Pu(>lwi*YL8;pQGmaJEJyVy!OGb+Tw-814Ba#1H&(%P{RubhEf9thF1v;3|2E37{m+a>CtQ?mmC|=F`9b|37^FdExGh75gqH^=~hoxnsqSN@8AF1eV^~?`1Siw!K}S2cAviY;-h)#3O4_4?!fLvoA&+R==FG0 zkyP}g?I*77y7)*lV~%lBr(VVksko`zPTu&xE9Ak}$_>YF8Rai@s90u_+&y*Cx<7yZ z&e?caKWDCQ+va08p1=G0`|J1bzYeCqJ2Bz^zkmP#{_b9T%DHOU%;lRueg3?8--)Q2 znRfXzylPjJPh0nVU+vSi5l^=jzP~WPbiv`Mu8mJ#zWx2@&y!~_E?v9z_}TNXpFg~Q z@$}`hM{i#}ZJM{WuxIh>cOSBDA6WzpT*f4CcNc~ZR#^`qXQHQzV~E7%B`4=>NtCyWeEoYXcj(9qIcy<(N&ifo3h8CjXe zFJ8QQDa~-R&CQ|d>Ww2;&fGb4iR;9+ZHz+O%D(aS`JK~Sm-kL?Ufex9o_%%y7#%(y zSny!Ng$*A%Jla;gkl65bW8p{6PS2B?JS#I_rtFZjXJF_)Eb^&xa>Zew<5WvrBT7;d zOH!?pi&B9UgOP!uv95usu90zwp`n$5iIs_=wt3{tjz Q0BT_HboFyt=akR{04jo1ZvX%Q literal 0 HcmV?d00001 diff --git a/recipes/icons/polityka.png b/recipes/icons/polityka.png new file mode 100644 index 0000000000000000000000000000000000000000..482408dc7bb854ad38cd8b0b2abd8d80e13d94d0 GIT binary patch literal 346 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!73?$#)eFPF@P5LY1mPSE)O%vJw^ z;N83bY32V{t@?lW?*HAp|F7Ep|J~i{%0D7NMT{jue!&b5&u*lFI7!~_F3f^FKA(Ub z8&4O<5RU7~2?-)BOpI)7Y;KCllN}74&)nE4aA*TZS&MmdKI;Vst0FJ3@mH+?% literal 0 HcmV?d00001 diff --git a/recipes/icons/rynek_zdrowia.png b/recipes/icons/rynek_zdrowia.png new file mode 100644 index 0000000000000000000000000000000000000000..76fcf3fb9858d8c07e42fe9580efa191eaf54e3f GIT binary patch literal 418 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbKJOS+@4BLl<6e(pbstU$g(vPY0F z14ES>14Ba#1H&(%P{RubhEf9thF1v;3|2E37{m+a>^84>U>g((8-@pI<{rmlQ zAFaQ7TfSqK|Dp}`yN}MFKYxz?y(>Tsj7i?^E({&4vK~NAx~Gd{h{WZ!=MVB7QV?*x z7@fh$ek>@n`~Ux?N$MB(e$?~-B*y>m!jnC-pS23C7uxQc_FnaP07q5)!mgZtFD;X+ zbImKH*SF7$dBBtNfmM16e~)9!-pO-4?k-rp>{{~kNcYNyt93g*G3lRTd3oEsVIt5< z)e_f;l9a@fRIB8oR3OD*WMF8lYhbEtWE^5>Xk}n-Wn!vrU}R-r5a&2C21P?|eoAIq XC2kG+GMW#78W=oX{an^LB{Ts5Qg)Vi literal 0 HcmV?d00001 diff --git a/recipes/osnews_pl.recipe b/recipes/osnews_pl.recipe index 455f005a7e..7251f31827 100644 --- a/recipes/osnews_pl.recipe +++ b/recipes/osnews_pl.recipe @@ -20,7 +20,7 @@ class OSNewsRecipe(BasicNewsRecipe): remove_javascript = True encoding = 'utf-8' use_embedded_content = False; - + remove_empty_feeds = True oldest_article = 7 max_articles_per_feed = 100 cover_url='http://osnews.pl/wp-content/themes/osnews/img/logo.png' @@ -31,22 +31,18 @@ class OSNewsRecipe(BasicNewsRecipe): ''' feeds = [ - (u'OSNews.pl', u'http://feeds.feedburner.com/OSnewspl') + (u'Niusy', u'http://feeds.feedburner.com/OSnewspl'), + (u'Wylęgarnia', u'http://feeds.feedburner.com/osnewspl_nowe') ] keep_only_tags = [ - dict(name = 'a', attrs = {'class' : 'news-heading'}), - dict(name = 'div', attrs = {'class' : 'newsinformations'}), - dict(name = 'div', attrs = {'id' : 'news-content'}) + dict(name = 'div', attrs = {'id' : 'content'}) ] remove_tags = [ - dict(name = 'div', attrs = {'class' : 'sociable'}), - dict(name = 'div', attrs = {'class' : 'post_prev'}), - dict(name = 'div', attrs = {'class' : 'post_next'}), - dict(name = 'div', attrs = {'class' : 'clr'}), - dict(name = 'div', attrs = {'class' : 'tw_button'}), - dict(name = 'div', attrs = {'style' : 'width:56px;height:60px;float:left;margin-right:10px'}) + dict(name = 'div', attrs = {'class' : ['newstags', 'tw_button', 'post_prev']}), + dict(name = 'div', attrs = {'id' : 'newspage_upinfo'}), ] - preprocess_regexps = [(re.compile(u'Komentarze: \(?[0-9]+\)? ?Komentarze: \(?[0-9]+\)? ?'), lambda match: '')]