From fc439f3d826828269319ee12b8bc01d4cc195a58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Tue, 9 Oct 2018 00:30:13 +0200 Subject: [PATCH 1/4] fix satkurier recipe --- recipes/satkurier.recipe | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/recipes/satkurier.recipe b/recipes/satkurier.recipe index dc4bca86cc..d67844c8cd 100644 --- a/recipes/satkurier.recipe +++ b/recipes/satkurier.recipe @@ -17,18 +17,11 @@ class SATKurier(BasicNewsRecipe): remove_javascript = True no_stylesheets = True - keep_only_tags = [] - keep_only_tags.append( - dict(name='div', attrs={'id': ['single_news', 'content']})) + keep_only_tags = [dict(name='div', attrs={'id': ['leftNewsContainer', 'content']})] - remove_tags = [] - remove_tags.append(dict(attrs={'id': ['news_info', 'comments']})) - remove_tags.append(dict(attrs={'href': '#czytaj'})) - remove_tags.append(dict(attrs={'align': 'center'})) - remove_tags.append(dict(attrs={'class': [ - 'date', 'category', 'right mini-add-comment', 'socialLinks', 'commentlist']})) + remove_tags = [dict(name='div', attrs={'class': ['col-xs-20', 'coverNews','btn-group']})] - remove_tags_after = [(dict(id='entry'))] + remove_tags_after = [dict(name='div',attrs={'class':'btn-group'})] feeds = [(u'Najnowsze wiadomości', u'http://feeds.feedburner.com/satkurierpl?format=xml'), (u'Sport w telewizji', From c7fc794a089ddb1cfc77fb1c84889751bcb9a81f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Tue, 9 Oct 2018 01:02:07 +0200 Subject: [PATCH 2/4] update rzeczpospolita recipe --- recipes/rzeczpospolita.recipe | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/recipes/rzeczpospolita.recipe b/recipes/rzeczpospolita.recipe index 6d91d3f913..9fa67ad7de 100644 --- a/recipes/rzeczpospolita.recipe +++ b/recipes/rzeczpospolita.recipe @@ -5,7 +5,6 @@ class RzeczpospolitaRecipe(BasicNewsRecipe): __license__ = 'GPL v3' __author__ = u'kwetal, Tomasz Dlugosz, adrianf0' language = 'pl' - version = 2 title = u'Rzeczpospolita OnLine' publisher = u'Presspublica Sp.' @@ -25,28 +24,15 @@ class RzeczpospolitaRecipe(BasicNewsRecipe): feeds.append((u"Prawo", u'http://www.rp.pl/rss/1037')) # Prawo keep_only_tags = [] - keep_only_tags.append(dict(name='div', attrs={'class': 'article-content'})) + keep_only_tags.append(dict(name='h1', attrs={'id': 'article-title'})) + keep_only_tags.append(dict(name='img', attrs={'class': 'img-responsive article__image'})) + keep_only_tags.append(dict(name='div', attrs={'class': ['article-content', 'article__lead js-voice-read', 'article__content js-voice-read','article__image-desc','article__image-author']})) remove_tags = [] - remove_tags.append(dict(name='div', attrs={'id': 'article-copyright-box'})) - remove_tags.append(dict(name='div', attrs={'class': 'article-footer'})) - remove_tags.append(dict(name='div', attrs={'class': 'article-tags'})) + remove_tags.append(dict(name='div', attrs={'class': 'related-articles__wrapper'})) + remove_tags.append(dict(name='span', attrs={'class': ['article__premium-player','ad-label']})) extra_css = ''' - body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} - h1{text-align: left;} - h2{font-size: medium; font-weight: bold;} - p.lead {font-weight: bold; text-align: left;} - .authordate {font-size: small; color: #696969;} - .fot{font-size: x-small; color: #666666;} - ''' - - # def skip_ad_pages(self, soup): - # if ('advertisement' in soup.find('title').string.lower()): - # href = soup.find('a').get('href') - # return self.index_to_soup(href, raw=True) - # else: - # return None - - def print_version(self, url): - return url + '?template=printart' + div.article__image-desc {font-style:italic; font-size:70%;text-align:right} + div.article__image-author {font-size:60%;text-align:right} + ''' From c4b92cebda901e301123c1f2574df6246bf42553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Tue, 9 Oct 2018 01:08:12 +0200 Subject: [PATCH 3/4] remove some broken recipes --- recipes/icons/spiders_web_pl.png | Bin 739 -> 0 bytes recipes/icons/sport_pl.png | Bin 542 -> 0 bytes recipes/spiders_web_pl.recipe | 20 --------- recipes/sport_pl.recipe | 75 ------------------------------- 4 files changed, 95 deletions(-) delete mode 100644 recipes/icons/spiders_web_pl.png delete mode 100644 recipes/icons/sport_pl.png delete mode 100644 recipes/spiders_web_pl.recipe delete mode 100644 recipes/sport_pl.recipe diff --git a/recipes/icons/spiders_web_pl.png b/recipes/icons/spiders_web_pl.png deleted file mode 100644 index c2e65a2b75f16f6a1484780f803bbf5af557fde4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 739 zcmV<90v!E`P)%7L{3)p%}_&qt`VVHEpU<8!gL} z1{IskrBF#T_ceFZMw_|nxZL@7?uTOzSqFZ6+~xew`Od+%s84u{ ziknY`H-71SsXX(@Z2zPK!L7r_*=bQ&h@9>k?HSa!iQT15=+!l&?L{5RZ>U$)XQIYS zWRElrX*w4Gn+T3rb9NLaX0_GPMaVo6j^uqoNOBHD>VXjCoW6$=Sq6%|laY2PRD9)~ z-2c70-JReF=4?-BTGVpv#;_Q8)AV_cn zF+Zq|g>~1&ymXZWOjxz0xkXGg58~^yYUm@iD7ugc)zL5#lu3eC0d}?2gCUlh*y0Bq z{$Uo;f+fMsGW+tl3YxPqP*RO3I^oZXTzn~o=SCZPSyXXZ#Q)4RYCzT9d?f6;%rfBj zTtT6ylII#T%1zm)?y!$TA3LyM5Lon$%=~L!suR0Tic2u`_X~J#tg$aEkJ3}v`56m{2fSsm6}%M(2k zo#e>(P`n=b)MLLkNYfo7pus@a$=ftEL5z37*^+=w9SS@YQuN8X_!4*G&nOdGacKoJ z13xhMz70dQomiL=U^PSE1WM&9&PHLu@Q8Y%HumBqk_?qCrhbnTwNCLsf0{s#UwO zfmt(Va&mB(8=HVM0G%Pk%de)Ux@Y(9AK$RVqjb_e?Cy+ z?c2BQt?g4LO`bY=^0X;aW=xw_T~V2qmZq+(EGa4mbrR4$3=9k#*KGi5`1tYTq6G_A zuUfrk_3E{&SFc~Y7N{pEFp!y%5vUBP7;1w6AHSQ6>%#f-&!0Pg_3G72mo8nteCgcT zbC)k&0-AF2#0j8@N^#K?GH z|A8-GzI;LgUp{~S@ZrO~yLb2Q*&`(;Dk>-h3{0p7elG4!8#k?8yJq>aWy_a?5Rd_k zkooiHXQgLwb8vF8u>+$OXamC2EZ6`m7)gkT0u=yNLo`4WA27(&lz~VEj({RSQ-KJQ g>>voJnB?RQ0D3_8{q68H!2kdN07*qoM6N<$f{NAVk^lez diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe deleted file mode 100644 index b8c5ffffba..0000000000 --- a/recipes/spiders_web_pl.recipe +++ /dev/null @@ -1,20 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class SpidersWeb(BasicNewsRecipe): - title = u"Spider's Web" - oldest_article = 7 - __author__ = 'fenuks' - description = u'Autorskie teksty popularnych blogerów, testy sprzętu i aplikacji, oraz wiele więcej.' - cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png' - category = 'IT, WEB' - language = 'pl' - no_stylesheers = True - remove_javascript = True - use_embedded_content = False - max_articles_per_feed = 100 - keep_only_tags = [dict(id='start')] - remove_tags_after = dict(attrs={'class': 'padding20'}) - remove_tags = [dict(name='div', attrs={ - 'class': ['padding border-bottom', 'padding20', 'padding border-top']})] - feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')] diff --git a/recipes/sport_pl.recipe b/recipes/sport_pl.recipe deleted file mode 100644 index 0320012605..0000000000 --- a/recipes/sport_pl.recipe +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -__copyright__ = 'teepel 2012' - -''' -sport.pl -''' - -from calibre.web.feeds.news import BasicNewsRecipe - - -class sport_pl(BasicNewsRecipe): - title = 'Sport.pl' - __author__ = 'teepel ' - language = 'pl' - description = u'Największy portal sportowy w Polsce. Wiadomości sportowe z najważniejszych wydarzeń, relacje i wyniki meczów na żywo.' - masthead_url = 'http://press.gazeta.pl/file/mediakit/154509/c8/sportpl.jpg' - oldest_article = 1 - max_articles_per_feed = 100 - remove_javascript = True - no_stylesheets = True - remove_empty_feeds = True - ignore_duplicate_articles = {'title', 'url'} - keep_only_tags = [] - keep_only_tags.append(dict(name='div', attrs={'id': 'article'})) - - remove_tags = [] - remove_tags.append(dict(name='a', attrs={'href': 'www.gazeta.pl'})) - - feeds = [ - (u'Wszystkie wiadomości', u'http://rss.gazeta.pl/pub/rss/sport.xml'), - (u'Piłka nożna', - u'http://www.sport.pl/pub/rss/sport/pilka_nozna.htm'), - (u'F1', u'http://www.sport.pl/pub/rss/sportf1.htm'), - (u'Tenis', u'http://serwisy.gazeta.pl/pub/rss/tenis.htm'), - (u'Siatkówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611628/index.rss'), - (u'Koszykówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611647/index.rss'), - (u'Piłka ręczna', - u'http://gazeta.pl.feedsportal.com/c/32739/f/611635/index.rss'), - (u'Inne sporty', u'http://gazeta.pl.feedsportal.com/c/32739/f/611649/index.rss'), - ] - - def parse_feeds(self): - feeds = BasicNewsRecipe.parse_feeds(self) - for feed in feeds: - for article in feed.articles[:]: - if '[ZDJĘCIA]' in article.title: - article.title = article.title.replace('[ZDJĘCIA]', '') - elif '[WIDEO]' in article.title: - article.title = article.title.replace('[WIDEO]', '') - return feeds - - def print_version(self, url): - if 'feedsportal' in url: - segment = url.split('/') - urlPart = segment[-2] - urlPart = urlPart.replace('0L0Ssport0Bpl0C', '') - urlPart = urlPart.replace('0C10H', '/') - urlPart = urlPart.replace('0H', ',') - urlPart = urlPart.replace('0I', '_') - urlPart = urlPart.replace('A', '') - segment1 = urlPart.split('/') - seg1 = segment1[0] - seg2 = segment1[1] - segment2 = seg2.split(',') - part = segment2[0] + ',' + segment2[1] - return 'http://www.sport.pl/' + seg1 + '/2029020,' + part + '.html' - else: - segment = url.split('/') - part2 = segment[-2] - part1 = segment[-1] - segment2 = part1.split(',') - part = segment2[1] + ',' + segment2[2] - return 'http://www.sport.pl/' + part2 + '/2029020,' + part + '.html' From 04d5823308e79b8013d54c078a48f8fe0de8c1df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Tue, 9 Oct 2018 01:21:09 +0200 Subject: [PATCH 4/4] remove recipes for no content or missing webpage --- recipes/dzial_zagraniczny.recipe | 28 ------- recipes/homopedia_pl.recipe | 32 -------- recipes/icons/dzial_zagraniczny.png | Bin 438 -> 0 bytes recipes/icons/homopedia_pl.png | Bin 314 -> 0 bytes recipes/odkrywcy_pl.recipe | 111 ---------------------------- recipes/rybinski.recipe | 29 -------- 6 files changed, 200 deletions(-) delete mode 100644 recipes/dzial_zagraniczny.recipe delete mode 100644 recipes/homopedia_pl.recipe delete mode 100644 recipes/icons/dzial_zagraniczny.png delete mode 100644 recipes/icons/homopedia_pl.png delete mode 100644 recipes/odkrywcy_pl.recipe delete mode 100644 recipes/rybinski.recipe diff --git a/recipes/dzial_zagraniczny.recipe b/recipes/dzial_zagraniczny.recipe deleted file mode 100644 index df5d325d15..0000000000 --- a/recipes/dzial_zagraniczny.recipe +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -__author__ = 'teepel ' - -''' -dzialzagraniczny.pl -''' - -from calibre.web.feeds.news import BasicNewsRecipe - - -class dzial_zagraniczny(BasicNewsRecipe): - title = u'Dział Zagraniczny' - __author__ = 'teepel ' - language = 'pl' - description = u'Polskiego czytelnika to nie interesuje' - INDEX = 'http://dzialzagraniczny.pl' - extra_css = 'img {display: block;}' - oldest_article = 7 - cover_url = 'https://fbcdn-profile-a.akamaihd.net/hprofile-ak-prn1/c145.5.160.160/559442_415653975115959_2126205128_n.jpg' - max_articles_per_feed = 100 - remove_empty_feeds = True - remove_javascript = True - no_stylesheets = True - use_embedded_content = True - - feeds = [(u'Dział zagraniczny', u'http://feeds.feedburner.com/dyndns/UOfz')] diff --git a/recipes/homopedia_pl.recipe b/recipes/homopedia_pl.recipe deleted file mode 100644 index 6c1cc74e08..0000000000 --- a/recipes/homopedia_pl.recipe +++ /dev/null @@ -1,32 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class AdvancedUserRecipe1325420346(BasicNewsRecipe): - title = u'Homopedia' - __author__ = 'rainbowwarrior' - language = 'pl' - oldest_article = 7 - max_articles_per_feed = 100 - publication_type = 'newspaper' - masthead_url = 'http://a5.sphotos.ak.fbcdn.net/hphotos-ak-snc6/67335_168352243178437_166186720061656_594975_5800720_n.jpg' - encoding = 'utf-8' - - def get_cover_url(self): - return 'http://a7.sphotos.ak.fbcdn.net/hphotos-ak-snc4/65568_166186970061631_166186720061656_580324_7584264_n.jpg' - - feeds = [ - (u'Nowe has\u0142a', u'http://www.homopedia.pl/w/index.php?title=Specjalna:Nowe_strony&feed=atom&hideliu=&hidepatrolled=&hidebots=&hideredirs=1&limit=50&namespace=0'), # noqa - - (u'Blog', u'http://blog.homopedia.pl/feeds/posts/default')] - - def get_article_url(self, article): - artl = article.get('link', None) - rest, sep, article_id = artl.rpartition('/') - return 'http://www.homopedia.pl/w/index.php?redirect=no&printable=yes&title=' + article_id - - remove_tags = [dict(name='div', attrs={'class': 'noprint'}), dict(name='ul', attrs={'class': 'noprint'}), dict(name='ul', attrs={'id': 'footer-places'}), dict(name='li', attrs={'id': 'footer-info-viewcount'}), dict(name='span', attrs={'class': 'editsection'}), dict(name='div', attrs={'id': 'jump-to-nav'})] # noqa - - remove_tags_before = dict(dict(name='h2', attrs={'class': 'post-title'})) - remove_tags_after = dict(dict(name='a', attrs={'class': 'timestamp-link'})) - - extra_css = 'p{text-indent:1.5em!important;padding:0!important;margin;0!important}' diff --git a/recipes/icons/dzial_zagraniczny.png b/recipes/icons/dzial_zagraniczny.png deleted file mode 100644 index d7ff0cc303d33ee2c1ba875d971da363b46f5bed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 438 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbMfmH?j+R~9A~7B&_hE*>5}9uXlC z5it=NDH$0#85t!R6(toFH5DB#9UVO#6C)E7GZPytAhNNsv$1iqadC2SadUC;0+K#H zUOqm4J|RINAz>jQQ6VvLF*#W|Ie9rbMI|LbR8~?`RZ~+})6>^8Wzv)>)27UsH)qMR zC2Ll#S+j1&~5fcb+_Z^5oT1I{`~s$`C`uop!@fFx;TbNNU|P0?ZxaU;BYaz|5~DvN@IhzcwWqp|Melr2f-8oliu5V@|Jg2hu>vGlz|M~UI>iz#N`H4?+ zp8hgw*;+Ys`TQGI`%JqFKJSU|&(#ijmT`Bb=doSIYZnG7MTI6UQrmT(HENz^NP19~ z)b(i@Uw>V6?r~U~uCHvnojY~c!`Qsk34dzopr0EFeWv;Y7A diff --git a/recipes/icons/homopedia_pl.png b/recipes/icons/homopedia_pl.png deleted file mode 100644 index 56a579cad6ec37b7b9da0c03006edb6d089cc764..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 314 zcmV-A0mc4_P)jrKOzaIisG;g>F;oaCZ705mR3R}d5 zFferQzQ4QAh9ROC9)is#ARo+HvuVXHj8OXU self.oldest_article: - continue - tmp = i.find('a') - title = tmp.string - url = self.INDEX + tmp['href'] - articles.append({'title': title, - 'url': url, - 'date': '', - 'description': '' - }) - return articles - - def parse_index(self): - feeds = [] - feeds.append((u'Człowiek', self.find_articles( - 'http://odkrywcy.pl/kat,111396,name,Czlowiek,kategoria.html'))) - feeds.append((u'Technologie', self.find_articles( - 'http://odkrywcy.pl/kat,111398,name,Technologie,kategoria.html'))) - feeds.append((u'Ekologia', self.find_articles( - 'http://odkrywcy.pl/kat,111400,name,Ekologia,kategoria.html'))) - feeds.append((u'Kosmos', self.find_articles( - 'http://odkrywcy.pl/kat,111402,name,Kosmos,kategoria.html'))) - feeds.append((u'Cywilizacja', self.find_articles( - 'http://odkrywcy.pl/kat,111404,name,Cywilizacja,kategoria.html'))) - feeds.append((u'Przyroda', self.find_articles( - 'http://odkrywcy.pl/kat,111406,name,Przyroda,kategoria.html'))) - feeds.append((u'Fizyka i chemia', self.find_articles( - 'http://odkrywcy.pl/kat,111408,name,Fizyka,kategoria.html'))) - feeds.append((u'Historia', self.find_articles( - 'http://odkrywcy.pl/kat,122994,name,Historia,kategoria.html'))) - feeds.append((u'Media', self.find_articles( - 'http://odkrywcy.pl/kat,116794,name,Media,media.html'))) - - return feeds - - def append_page(self, soup, appendtag): - tag = soup.find('a', attrs={'class': 'btnNext'}) - urls = [] - while tag is not None: - if tag['href'] in urls: - break - urls.append(tag['href']) - soup2 = self.index_to_soup(self.INDEX + tag['href']) - tag = soup2.find(name='a', attrs={'class': 'btnNext'}) - pagetext = soup2.findAll(attrs={'class': 'content'}) - for container in pagetext: - header = container.find(name='h1') - if header: - header.extract() - for comment in container.findAll(text=lambda text: isinstance(text, Comment)): - comment.extract() - for container in pagetext: - pos = len(appendtag.contents) - appendtag.insert(pos, container) - for r in appendtag.findAll(attrs={'class': 'galStr'}): - r.extract() - for r in appendtag.findAll(attrs={'alt': 'Następne'}): - r.extract() - for r in appendtag.findAll(attrs={'alt': 'Poprzednie'}): - r.extract() - for r in appendtag.findAll(attrs={'class': 'clra'}): - r.extract() - for r in appendtag.findAll(attrs={'class': 'close'}): - r.extract() - for r in appendtag.findAll(attrs={'class': 'tagi'}): - r.extract() - for r in appendtag.findAll(attrs={'id': 'moreTopNews'}): - r.extract() - - def preprocess_html(self, soup): - self.append_page(soup, soup.body) - return soup diff --git a/recipes/rybinski.recipe b/recipes/rybinski.recipe deleted file mode 100644 index ce862cb445..0000000000 --- a/recipes/rybinski.recipe +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -__copyright__ = u'2012, Tomasz Dlugosz ' -''' -rybinski.eu -''' - -from calibre.web.feeds.news import BasicNewsRecipe - - -class Rybinski(BasicNewsRecipe): - title = u'Rybinski.eu - economy of the XXI century' - description = u'Blog ekonomiczny dra hab. Krzysztofa Rybi\u0144skiego' - language = 'pl' - __author__ = u'Tomasz D\u0142ugosz' - oldest_article = 7 - max_articles_per_feed = 100 - no_stylesheets = True - - feeds = [(u'wpisy', u'http://www.rybinski.eu/?feed=rss2&lang=pl')] - - keep_only_tags = [dict(name='div', attrs={'class': 'post'})] - - remove_tags = [ - dict(name='div', attrs={'class': 'post-meta-1'}), - dict(name='div', attrs={'class': 'post-meta-2'}), - dict(name='div', attrs={'class': 'post-comments'}) - ]