From 7cfa28eb25c607f35b1cfa4a507002658b7e0bfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Wed, 10 Oct 2018 20:46:53 +0200 Subject: [PATCH 1/7] rss feed broken --- recipes/icons/rynek_kolejowy.png | Bin 284 -> 0 bytes recipes/rynek_kolejowy.recipe | 40 ------------------------------- 2 files changed, 40 deletions(-) delete mode 100644 recipes/icons/rynek_kolejowy.png delete mode 100644 recipes/rynek_kolejowy.recipe diff --git a/recipes/icons/rynek_kolejowy.png b/recipes/icons/rynek_kolejowy.png deleted file mode 100644 index 5a7a7f57ddc4d9ec062c4da39823e5fb621865f2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 284 zcmV+%0ptFOP)0R}Of77WYUWquLy z#_pERgAeb)*pGW}Oq`GJKMpS31O9ZT;nxc{bRiL8q%XpULNq}2sZbM~AU`@Fj~zBB zLkB>R;)M=1FmIqT)FJgq?Le#n=;3M^p^Q5Q5HmnKpRC-@M1ftQJYxN*CB`n01fPsO8AT|tnD|@s+b!AS_LB=!2Ge9v)^$9w&xjY i37?;-?}C75=;96YGD<;G%&$iP0000 Date: Sat, 13 Oct 2018 22:40:36 +0200 Subject: [PATCH 3/7] fix rmf24 recipes --- recipes/rmf24_ESKN.recipe | 11 ++++------- recipes/rmf24_fakty.recipe | 12 ++++-------- recipes/rmf24_opinie.recipe | 13 ++++--------- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/recipes/rmf24_ESKN.recipe b/recipes/rmf24_ESKN.recipe index ace2cf84c8..281f3a4c65 100644 --- a/recipes/rmf24_ESKN.recipe +++ b/recipes/rmf24_ESKN.recipe @@ -26,13 +26,11 @@ class RMF24_ESKN(BasicNewsRecipe): (u'Nauka', u'http://www.rmf24.pl/nauka/feed')] keep_only_tags = [ - dict(name='div', attrs={'class': 'box articleSingle print'})] + dict(name='header', attrs={'class': 'article-header'}), + dict(name='div', attrs={'class': 'article-container'})] - remove_tags = [ - dict(name='div', attrs={'class': 'toTop'}), - dict(name='div', attrs={'class': 'category'}), - dict(name='div', attrs={'class': 'REMOVE'}), - dict(name='div', attrs={'class': 'embed embedAd'})] + remove_tags = [dict(name='div', attrs={'id': 'ReklamaMobile'}), + dict(name='img', attrs={'class': 'img-responsive hidden-lg hidden-md hidden-sm'})] extra_css = ''' h1 { font-size: 1.2em; } @@ -42,7 +40,6 @@ class RMF24_ESKN(BasicNewsRecipe): (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ (r'

Zdj.cie

', lambda match: ''), - (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'), # noqa (r'', lambda match: 'REMOVE">'), # noqa (r'[^<]+)', - lambda match: '' + match.group('a') + ''), - (r'
(?P[^<]+)
', - lambda match: '

' + match.group('t') + '

'), - (r'
', lambda match: ''), # noqa - (r'
', lambda match: ''), - (r'
', lambda match: ''), - (r']+>(?P

[^<]+)', - lambda match: '' + match.group('p') + ''), - (r']+>(?P[^<]+)', lambda match: match.group('a')), - (r'Orygin[^<]+', - lambda match: ''), - (r'Poka[^<]+', lambda match: '')] - ] From cbef5c2c9a8070ba0f290a35050683f8d1a13343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Sat, 13 Oct 2018 23:11:36 +0200 Subject: [PATCH 6/7] fix purepc.pl recipe --- recipes/pure_pc.recipe | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/recipes/pure_pc.recipe b/recipes/pure_pc.recipe index 952285c017..ae599c4759 100644 --- a/recipes/pure_pc.recipe +++ b/recipes/pure_pc.recipe @@ -11,35 +11,10 @@ class PurePC(BasicNewsRecipe): description = u'Artykuły, aktualności, sprzęt, forum, chłodzenie, modding, urządzenia mobilne - wszystko w jednym miejscu.' category = 'IT' language = 'pl' - masthead_url = 'http://www.purepc.pl/themes/new/images/purepc.jpg' cover_url = 'http://www.purepc.pl/themes/new/images/purepc.jpg' extra_css = '.wykres_logo {float: left; margin-right: 5px;}' no_stylesheets = True - keep_only_tags = [dict(id='content')] - remove_tags_after = dict(attrs={'class': 'fivestar-widget'}) - remove_tags = [dict(id='navigator'), dict( - attrs={'class': ['box-tools', 'fivestar-widget', 'PageMenuList']})] + + keep_only_tags = [dict(name='div', attrs={'class':'node page0'})] + remove_tags = [dict(name='div', attrs={'class':'article-options'})] feeds = [(u'Wiadomo\u015bci', u'http://www.purepc.pl/node/feed')] - - def append_page(self, soup, appendtag): - lasturl = appendtag.find(attrs={'class': 'pager-last'}) - if lasturl: - regex = re.search('(.+?2C)(\d+)', lasturl.a['href']) - baseurl = regex.group(1).replace('?page=0%2C', '?page=1%2C') - baseurl = 'http://www.purepc.pl' + baseurl - nr = int(regex.group(2)) - for page_nr in range(1, nr + 1): - soup2 = self.index_to_soup(baseurl + str(page_nr)) - pagetext = soup2.find(attrs={'class': 'article'}) - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - for r in appendtag.findAll(attrs={'class': ['PageMenuList', 'pager', 'fivestar-widget']}): - r.extract() - comments = appendtag.findAll( - text=lambda text: isinstance(text, Comment)) - for comment in comments: - comment.extract() - - def preprocess_html(self, soup): - self.append_page(soup, soup.body) - return soup From 947dc24f1362a45cf2f9e50fb07b227381dbec7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Sat, 13 Oct 2018 23:16:54 +0200 Subject: [PATCH 7/7] broken feeds of a paywalled site --- recipes/icons/puls_biznesu.png | Bin 137 -> 0 bytes recipes/puls_biznesu.recipe | 48 --------------------------------- 2 files changed, 48 deletions(-) delete mode 100644 recipes/icons/puls_biznesu.png delete mode 100644 recipes/puls_biznesu.recipe diff --git a/recipes/icons/puls_biznesu.png b/recipes/icons/puls_biznesu.png deleted file mode 100644 index 0890c8750f5e92233db1fc9472f334a73d100759..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 137 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!93?!50ihlx9oB=)|t}}rQ28RFt|37_gy&cG6 zDhcun{?G7qyMY&wr|aqB7*cU7nS+&yO(CI0kb6PPwWiQso+#}w-INO#A{ty8jw)=D ea*$16N@QS|V!?le!#a2i$V5+9KbLh*2~7YbB_\d+,\d+)', url) - if article_id: - return 'http://www.pb.pl/actionprint/' + article_id.group('id') - else: - return url - - def get_cover_url(self): - soup = self.index_to_soup('http://archiwum.pb.pl/') - cover = soup.find(name='img', attrs={'class': 'cover_picture'}) - self.cover_url = cover['src'] - return getattr(self, 'cover_url', self.cover_url)