# Reconstructed from git patch b08854e60acb ("new Polish news sources",
# fenuks <fenuks@wp.pl>, 2013-06-17) — file: recipes/cdrinfo_pl.recipe

__license__ = 'GPL v3'
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class cdrinfo(BasicNewsRecipe):
    """Calibre recipe for CDRinfo.pl, a Polish data-storage news site.

    Fetches the site's RSS feeds and stitches multi-page articles
    (marked by an ``artnawigacja`` pager) into a single document.
    """
    title = u'CDRinfo.pl'
    __author__ = 'fenuks'
    description = u'Serwis poświęcony archiwizacji danych. Testy i recenzje nagrywarek. Programy do nagrywania płyt. Dyski twarde, dyski SSD i serwery sieciowe NAS. Rankingi dyskow twardych, najszybsze dyski twarde, newsy, artykuły, testy, recenzje, porady, oprogramowanie. Zestawienie nagrywarek, najnowsze biosy do nagrywarek, programy dla dysków twardych.'
    category = 'it, hardware'
    language = 'pl'
    cover_url = 'http://www.cdrinfo.pl/gfx/graph3/top.jpg'
    use_embedded_content = False
    oldest_article = 777
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style']
    # Strip the boilerplate netiquette notice appended below articles.
    # NOTE(review): the pattern was garbled in the patch (the opening
    # '<div' and closing '</div>' markup was eaten during extraction);
    # reconstructed as a <div>...</div> match — confirm against the
    # live site markup.
    preprocess_regexps = [
        (re.compile(u'<div[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\\.gravatar\\.com\\.</div>',
                    re.DOTALL),
         lambda match: '')
    ]
    ignore_duplicate_articles = {'title', 'url'}

    # The hidden <input name="ref"> carries the article's base path and is
    # needed by append_page(), so it must survive keep_only_tags.
    keep_only_tags = [dict(name='input', attrs={'name': 'ref'}), dict(id='text')]
    remove_tags = [dict(attrs={'class': ['navigation', 'sociable']}), dict(name='hr'), dict(id='respond')]
    remove_tags_after = dict(id='artnawigacja')
    feeds = [
        (u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'),
        (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'),
        (u'Konsole', 'http://konsole.cdrinfo.pl/rss/rss_konsole_news.xml'),
        (u'Pliki', 'http://www.cdrinfo.pl/rss/rss_pliki.xml'),
    ]

    def preprocess_html(self, soup):
        # Multi-page articles carry an 'artnawigacja' pager element.
        if soup.find(id='artnawigacja'):
            self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        """Fetch the remaining pages of a multi-page article.

        Each follow-up page's article text is appended to *appendtag*;
        the pager element itself is removed at the end.
        """
        baseurl = 'http://cdrinfo.pl' + soup.find(name='input', attrs={'name': 'ref'})['value'] + '/'
        # The hidden 'ref' value sometimes already ends with a slash;
        # avoid a double '//' in the constructed URL.
        if baseurl.endswith('//'):
            baseurl = baseurl[:-1]
        tag = soup.find(id='artnawigacja')
        div = tag.find('div', attrs={'align': 'right'})
        while div:
            # Retry the fetch up to 5 times; the site is flaky.
            soup2 = None
            for _ in range(5):
                try:
                    soup2 = self.index_to_soup(baseurl + div.a['href'])
                    break
                except Exception:
                    continue
            if soup2 is None:
                # BUG FIX: the original left soup2 unbound (NameError) on
                # the first page, or stale (infinite loop) on later pages,
                # when all retries failed. Give up on remaining pages.
                break
            tag2 = soup2.find(id='artnawigacja')
            # Guard: a follow-up page without a pager ends the loop.
            div = tag2.find('div', attrs={'align': 'right'}) if tag2 else None
            pagetext = soup2.find(attrs={'class': 'art'})
            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()
            for r in soup2.findAll(attrs={'class': 'star-rating'}):
                r.extract()
            for r in soup2.findAll(attrs={'class': 'star-rating2'}):
                r.extract()
            appendtag.insert(len(appendtag.contents), pagetext)
        tag.extract()


# --- file: recipes/gazeta_pl_bydgoszcz.recipe continues below ---
#!/usr/bin/env python
# Reconstructed from git patch b08854e60acb — file: recipes/gazeta_pl_bydgoszcz.recipe

__license__ = 'GPL v3'

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class gw_bydgoszcz(BasicNewsRecipe):
    """Calibre recipe for Gazeta Wyborcza Bydgoszcz (bydgoszcz.gazeta.pl).

    Resolves feedsportal redirect URLs to article URLs, skips
    paywalled ("piano") articles, and stitches multi-page articles
    (pager ``div#Str``) into a single document.
    """
    title = u'Gazeta Wyborcza Bydgoszcz'
    __author__ = 'fenuks'
    language = 'pl'
    description = 'Wiadomości z Bydgoszczy na portalu Gazeta.pl.'
    category = 'newspaper'
    publication_type = 'newspaper'
    masthead_url = 'http://bi.gazeta.pl/im/3/4089/m4089863.gif'
    INDEX = 'http://bydgoszcz.gazeta.pl'
    cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif'
    remove_empty_feeds = True
    oldest_article = 3
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}

    # rules for gazeta.pl
    # NOTE(review): markup tags inside this pattern may have been stripped
    # during patch extraction — verify against the site's "Czytaj więcej"
    # (read more) block.
    preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')]
    keep_only_tags = [dict(id='gazeta_article')]
    remove_tags = [
        dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']),
        dict(attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']}),
    ]
    remove_tags_after = dict(id='gazeta_article_body')

    feeds = [(u'Wiadomości', u'http://rss.feedsportal.com/c/32739/f/530239/index.rss')]

    def print_version(self, url):
        """Decode a feedsportal redirect URL into the article URL.

        Non-feedsportal URLs are returned unchanged. The 0X escape codes
        are feedsportal's URL-encoding scheme; '0C' (slash) must be
        decoded before the single-character codes.
        """
        if 'feedsportal.com' not in url:
            return url
        s = url.rpartition('gazeta0Bpl')
        u = s[2]
        if not s[0]:
            # Some items live under wyborcza.pl instead of gazeta.pl.
            u = url.rpartition('wyborcza0Bpl')[2]
        u = u.replace('/l/', '/')
        u = u.replace('/ia1.htm', '')
        u = u.replace('0Dbo0F1', '')
        u = u.replace('/story01.htm', '')
        u = u.replace('0C', '/')
        u = u.replace('A', '')
        u = u.replace('0E', '-')
        u = u.replace('0H', ',')
        u = u.replace('0I', '_')
        u = u.replace('0B', '.')
        return self.INDEX + u

    def preprocess_html(self, soup):
        tag = soup.find(id='Str')
        # Paywalled ("piano") articles are dropped entirely.
        if soup.find(attrs={'class': 'piano_btn_1'}):
            return None
        if tag and tag.findAll('a'):
            self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        """Fetch follow-up pages of a multi-page article.

        Follows the last link of the ``div#Str`` pager until the
        'następne' ("next") link disappears, appending each page's
        ``#artykul`` content to *appendtag*.
        """
        tag = soup.find('div', attrs={'id': 'Str'})
        # BUG FIX: the original used a bare `except: return 1` around this
        # lookup; without the canonical og:url we cannot build page links,
        # so just stop quietly (return code was never used by callers).
        meta = soup.find(name='meta', attrs={'property': 'og:url'})
        if meta is None or not meta.get('content'):
            return
        baseurl = meta['content']
        anchors = tag.findAll('a')
        link = anchors[-1] if anchors else None
        while link:
            soup2 = self.index_to_soup(baseurl + link['href'])
            pager = soup2.find('div', attrs={'id': 'Str'})
            next_anchors = pager.findAll('a') if pager else []
            link = next_anchors[-1] if next_anchors else None
            # BUG FIX: guard link.string being None before the substring
            # test (the original raised TypeError on tag-only anchors).
            if link is not None and (link.string is None or u'następne' not in link.string):
                link = None
            pagetext = soup2.find(id='artykul')
            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)
        tag.extract()

    def image_url_processor(self, baseurl, url):
        # Some image URLs in article markup carry stray leading
        # whitespace; strip it so the download succeeds.
        if url.startswith(' '):
            return url.strip()
        return url