diff --git a/.bzrignore b/.bzrignore
index 6b6450f1f9..8711782023 100644
--- a/.bzrignore
+++ b/.bzrignore
@@ -40,6 +40,7 @@ recipes/.gitignore
 recipes/README.md
 recipes/icon_checker.py
 recipes/readme_updater.py
+recipes/garfield.recipe
 recipes/katalog_egazeciarz.recipe
 recipes/tv_axnscifi.recipe
 recipes/tv_comedycentral.recipe
@@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
 recipes/tv_tvpuls.recipe
 recipes/tv_viasathistory.recipe
 recipes/icons/katalog_egazeciarz.png
+recipes/icons/garfield.png
 recipes/icons/tv_axnscifi.png
 recipes/icons/tv_comedycentral.png
 recipes/icons/tv_discoveryscience.png
diff --git a/recipes/esensja_(rss).recipe b/recipes/esensja_(rss).recipe
index af23ea58a9..0afa2b0d07 100644
--- a/recipes/esensja_(rss).recipe
+++ b/recipes/esensja_(rss).recipe
@@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
     language = 'pl'
     encoding = 'utf-8'
     INDEX = 'http://www.esensja.pl'
-    extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
-        .t-author {font-size: x-small; text-align: left}
-        .t-title2 {font-size: x-small; font-style: italic; text-align: left}
-        .text {font-size: small; text-align: left}
-        .annot-ref {font-style: italic; text-align: left}
-        '''
     cover_url = ''
     masthead_url = 'http://esensja.pl/img/wrss.gif'
     use_embedded_content = False
diff --git a/recipes/forbes_pl.recipe b/recipes/forbes_pl.recipe
new file mode 100644
index 0000000000..b794fc5fa1
--- /dev/null
+++ b/recipes/forbes_pl.recipe
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import datetime
+import re
+
+class forbes_pl(BasicNewsRecipe):
+    title = u'Forbes.pl'
+    __author__ = 'Artur Stachecki'
+    language = 'pl'
+    description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finansowe i strategiczne.'
+    oldest_article = 1
+    index = 'http://www.forbes.pl'
+    cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
+    max_articles_per_feed = 100
+    extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
+    preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
+    remove_javascript = True
+    no_stylesheets = True
+    now = datetime.datetime.now()
+    yesterday = now - datetime.timedelta(hours=24)
+    yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
+    pages_count = 4
+    keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
+    remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]
+
+    feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]
+
+    '''def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        return soup
+
+
+    def append_page(self, soup, appendtag):
+        cleanup = False
+        nexturl = appendtag.find('a', attrs={'class':'next'})
+        if nexturl:
+            cleanup = True
+        while nexturl:
+            soup2 = self.index_to_soup(self.index + nexturl['href'])
+            nexturl = soup2.find('a', attrs={'class':'next'})
+            pagetext = soup2.findAll(id='article-body-wrapper')
+            if not pagetext:
+                pagetext = soup2.findAll(attrs={'class':'Article-Entry Styled'})
+            for comment in pagetext.findAll(text=lambda text:isinstance(text, Comment)):
+                comment.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+        if cleanup:
+            for r in appendtag.findAll(attrs={'class':'paginator'}):
+                r.extract()'''
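Reviewer's note: the preprocess_html/append_page pair above is committed commented out, and as written it would fail if ever re-enabled: Comment is never imported, and findAll() returns a ResultSet, which has no findAll() method of its own and cannot be passed to insert() as a single node. A minimal corrected sketch of the same pagination idea (same selectors as the patch, untested against the live site):

from calibre.ebooks.BeautifulSoup import Comment

    def append_page(self, soup, appendtag):
        # Walk the 'next' links, gluing each page's article body onto the first page.
        cleanup = False
        nexturl = appendtag.find('a', attrs={'class': 'next'})
        if nexturl:
            cleanup = True
        while nexturl:
            soup2 = self.index_to_soup(self.index + nexturl['href'])
            nexturl = soup2.find('a', attrs={'class': 'next'})
            # find() rather than findAll(): we need a single Tag we can insert
            pagetext = soup2.find(id='article-body-wrapper')
            if pagetext is None:
                pagetext = soup2.find(attrs={'class': 'Article-Entry Styled'})
            if pagetext is None:
                break
            # Strip HTML comments before appending
            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)
        if cleanup:
            # The pagination widget is useless in the flattened article
            for r in appendtag.findAll(attrs={'class': 'paginator'}):
                r.extract()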
diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe
index 59b3b00933..0f7633e4b2 100644
--- a/recipes/gazeta_pl_krakow.recipe
+++ b/recipes/gazeta_pl_krakow.recipe
@@ -10,7 +10,7 @@ krakow.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class gw_krakow(BasicNewsRecipe):
-    title = u'Gazeta.pl Kraków'
+    title = u'Gazeta Wyborcza Kraków'
     __author__ = 'teepel based on GW from fenuks'
     language = 'pl'
     description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'
diff --git a/recipes/gazeta_pl_szczecin.recipe b/recipes/gazeta_pl_szczecin.recipe
index af229c5721..501b25dfe5 100644
--- a/recipes/gazeta_pl_szczecin.recipe
+++ b/recipes/gazeta_pl_szczecin.recipe
@@ -5,7 +5,7 @@ import string
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class GazetaPlSzczecin(BasicNewsRecipe):
-    title = u'Gazeta.pl Szczecin'
+    title = u'Gazeta Wyborcza Szczecin'
     description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
     __author__ = u'Michał Szkutnik'
     __license__ = u'GPL v3'
diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe
index 9e10a0610c..6a37a96885 100644
--- a/recipes/gazeta_pl_warszawa.recipe
+++ b/recipes/gazeta_pl_warszawa.recipe
@@ -10,7 +10,7 @@ warszawa.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class gw_wawa(BasicNewsRecipe):
-    title = u'Gazeta.pl Warszawa'
+    title = u'Gazeta Wyborcza Warszawa'
     __author__ = 'teepel based on GW from fenuks'
     language = 'pl'
     description ='Wiadomości z Warszawy na portalu Gazeta.pl.'
diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe
index c415edc9d0..310077cdec 100644
--- a/recipes/gazeta_wyborcza.recipe
+++ b/recipes/gazeta_wyborcza.recipe
@@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Comment
 
 class Gazeta_Wyborcza(BasicNewsRecipe):
-    title = u'Gazeta.pl'
+    title = u'Gazeta Wyborcza'
     __author__ = 'fenuks, Artur Stachecki'
     language = 'pl'
     description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
diff --git a/recipes/icons/forbes_pl.png b/recipes/icons/forbes_pl.png
new file mode 100644
index 0000000000..feaa47487a
Binary files /dev/null and b/recipes/icons/forbes_pl.png differ
diff --git a/recipes/icons/gazeta_pl_krakow.png b/recipes/icons/gazeta_pl_krakow.png
index 119afbba3a..49d76d2ddc 100644
Binary files a/recipes/icons/gazeta_pl_krakow.png and b/recipes/icons/gazeta_pl_krakow.png differ
diff --git a/recipes/icons/gazeta_pl_szczecin.png b/recipes/icons/gazeta_pl_szczecin.png
index 119afbba3a..49d76d2ddc 100644
Binary files a/recipes/icons/gazeta_pl_szczecin.png and b/recipes/icons/gazeta_pl_szczecin.png differ
diff --git a/recipes/icons/gazeta_pl_warszawa.png b/recipes/icons/gazeta_pl_warszawa.png
index 119afbba3a..49d76d2ddc 100644
Binary files a/recipes/icons/gazeta_pl_warszawa.png and b/recipes/icons/gazeta_pl_warszawa.png differ
diff --git a/recipes/icons/gazeta_wyborcza.png b/recipes/icons/gazeta_wyborcza.png
index 119afbba3a..49d76d2ddc 100644
Binary files a/recipes/icons/gazeta_wyborcza.png and b/recipes/icons/gazeta_wyborcza.png differ
diff --git a/recipes/icons/slashdot.png b/recipes/icons/slashdot.png
new file mode 100644
index 0000000000..5e7487244b
Binary files /dev/null and b/recipes/icons/slashdot.png differ
diff --git a/recipes/icons/sportowefakty.png b/recipes/icons/sportowefakty.png
new file mode 100644
index 0000000000..0128c34f26
Binary files /dev/null and b/recipes/icons/sportowefakty.png differ
diff --git a/recipes/icons/wysokie_obcasy.png b/recipes/icons/wysokie_obcasy.png
new file mode 100644
index 0000000000..3ab94b3c66
Binary files /dev/null and b/recipes/icons/wysokie_obcasy.png differ
diff --git a/recipes/sportowefakty.recipe b/recipes/sportowefakty.recipe
new file mode 100644
index 0000000000..b4186d3283
--- /dev/null
+++ b/recipes/sportowefakty.recipe
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.magick import Image
+
+class sportowefakty(BasicNewsRecipe):
+    title = u'SportoweFakty'
+    __author__ = 'Artur Stachecki, Tomasz Długosz'
+    language = 'pl'
+    description = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
+    oldest_article = 1
+    masthead_url='http://www.sportowefakty.pl/images/logo.png'
+    max_articles_per_feed = 100
+    simultaneous_downloads = 5
+    use_embedded_content=False
+    remove_javascript=True
+    no_stylesheets=True
+    ignore_duplicate_articles = {'title', 'url'}
+
+    keep_only_tags = [dict(attrs = {'class' : 'box-article'})]
+    remove_tags =[]
+    remove_tags.append(dict(attrs = {'class' : re.compile(r'^newsStream')}))
+    remove_tags.append(dict(attrs = {'target' : '_blank'}))
+
+    feeds = [
+        (u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
+        (u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
+        (u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
+        (u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
+        (u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
+        (u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
+        (u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
+        (u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
+    ]
+
+    def get_article_url(self, article):
+        link = article.get('link', None)
+        if 'utm_source' in link:
+            return link.split('?utm')[0]
+        else:
+            return link
+
+    def print_version(self, url):
+        print_url = url + '/drukuj'
+        return print_url
+
+    def preprocess_html(self, soup):
+        head = soup.find('h1')
+        if 'Fotorelacja' in self.tag_to_string(head):
+            return None
+        else:
+            for alink in soup.findAll('a'):
+                if alink.string is not None:
+                    tstr = alink.string
+                    alink.replaceWith(tstr)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            if img < 0:
+                raise RuntimeError('Out of memory')
+            img.type = "GrayscaleType"
+            img.save(iurl)
+        return soup
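Reviewer's note: get_article_url above strips tracking by splitting on '?utm', which discards every query parameter when utm_source happens to lead the query string, and leaves the tracking in place when it does not (e.g. '?page=2&utm_source=rss'). A more surgical variant (hypothetical, stdlib only, not part of this patch) would filter just the utm_* keys:

from urlparse import urlparse, urlunparse, parse_qsl
from urllib import urlencode

def strip_utm(link):
    # Keep every query parameter except the utm_* tracking ones.
    parts = urlparse(link)
    clean = [(k, v) for k, v in parse_qsl(parts.query) if not k.startswith('utm_')]
    return urlunparse(parts[:4] + (urlencode(clean), parts.fragment))

For example, strip_utm('http://www.sportowefakty.pl/pilka-nozna/artykul?page=2&utm_source=rss') returns 'http://www.sportowefakty.pl/pilka-nozna/artykul?page=2', preserving the legitimate parameter.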
diff --git a/recipes/wyborcza_duzy_format.recipe b/recipes/wyborcza_duzy_format.recipe
deleted file mode 100644
index 30b0cfe418..0000000000
--- a/recipes/wyborcza_duzy_format.recipe
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python
-
-from calibre.web.feeds.recipes import BasicNewsRecipe
-
-class GazetaWyborczaDuzyForma(BasicNewsRecipe):
-    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
-    title = u"Gazeta Wyborcza Duzy Format"
-    __author__ = 'ravcio - rlelusz[at]gmail.com'
-    description = u"Articles from Gazeta's website"
-    language = 'pl'
-    max_articles_per_feed = 50 #you can increade it event up to maybe 600, should still work
-    recursions = 0
-    encoding = 'iso-8859-2'
-    no_stylesheets = True
-    remove_javascript = True
-    use_embedded_content = False
-
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id':['k1']})
-    ]
-
-    remove_tags = [
-        dict(name='div', attrs={'class':['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
-        ,dict(name='div', attrs={'id':['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
-        ,dict(name='ul', attrs={'id':['articleToolbar']})
-        ,dict(name='img', attrs={'class':['brand']})
-        ,dict(name='h5', attrs={'class':['author']})
-        ,dict(name='h6', attrs={'class':['date']})
-        ,dict(name='p', attrs={'class':['txt_upl']})
-    ]
-
-    remove_tags_after = [
-        dict(name='div', attrs={'id':['Str']}) #nawigator numerow linii
-    ]
-
-    def load_article_links(self, url, count):
-        print '--- load_article_links', url, count
-
-        #page with link to articles
-        soup = self.index_to_soup(url)
-
-        #table with articles
-        list = soup.find('div', attrs={'class':'GWdalt'})
-
-        #single articles (link, title, ...)
-        links = list.findAll('div', attrs={'class':['GWdaltE']})
-
-        if len(links) < count:
-            #load links to more articles...
-
-            #remove new link
-            pages_nav = list.find('div', attrs={'class':'pages'})
-            next = pages_nav.find('a', attrs={'class':'next'})
-            if next:
-                print 'next=', next['href']
-                url = 'http://wyborcza.pl' + next['href']
-                #e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
-
-                older_links = self.load_article_links(url, count - len(links))
-                links.extend(older_links)
-
-        return links
-
-
-    #produce list of articles to download
-    def parse_index(self):
-        print '--- parse_index'
-
-        max_articles = 8000
-        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
-
-        ans = []
-        key = None
-        articles = {}
-
-        key = 'Uncategorized'
-        articles[key] = []
-
-        for div_art in links:
-            div_date = div_art.find('div', attrs={'class':'kL'})
-            div = div_art.find('div', attrs={'class':'kR'})
-
-            a = div.find('a', href=True)
-
-            url = a['href']
-            title = a.string
-            description = ''
-            pubdate = div_date.string.rstrip().lstrip()
-            summary = div.find('span', attrs={'class':'lead'})
-
-            desc = summary.find('a', href=True)
-            if desc:
-                desc.extract()
-
-            description = self.tag_to_string(summary, use_alt=False)
-            description = description.rstrip().lstrip()
-
-            feed = key if key is not None else 'Duzy Format'
-
-            if not articles.has_key(feed):
-                articles[feed] = []
-
-            if description != '': # skip just pictures atricle
-                articles[feed].append(
-                    dict(title=title, url=url, date=pubdate,
-                         description=description,
-                         content=''))
-
-        ans = [(key, articles[key])]
-        return ans
-
-    def append_page(self, soup, appendtag, position):
-        pager = soup.find('div',attrs={'id':'Str'})
-        if pager:
-            #seek for 'a' element with nast value (if not found exit)
-            list = pager.findAll('a')
-
-            for elem in list:
-                if 'nast' in elem.string:
-                    nexturl = elem['href']
-
-                    soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
-
-                    texttag = soup2.find('div', attrs={'id':'artykul'})
-
-                    newpos = len(texttag.contents)
-                    self.append_page(soup2,texttag,newpos)
-                    texttag.extract()
-                    appendtag.insert(position,texttag)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body, 3)
-
-        # finally remove some tags
-        pager = soup.find('div',attrs={'id':'Str'})
-        if pager:
-            pager.extract()
-
-        pager = soup.find('div',attrs={'class':'tylko_int'})
-        if pager:
-            pager.extract()
-
-        return soup
diff --git a/recipes/wysokie_obcasy.recipe b/recipes/wysokie_obcasy.recipe
new file mode 100644
index 0000000000..332bc6138d
--- /dev/null
+++ b/recipes/wysokie_obcasy.recipe
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class WysokieObcasyRecipe(BasicNewsRecipe):
+    __author__ = u'Artur Stachecki'
+    language = 'pl'
+    version = 1
+
+    title = u'Wysokie Obcasy'
+    publisher = 'Agora SA'
+    description = u'Serwis sobotniego dodatku do Gazety Wyborczej'
+    category = 'magazine'
+    language = 'pl'
+    publication_type = 'magazine'
+    cover_url = ''
+    remove_empty_feeds = True
+    no_stylesheets = True
+    oldest_article = 7
+    max_articles_per_feed = 100000
+    recursions = 0
+
+    no_stylesheets = True
+    remove_javascript = True
+    simultaneous_downloads = 5
+
+    keep_only_tags = []
+    keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article'}))
+
+    remove_tags = []
+    remove_tags.append(dict(name = 'img'))
+    remove_tags.append(dict(name = 'p', attrs = {'class' : 'info'}))
+
+    extra_css = '''
+        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
+        h1{text-align: left;}
+    '''
+
+    feeds = [
+        ('Wszystkie Artykuly', 'feed://www.wysokieobcasy.pl/pub/rss/wysokieobcasy.xml'),
+    ]
+
+    def print_version(self, url):
+        baseURL = 'http://www.wysokieobcasy.pl/wysokie-obcasy'
+        segments = url.split(',')
+        subPath = '/2029020,'
+        articleURL1 = segments[1]
+        articleURL2 = segments[2]
+        printVerString = articleURL1 + ',' + articleURL2
+        s = baseURL + subPath + printVerString + '.html'
+        return s
+
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
+        self.cover_url = soup.find(attrs={'class':'holder_cr'}).find('img')['src']
+        return getattr(self, 'cover_url', self.cover_url)
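Reviewer's note: print_version above assumes the comma-separated URL layout used by gazeta.pl sites and will raise IndexError on any URL with fewer than three comma segments. A worked example of the rewrite, using a hypothetical article URL of the expected shape (the numeric segments are made up for illustration):

# Hypothetical input URL in the comma-separated shape the recipe expects:
url = 'http://www.wysokieobcasy.pl/wysokie-obcasy/1,96856,13059998,Przykladowy_artykul.html'
segments = url.split(',')
# segments == ['http://www.wysokieobcasy.pl/wysokie-obcasy/1', '96856',
#              '13059998', 'Przykladowy_artykul.html']
print_url = ('http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,'
             + segments[1] + ',' + segments[2] + '.html')
# -> 'http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,96856,13059998.html'

A guard such as "if len(segments) < 3: return url" would let the recipe fall back to the regular article page instead of failing on an unexpected link.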