merge from trunk

This commit is contained in:
Charles Haley 2013-03-14 08:53:59 +01:00
commit c2afbca876
251 changed files with 56056 additions and 41286 deletions

View File

@ -19,6 +19,59 @@
# new recipes: # new recipes:
# - title: # - title:
- version: 0.9.22
date: 2013-03-08
new features:
- title: "Linux driver for the Tolino ebook reader."
tickets: [1151901]
- title: "Kobo driver: Add support for the new 'Archived' collections in Kobo firmware 2.4.0 and improve handling recommendations and previews."
tickets: [1150852]
- title: "Metadata search and replace: Allow replacing the entire set of identifiers instead of only a specific identifier when doing a search and replace. To do this, choose a source field other than identifiers and set the destination identifier type to: *"
- title: "Show a brief description of every action when customizing toolbars in Preferences->Toolbars"
- title: "Allow drag and drop of books from the calibre book list onto the button for Convert book, Edit metadata, Remove Book, Tweak Book, Polish books, etc."
bug fixes:
- title: "CHM Input: Fix incorrect decoding for CHM files whose hhc file is also a content file."
tickets: [1151721]
- title: "Conversion: Add the double low quote to list of characters that are converted to ascii."
tickets: [1152207]
- title: "Amazon metadata download: Update plugin to handle changes to Amazon site that could cause some covers to not be downloaded. Also fix finding metadata for books with + in their titles."
- title: "Content server: Fix a bug that prevented the cover being updated when files are downloaded from the content server"
- title: "Conversion: Handle the use of @import CSS rules inside <style> tags in HTML files"
- title: "Book polishing: Do not error out when polishing epub files that have XML comments in their OPF metadata section."
- title: "Book polishing: Do not error out when updating covers in EPUB files that have entries int heir manifest that point to missing files"
- title: "Book polishing: Fix a bug that could cause updating covers to error out in some books"
- title: "Fix updating the calibre application id in EPUBs that also use the application id as the package id."
- title: "Apple driver: Fix bug preventing sending books to iBooks if no books have been previously added to iBooks."
tickets: [1141078]
- title: "EPUB/AZW3 Output: Fix splitting on page breaks ignored if the page breaks are inside an element which itself has a page-break-after style applied."
- title: "EPUB/AZW3 Output: Fix incorrect splitting of html at page-break-after page breaks in certain circumstances (The split element being the first child of a parent that contains other split elements)."
tickets: [1139317]
improved recipes:
- Le Devoir
- New York Times Book Review
- Various Polish news sources
new recipes:
- title: Various new Polish news sources
- version: 0.9.21 - version: 0.9.21
date: 2013-03-01 date: 2013-03-01

View File

@ -3,7 +3,7 @@ import re
class Adventure_zone(BasicNewsRecipe): class Adventure_zone(BasicNewsRecipe):
title = u'Adventure Zone' title = u'Adventure Zone'
__author__ = 'fenuks' __author__ = 'fenuks'
description = u'Adventure zone - adventure games from A to Z' description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.'
category = 'games' category = 'games'
language = 'pl' language = 'pl'
no_stylesheets = True no_stylesheets = True

View File

@ -1,10 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Archeowiesci(BasicNewsRecipe): class Archeowiesci(BasicNewsRecipe):
title = u'Archeowiesci' title = u'Archeowieści'
__author__ = 'fenuks' __author__ = 'fenuks'
category = 'archeology' category = 'archeology'
language = 'pl' language = 'pl'
description = u'Z pasją o przeszłości'
cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg' cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
oldest_article = 7 oldest_article = 7
needs_subscription='optional' needs_subscription='optional'

View File

@ -2,7 +2,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class AstroNEWS(BasicNewsRecipe): class AstroNEWS(BasicNewsRecipe):
title = u'AstroNEWS' title = u'AstroNEWS'
__author__ = 'fenuks' __author__ = 'fenuks'
description = 'AstroNEWS- astronomy every day' description = u'AstroNEWS regularnie dostarcza wiadomości o wydarzeniach związanych z astronomią i astronautyką. Informujemy o aktualnych odkryciach i wydarzeniach naukowych, zapowiadamy ciekawe zjawiska astronomiczne. Serwis jest częścią portalu astronomicznego AstroNET prowadzonego przez miłośników astronomii i zawodowych astronomów.'
category = 'astronomy, science' category = 'astronomy, science'
language = 'pl' language = 'pl'
oldest_article = 8 oldest_article = 8

View File

@ -13,6 +13,7 @@ class Astroflesz(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
remove_attributes = ['style']
keep_only_tags = [dict(id="k2Container")] keep_only_tags = [dict(id="k2Container")]
remove_tags_after = dict(name='div', attrs={'class':'itemLinks'}) remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})] remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})]

View File

@ -3,7 +3,7 @@ import re
class Astronomia_pl(BasicNewsRecipe): class Astronomia_pl(BasicNewsRecipe):
title = u'Astronomia.pl' title = u'Astronomia.pl'
__author__ = 'fenuks' __author__ = 'fenuks'
description = 'Astronomia - polish astronomy site' description = u'Astronomia.pl jest edukacyjnym portalem skierowanym do uczniów, studentów i miłośników astronomii. Przedstawiamy gwiazdy, planety, galaktyki, czarne dziury i wiele innych tajemnic Wszechświata.'
masthead_url = 'http://www.astronomia.pl/grafika/logo.gif' masthead_url = 'http://www.astronomia.pl/grafika/logo.gif'
cover_url = 'http://www.astronomia.pl/grafika/logo.gif' cover_url = 'http://www.astronomia.pl/grafika/logo.gif'
category = 'astronomy, science' category = 'astronomy, science'

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = u'Łukasz Grąbczewski 2013'
__version__ = '1.0'
'''
bachormagazyn.pl
'''
from calibre.web.feeds.news import BasicNewsRecipe
class bachormagazyn(BasicNewsRecipe):
    """Fetch Bachor Magazyn (bachormagazyn.pl), a Polish alternative
    parenting monthly, from its WordPress RSS feed.
    """
    # Fixed to agree with the module-level __copyright__ holder
    # ("Grąbczewski"); the original read "Grączewski", a typo.
    __author__ = u'Łukasz Grąbczewski'
    title = u'Bachor Magazyn'
    description = u'Alternatywny magazyn o alternatywach rodzicielstwa'
    language = 'pl'
    publisher = 'Bachor Mag.'
    publication_type = 'magazine'
    masthead_url = 'http://bachormagazyn.pl/wp-content/uploads/2011/10/bachor_header1.gif'

    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    remove_empty_feeds = True

    # Monthly publication: 31 days plus one day of slack.
    oldest_article = 32  # monthly +1
    max_articles_per_feed = 100

    feeds = [
        (u'Bezradnik dla nieudacznych rodziców', u'http://bachormagazyn.pl/feed/')
    ]

    # Keep only the main WordPress content container ...
    keep_only_tags = [dict(name='div', attrs={'id': 'content'})]
    # ... and strip navigation, comments and social-sharing widgets from it.
    remove_tags = [
        dict(attrs={'id': 'nav-above'}),
        dict(attrs={'id': 'nav-below'}),
        dict(attrs={'id': 'comments'}),
        dict(attrs={'class': 'entry-info'}),
        dict(attrs={'class': 'comments-link'}),
        dict(attrs={'class': 'sharedaddy sd-sharing-enabled'}),
    ]

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Bash_org_pl(BasicNewsRecipe): class Bash_org_pl(BasicNewsRecipe):
title = u'Bash.org.pl' title = u'Bash.org.pl'
__author__ = 'fenuks' __author__ = 'fenuks'
description = 'Bash.org.pl - funny quotations from IRC discussions' description = 'Bash.org.pl - zabawne cytaty z IRC'
category = 'funny quotations, humour' category = 'funny quotations, humour'
language = 'pl' language = 'pl'
cover_url = u'http://userlogos.org/files/logos/dzikiosiol/none_0.png' cover_url = u'http://userlogos.org/files/logos/dzikiosiol/none_0.png'

View File

@ -3,14 +3,15 @@ import re
class BenchmarkPl(BasicNewsRecipe): class BenchmarkPl(BasicNewsRecipe):
title = u'Benchmark.pl' title = u'Benchmark.pl'
__author__ = 'fenuks' __author__ = 'fenuks'
description = u'benchmark.pl -IT site' description = u'benchmark.pl, recenzje i testy sprzętu, aktualności, rankingi, sterowniki, porady, opinie'
masthead_url = 'http://www.benchmark.pl/i/logo-footer.png' masthead_url = 'http://www.benchmark.pl/i/logo-footer.png'
cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif' cover_url = 'http://www.benchmark.pl/i/logo-dark.png'
category = 'IT' category = 'IT'
language = 'pl' language = 'pl'
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets=True no_stylesheets = True
remove_attributes = ['style']
preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')] preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')]
remove_tags_after=dict(name='div', attrs={'class':'body'}) remove_tags_after=dict(name='div', attrs={'class':'body'})
@ -21,17 +22,18 @@ class BenchmarkPl(BasicNewsRecipe):
def append_page(self, soup, appendtag): def append_page(self, soup, appendtag):
nexturl = soup.find('span', attrs={'class':'next'}) nexturl = soup.find(attrs={'class':'next'})
while nexturl is not None: while nexturl:
nexturl= self.INDEX + nexturl.parent['href'] soup2 = self.index_to_soup(nexturl['href'])
soup2 = self.index_to_soup(nexturl) nexturl = soup2.find(attrs={'class':'next'})
nexturl=soup2.find('span', attrs={'class':'next'})
pagetext = soup2.find(name='div', attrs={'class':'body'}) pagetext = soup2.find(name='div', attrs={'class':'body'})
appendtag.find('div', attrs={'class':'k_ster'}).extract() appendtag.find('div', attrs={'class':'k_ster'}).extract()
pos = len(appendtag.contents) pos = len(appendtag.contents)
appendtag.insert(pos, pagetext) appendtag.insert(pos, pagetext)
if appendtag.find('div', attrs={'class':'k_ster'}) is not None: if appendtag.find('div', attrs={'class':'k_ster'}):
appendtag.find('div', attrs={'class':'k_ster'}).extract() appendtag.find('div', attrs={'class':'k_ster'}).extract()
for r in appendtag.findAll(attrs={'class':'changePage'}):
r.extract()
def image_article(self, soup, appendtag): def image_article(self, soup, appendtag):

55
recipes/biweekly.recipe Normal file
View File

@ -0,0 +1,55 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = u'Łukasz Grąbczewski 2011'
__version__ = '2.0'
import re, os
from calibre import walk
from calibre.utils.zipfile import ZipFile
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
class biweekly(BasicNewsRecipe):
    """Biweekly (biweekly.pl) -- the English edition of the Polish culture
    magazine Dwutygodnik.

    Instead of scraping articles from feeds, this recipe downloads the
    ready-made ePub published on the site and hands its OPF file to
    calibre's conversion pipeline.
    """
    __author__ = u'Łukasz Grąbczewski'
    title = 'Biweekly'
    language = 'en'
    publisher = 'National Audiovisual Institute'
    publication_type = 'magazine'
    description = u'link with culture [English edition of Polish magazine]: literature, theatre, film, art, music, views, talks'

    # Metadata handed straight to the conversion pipeline.
    conversion_options = {
        'authors' : 'Biweekly.pl'
        ,'publisher' : publisher
        ,'language' : language
        ,'comments' : description
        ,'no_default_epub_cover' : True
        ,'preserve_cover_aspect_ratio': True
    }

    def build_index(self):
        """Download the site's own ePub and return the path to its OPF.

        Overrides BasicNewsRecipe.build_index: follows the 'ePUB VERSION'
        link on the homepage, saves the archive to a temporary file,
        unpacks it (an ePub is a zip) into the recipe's INPUT directory,
        and returns the .opf path calibre expects as the build result.
        """
        browser = self.get_browser()
        browser.open('http://www.biweekly.pl/')

        # find the link to the downloadable ePub on the front page
        epublink = browser.find_link(text_regex=re.compile('ePUB VERSION'))

        # download ebook into a temp file that outlives this method
        self.report_progress(0,_('Downloading ePUB'))
        response = browser.follow_link(epublink)
        book_file = PersistentTemporaryFile(suffix='.epub')
        book_file.write(response.read())
        book_file.close()

        # convert: extract the archive so calibre can re-process it
        self.report_progress(0.2,_('Converting to OEB'))
        oeb = self.output_dir + '/INPUT/'
        if not os.path.exists(oeb):
            os.makedirs(oeb)
        with ZipFile(book_file.name) as f:
            f.extractall(path=oeb)

        # the OPF inside the unpacked book is the index calibre wants
        for f in walk(oeb):
            if f.endswith('.opf'):
                return f

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class CD_Action(BasicNewsRecipe): class CD_Action(BasicNewsRecipe):
title = u'CD-Action' title = u'CD-Action'
__author__ = 'fenuks' __author__ = 'fenuks'
description = 'cdaction.pl - polish games magazine site' description = 'Strona CD-Action (CDA), największego w Polsce pisma dla graczy.Pełne wersje gier, newsy, recenzje, zapowiedzi, konkursy, forum, opinie, galerie screenów,trailery, filmiki, patche, teksty. Gry komputerowe (PC) oraz na konsole (PS3, XBOX 360).'
category = 'games' category = 'games'
language = 'pl' language = 'pl'
index='http://www.cdaction.pl' index='http://www.cdaction.pl'

View File

@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch Computerwoche.
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Computerwoche(BasicNewsRecipe):
    """Fetch Computerwoche, a German IT trade newspaper, from its
    feedsportal RSS feeds (one feed per editorial section).
    """
    title = 'Computerwoche'
    description = 'german computer newspaper'
    language = 'de'
    __author__ = 'Maria Seliger'
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 15
    linearize_tables = True
    no_stylesheets = True
    # NOTE(review): 'remove_stylesheets' is not a documented BasicNewsRecipe
    # attribute -- presumably a no-op; 'no_stylesheets' above already covers
    # this. Confirm against the calibre recipe API.
    remove_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'
    html2epub_options = 'base_font_size=10'
    summary_length = 100
    auto_cleanup = True

    # Styling applied to the generated ebook.
    extra_css = '''
                h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #003399;}
                a{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-style:italic;}
                .dachzeile p{font-family:Arial,Helvetica,sans-serif; font-size: x-small; }
                h1{ font-family:Arial,Helvetica,sans-serif; font-size:x-large; font-weight:bold;}
                .artikelTeaser{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-weight:bold; }
                body{font-family:Arial,Helvetica,sans-serif; }
                .photo {font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #666666;} '''

    # One feedsportal RSS feed per Computerwoche section.
    feeds = [ ('Computerwoche', 'http://rss.feedsportal.com/c/312/f/4414/index.rss'),
              ('IDG Events', 'http://rss.feedsportal.com/c/401/f/7544/index.rss'),
              ('Computerwoche Jobs und Karriere', 'http://rss.feedsportal.com/c/312/f/434082/index.rss'),
              ('Computerwoche BI und ECM', 'http://rss.feedsportal.com/c/312/f/434083/index.rss'),
              ('Computerwoche Cloud Computing', 'http://rss.feedsportal.com/c/312/f/534647/index.rss'),
              ('Computerwoche Compliance und Recht', 'http://rss.feedsportal.com/c/312/f/434084/index.rss'),
              ('Computerwoche CRM', 'http://rss.feedsportal.com/c/312/f/434085/index.rss'),
              ('Computerwoche Data Center und Server', 'http://rss.feedsportal.com/c/312/f/434086/index.rss'),
              ('Computerwoche ERP', 'http://rss.feedsportal.com/c/312/f/434087/index.rss'),
              ('Computerwoche IT Macher', 'http://rss.feedsportal.com/c/312/f/534646/index.rss'),
              ('Computerwoche IT-Services', 'http://rss.feedsportal.com/c/312/f/434089/index.rss'),
              ('Computerwoche IT-Strategie', 'http://rss.feedsportal.com/c/312/f/434090/index.rss'),
              ('Computerwoche Mittelstands-IT', 'http://rss.feedsportal.com/c/312/f/434091/index.rss'),
              ('Computerwoche Mobile und Wireless', 'http://rss.feedsportal.com/c/312/f/434092/index.rss'),
              ('Computerwoche Netzwerk', 'http://rss.feedsportal.com/c/312/f/434093/index.rss'),
              ('Computerwoche Notebook und PC', 'http://rss.feedsportal.com/c/312/f/434094/index.rss'),
              ('Computerwoche Office und Tools', 'http://rss.feedsportal.com/c/312/f/434095/index.rss'),
              ('Computerwoche Security', 'http://rss.feedsportal.com/c/312/f/434098/index.rss'),
              ('Computerwoche SOA und BPM', 'http://rss.feedsportal.com/c/312/f/434099/index.rss'),
              ('Computerwoche Software Infrastruktur', 'http://rss.feedsportal.com/c/312/f/434096/index.rss'),
              ('Computerwoche Storage', 'http://rss.feedsportal.com/c/312/f/534645/index.rss'),
              ('Computerwoche VoIP und TK', 'http://rss.feedsportal.com/c/312/f/434102/index.rss'),
              ('Computerwoche Web', 'http://rss.feedsportal.com/c/312/f/434103/index.rss'),
              ('Computerwoche Home-IT', 'http://rss.feedsportal.com/c/312/f/434104/index.rss')]

    def print_version(self, url):
        """Return the printer-friendly URL for an article page."""
        return url.replace ('/a/', '/a/print/')

View File

@ -7,17 +7,13 @@ class Computerworld_pl(BasicNewsRecipe):
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne' description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
category = 'IT' category = 'IT'
language = 'pl' language = 'pl'
masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif' masthead_url = 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif'
no_stylesheets=True cover_url = 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif'
no_stylesheets = True
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})] keep_only_tags = [dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})]
remove_tags_after=dict(name='div', attrs={'class':'rMobi'}) remove_tags_after = dict(name='div', attrs={'class':'rMobi'})
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})] remove_tags = [dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
def get_cover_url(self):
soup = self.index_to_soup('http://www.computerworld.pl/')
cover=soup.find(name='img', attrs={'class':'prawo'})
self.cover_url=cover['src']
return getattr(self, 'cover_url', self.cover_url)

View File

@ -4,11 +4,12 @@ class CoNowegoPl(BasicNewsRecipe):
title = u'conowego.pl' title = u'conowego.pl'
__author__ = 'fenuks' __author__ = 'fenuks'
description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. U nas znajdziesz wszystko o elektronice użytkowej !' description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. U nas znajdziesz wszystko o elektronice użytkowej !'
cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png' #cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png'
category = 'IT, news' category = 'IT, news'
language = 'pl' language = 'pl'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
INDEX = 'http://www.conowego.pl/'
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True remove_empty_feeds = True
use_embedded_content = False use_embedded_content = False
@ -36,3 +37,10 @@ class CoNowegoPl(BasicNewsRecipe):
for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}): for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}):
r.extract() r.extract()
def get_cover_url(self):
soup = self.index_to_soup('http://www.conowego.pl/magazyn/')
tag = soup.find(attrs={'class':'ms_left'})
if tag:
self.cover_url = self.INDEX + tag.find('img')['src']
return getattr(self, 'cover_url', self.cover_url)

View File

@ -1,4 +1,5 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class CzasGentlemanow(BasicNewsRecipe): class CzasGentlemanow(BasicNewsRecipe):
@ -13,8 +14,9 @@ class CzasGentlemanow(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True remove_empty_feeds = True
preprocess_regexps = [(re.compile(u'<h3>Może Cię też zainteresować:</h3>'), lambda m: '')]
use_embedded_content = False use_embedded_content = False
keep_only_tags = [dict(name='div', attrs={'class':'content'})] keep_only_tags = [dict(name='div', attrs={'class':'content'})]
remove_tags = [dict(attrs={'class':'meta_comments'})] remove_tags = [dict(attrs={'class':'meta_comments'}), dict(id=['comments', 'related_posts_thumbnails'])]
remove_tags_after = dict(name='div', attrs={'class':'fblikebutton_button'}) remove_tags_after = dict(id='comments')
feeds = [(u'M\u0119ski \u015awiat', u'http://czasgentlemanow.pl/category/meski-swiat/feed/'), (u'Styl', u'http://czasgentlemanow.pl/category/styl/feed/'), (u'Vademecum Gentlemana', u'http://czasgentlemanow.pl/category/vademecum/feed/'), (u'Dom i rodzina', u'http://czasgentlemanow.pl/category/dom-i-rodzina/feed/'), (u'Honor', u'http://czasgentlemanow.pl/category/honor/feed/'), (u'Gad\u017cety Gentlemana', u'http://czasgentlemanow.pl/category/gadzety-gentlemana/feed/')] feeds = [(u'M\u0119ski \u015awiat', u'http://czasgentlemanow.pl/category/meski-swiat/feed/'), (u'Styl', u'http://czasgentlemanow.pl/category/styl/feed/'), (u'Vademecum Gentlemana', u'http://czasgentlemanow.pl/category/vademecum/feed/'), (u'Dom i rodzina', u'http://czasgentlemanow.pl/category/dom-i-rodzina/feed/'), (u'Honor', u'http://czasgentlemanow.pl/category/honor/feed/'), (u'Gad\u017cety Gentlemana', u'http://czasgentlemanow.pl/category/gadzety-gentlemana/feed/')]

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__author__ = 'Mori' __author__ = 'Mori'
__version__ = 'v. 0.5' __version__ = 'v. 0.5'
''' '''
@ -11,56 +11,56 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re import re
class DziennikInternautowRecipe(BasicNewsRecipe): class DziennikInternautowRecipe(BasicNewsRecipe):
__author__ = 'Mori' __author__ = 'Mori'
language = 'pl' language = 'pl'
title = u'Dziennik Internautow' title = u'Dziennik Internautow'
publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.' publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.'
description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.' description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.'
max_articles_per_feed = 100 max_articles_per_feed = 100
oldest_article = 7 oldest_article = 7
cover_url = 'http://di.com.pl/pic/logo_di_norm.gif' cover_url = 'http://di.com.pl/pic/logo_di_norm.gif'
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
encoding = 'utf-8' encoding = 'utf-8'
extra_css = ''' extra_css = '''
.fotodesc{font-size: 75%;} .fotodesc{font-size: 75%;}
.pub_data{font-size: 75%;} .pub_data{font-size: 75%;}
.fotonews{clear: both; padding-top: 10px; padding-bottom: 10px;} .fotonews{clear: both; padding-top: 10px; padding-bottom: 10px;}
#pub_foto{font-size: 75%; float: left; padding-right: 10px;} #pub_foto{font-size: 75%; float: left; padding-right: 10px;}
''' '''
feeds = [ feeds = [
(u'Dziennik Internaut\u00f3w', u'http://feeds.feedburner.com/glowny-di') (u'Dziennik Internaut\u00f3w', u'http://feeds.feedburner.com/glowny-di')
] ]
keep_only_tags = [ keep_only_tags = [
dict(name = 'div', attrs = {'id' : 'pub_head'}), dict(name = 'div', attrs = {'id' : 'pub_head'}),
dict(name = 'div', attrs = {'id' : 'pub_content'}) dict(name = 'div', attrs = {'id' : 'pub_content'})
] ]
remove_tags = [ remove_tags = [
dict(name = 'div', attrs = {'class' : 'poradniki_context'}), dict(name = 'div', attrs = {'class' : 'poradniki_context'}),
dict(name = 'div', attrs = {'class' : 'uniBox'}), dict(name = 'div', attrs = {'class' : 'uniBox'}),
dict(name = 'object', attrs = {}), dict(name = 'object', attrs = {}),
dict(name = 'h3', attrs = {}), dict(name = 'h3', attrs = {}),
dict(attrs={'class':'twitter-share-button'}) dict(attrs={'class':'twitter-share-button'})
] ]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[ [
(r', <a href="http://di.com.pl/komentarze,.*?</div>', lambda match: '</div>'), (r', <a href="http://di.com.pl/komentarze,.*?</div>', lambda match: '</div>'),
(r'<div class="fotonews".*?">', lambda match: '<div class="fotonews">'), (r'<div class="fotonews".*?">', lambda match: '<div class="fotonews">'),
(r'http://di.com.pl/pic/photo/mini/', lambda match: 'http://di.com.pl/pic/photo/oryginal/'), (r'http://di.com.pl/pic/photo/mini/', lambda match: 'http://di.com.pl/pic/photo/oryginal/'),
(r'\s*</', lambda match: '</'), (r'\s*</', lambda match: '</'),
] ]
] ]
def skip_ad_pages(self, soup): def skip_ad_pages(self, soup):
if 'Advertisement' in soup.title: if 'Advertisement' in soup.title:
nexturl=soup.find('a')['href'] nexturl=soup.find('a')['href']
return self.index_to_soup(nexturl, raw=True) return self.index_to_soup(nexturl, raw=True)

View File

@ -18,7 +18,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ] preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] keep_only_tags=[dict(attrs={'class':['news', 'entry single']})]
remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze')] remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze'), dict(name='iframe')]
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]

View File

@ -0,0 +1,56 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = u'Łukasz Grąbczewski 2011'
__version__ = '2.0'
import re, os
from calibre import walk
from calibre.utils.zipfile import ZipFile
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
class dwutygodnik(BasicNewsRecipe):
    """Dwutygodnik (dwutygodnik.com) -- Polish online culture magazine.

    Instead of scraping articles from feeds, this recipe downloads the
    ready-made ePub published on the site and hands its OPF file to
    calibre's conversion pipeline.
    """
    __author__ = u'Łukasz Grąbczewski'
    title = 'Dwutygodnik'
    language = 'pl'
    publisher = 'Narodowy Instytut Audiowizualny'
    publication_type = 'magazine'
    description = u'Strona Kultury: literatura, teatr, film, sztuka, muzyka, felietony, rozmowy'

    # Metadata handed straight to the conversion pipeline.
    conversion_options = {
        'authors' : 'Dwutygodnik.com'
        ,'publisher' : publisher
        ,'language' : language
        ,'comments' : description
        ,'no_default_epub_cover' : True
        ,'preserve_cover_aspect_ratio': True
    }

    def build_index(self):
        """Download the site's own ePub and return the path to its OPF.

        Overrides BasicNewsRecipe.build_index: follows the 'Wersja ePub'
        link on the homepage, saves the archive to a temporary file,
        unpacks it (an ePub is a zip) into the recipe's INPUT directory,
        and returns the .opf path calibre expects as the build result.
        """
        browser = self.get_browser()
        browser.open('http://www.dwutygodnik.com/')

        # find the link to the downloadable ePub on the front page
        epublink = browser.find_link(text_regex=re.compile('Wersja ePub'))

        # download ebook into a temp file that outlives this method
        self.report_progress(0,_('Downloading ePUB'))
        response = browser.follow_link(epublink)
        book_file = PersistentTemporaryFile(suffix='.epub')
        book_file.write(response.read())
        book_file.close()

        # convert: extract the archive so calibre can re-process it
        self.report_progress(0.2,_('Converting to OEB'))
        oeb = self.output_dir + '/INPUT/'
        if not os.path.exists(oeb):
            os.makedirs(oeb)
        with ZipFile(book_file.name) as f:
            f.extractall(path=oeb)

        # the OPF inside the unpacked book is the index calibre wants
        for f in walk(oeb):
            if f.endswith('.opf'):
                return f

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Dzieje(BasicNewsRecipe): class Dzieje(BasicNewsRecipe):
title = u'dzieje.pl' title = u'dzieje.pl'
__author__ = 'fenuks' __author__ = 'fenuks'
description = 'Dzieje - history of Poland' description = 'Dzieje.pl - najlepszy portal informacyjno-edukacyjny dotyczący historii Polski XX wieku. Archiwalne fotografie, filmy, katalog postaci, quizy i konkursy.'
cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png' cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
category = 'history' category = 'history'
language = 'pl' language = 'pl'

View File

@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class DziennikBaltycki(BasicNewsRecipe):
    """Dziennik Bałtycki -- Polish regional daily (Tricity / Pomerania),
    fetched from the paper's RSS feeds.
    """
    title = u'Dziennik Ba\u0142tycki'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Dziennik Bałtycki. Najnowsze Wiadomości Trójmiasto i Wiadomości Pomorskie. Czytaj!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/dziennikbaltycki.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds= True
    no_stylesheets = True
    use_embedded_content = False
    # Feeds repeat articles under both title and URL variants; drop dupes.
    ignore_duplicate_articles = {'title', 'url'}
    # Disabled "read also"/cross-link cleanup regexps, kept for reference:
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]

    # Everything after the tracking pixel is boilerplate; cut it there.
    remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})
    remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})]

    feeds = [(u'Wiadomo\u015bci', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_wiadomosci.xml?201302'), (u'Sport', u'http://dziennikbaltycki.feedsportal.com/c/32980/f/533756/index.rss?201302'), (u'Rejsy', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_rejsy.xml?201302'), (u'Biznes na Pomorzu', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_biznesnapomorzu.xml?201302'), (u'GOM', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_gom.xml?201302'), (u'Opinie', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_opinie.xml?201302'), (u'Pitawal Pomorski', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_pitawalpomorski.xml?201302')]

    def print_version(self, url):
        """Map an article URL to its printer-friendly version."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        """Skip interstitial ad pages (titled 'Advertisement') by
        following their first link to the real article."""
        if 'Advertisement' in soup.title:
            nexturl=soup.find('a')['href']
            return self.index_to_soup(nexturl, raw=True)

    def get_cover_url(self):
        """Scrape today's cover image from the prasa24.pl kiosk page."""
        # NOTE(review): assumes the 'pojemnik' container and its <img>
        # always exist -- raises if the kiosk page layout changes.
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/dziennik-baltycki/')
        self.cover_url=soup.find(id='pojemnik').img['src']
        return getattr(self, 'cover_url', self.cover_url)

View File

@ -0,0 +1,35 @@
from calibre.web.feeds.news import BasicNewsRecipe
class DziennikLodzki(BasicNewsRecipe):
    """Calibre recipe for 'Dziennik Lodzki', a Polish regional daily
    published by the polskatimes.pl group."""

    # Identification / presentation.
    title = u'Dziennik \u0141\xf3dzki'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Dziennik Łódzki. Najnowsze Wiadomości Łódź. Czytaj Wiadomości Łódzkie!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/dzienniklodzki.png?24'

    # Download window and housekeeping.
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}

    # NOTE(review): a long commented-out preprocess_regexps cleanup list was
    # dropped here as dead code; the live cleanup is done via the tag rules.
    # Everything after the dz.png image (presumably a footer marker) is cut.
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class': 'czytajDalej'}),
        dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'}),
    ]

    feeds = [
        (u'Na sygnale', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_nasygnale.xml?201302'),
        (u'\u0141\xf3d\u017a', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_lodz.xml?201302'),
        (u'Opinie', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_opinie.xml?201302'),
        (u'Pieni\u0105dze', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533763/index.rss?201302'),
        (u'Kultura', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533762/index.rss?201302'),
        (u'Sport', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533761/index.rss?201302'),
        (u'Akcje', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_akcje.xml?201302'),
        (u'M\xf3j Reporter', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_mojreporter.xml?201302'),
        (u'Studni\xf3wki', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_studniowki.xml?201302'),
        (u'Kraj', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_kraj.xml?201302'),
        (u'Zdrowie', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_zdrowie.xml?201302'),
    ]

    def print_version(self, url):
        # The printer-friendly layout lives at the same URL with
        # 'artykul' swapped for 'drukuj'.
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        # Interstitial ad pages carry 'Advertisement' in their <title>;
        # follow the first link through to the real article.
        if 'Advertisement' in soup.title:
            target = soup.find('a')['href']
            return self.index_to_soup(target, raw=True)

    def get_cover_url(self):
        # Scrape the current front-page scan from prasa24.pl.
        cover_soup = self.index_to_soup('http://www.prasa24.pl/gazeta/dziennik-lodzki/')
        self.cover_url = cover_soup.find(id='pojemnik').img['src']
        return self.cover_url

View File

@ -0,0 +1,78 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
# Calibre news-download recipe for 'Dziennik Wschodni', the regional daily
# of the Lublin voivodeship.
# NOTE(review): indentation in this chunk was flattened by the extraction
# tool; only comments were added below, the code lines are untouched.
class DziennikWschodni(BasicNewsRecipe):
title = u'Dziennik Wschodni'
__author__ = 'fenuks'
description = u'Dziennik Wschodni - portal regionalny województwa lubelskiego.'
category = 'newspaper'
language = 'pl'
encoding = 'iso-8859-2'
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
# Base site URL, also used by get_cover_url() and append_page().
INDEX = 'http://www.dziennikwschodni.pl'
masthead_url = INDEX + '/images/top_logo.png'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds = True
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
# Strip inline "read also" cross-promotion links (various Polish phrasings)
# from article bodies before conversion.
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
# Keep only the main article / cover / photo-story containers...
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
# ...and drop sharing widgets, polls, galleries and other page chrome.
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
dict(attrs={'class':'articleFunctions'})]
# One RSS feed per regional/topical section of the site.
feeds = [(u'Wszystkie', u'http://www.dziennikwschodni.pl/rss.xml'),
(u'Lublin', u'http://www.dziennikwschodni.pl/lublin.xml'),
(u'Zamość', u'http://www.dziennikwschodni.pl/zamosc.xml'),
(u'Biała Podlaska', u'http://www.dziennikwschodni.pl/biala_podlaska.xml'),
(u'Chełm', u'http://www.dziennikwschodni.pl/chelm.xml'),
(u'Kraśnik', u'http://www.dziennikwschodni.pl/krasnik.xml'),
(u'Puławy', u'http://www.dziennikwschodni.pl/pulawy.xml'),
(u'Świdnik', u'http://www.dziennikwschodni.pl/swidnik.xml'),
(u'Łęczna', u'http://www.dziennikwschodni.pl/leczna.xml'),
(u'Lubartów', u'http://www.dziennikwschodni.pl/lubartow.xml'),
(u'Sport', u'http://www.dziennikwschodni.pl/sport.xml'),
(u'Praca', u'http://www.dziennikwschodni.pl/praca.xml'),
(u'Dom', u'http://www.dziennikwschodni.pl/dom.xml'),
(u'Moto', u'http://www.dziennikwschodni.pl/moto.xml'),
(u'Zdrowie', u'http://www.dziennikwschodni.pl/zdrowie.xml'),
]
# Follow the JEDYNKI (front pages) section to today's cover scan.
def get_cover_url(self):
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
soup = self.index_to_soup(nexturl)
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
# cover_url was just assigned, so the getattr default always resolves to it.
return getattr(self, 'cover_url', self.cover_url)
# Stitch multi-page photo stories into one document: read the page count
# from the 'photoNavigationPages' span ("current/total"), derive the page
# URL prefix from the 'next' link (trailing page number sliced off), drop
# the navigation widgets, then append each page's photo container, meta
# and text fragments in order.
def append_page(self, soup, appendtag):
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
if tag:
number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
r.extract()
for nr in range(2, number+1):
soup2 = self.index_to_soup(baseurl + str(nr))
pagetext = soup2.find(id='photoContainer')
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoMeta'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoStoryText'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
# Hook: merge any follow-up pages into the body before conversion.
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class DziennikZachodni(BasicNewsRecipe):
    """Calibre recipe for 'Dziennik Zachodni', the Silesian regional daily
    of the polskatimes.pl group."""

    # Identification / presentation.
    title = u'Dziennik Zachodni'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Dziennik Zachodni. Najnowsze Wiadomości Śląskie. Wiadomości Śląsk. Czytaj!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/dziennikzachodni.png?24'

    # Download window and housekeeping.
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}

    # NOTE(review): a long commented-out preprocess_regexps cleanup list was
    # dropped here as dead code; cleanup is handled via the tag rules below.
    # Everything after the dz.png image (presumably a footer marker) is cut.
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class': 'czytajDalej'}),
        dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'}),
        dict(attrs={'href': 'http://www.dziennikzachodni.pl/piano'}),
    ]

    # NOTE(review): the 'Opinie' entry points at the regiony feed URL —
    # looks like a copy/paste slip in the upstream recipe; confirm before
    # changing, so the URL is reproduced as-is here.
    feeds = [
        (u'Wszystkie', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533764/index.rss?201302'),
        (u'Wiadomo\u015bci', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533765/index.rss?201302'),
        (u'Regiony', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_regiony.xml?201302'),
        (u'Opinie', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_regiony.xml?201302'),
        (u'Blogi', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_blogi.xml?201302'),
        (u'Serwisy', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_serwisy.xml?201302'),
        (u'Sport', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533766/index.rss?201302'),
        (u'M\xf3j Reporter', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_mojreporter.xml?201302'),
        (u'Na narty', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_nanarty.xml?201302'),
        (u'Drogi', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_drogi.xml?201302'),
        (u'Pieni\u0105dze', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533768/index.rss?201302'),
    ]

    def print_version(self, url):
        # The printer-friendly layout lives at the same URL with
        # 'artykul' swapped for 'drukuj'.
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        # Interstitial ad pages carry 'Advertisement' in their <title>;
        # follow the first link through to the real article.
        if 'Advertisement' in soup.title:
            target = soup.find('a')['href']
            return self.index_to_soup(target, raw=True)

    def get_cover_url(self):
        # Scrape the current front-page scan from prasa24.pl.
        cover_soup = self.index_to_soup('http://www.prasa24.pl/gazeta/dziennik-zachodni/')
        self.cover_url = cover_soup.find(id='pojemnik').img['src']
        return self.cover_url

74
recipes/echo_dnia.recipe Normal file
View File

@ -0,0 +1,74 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
# Calibre news-download recipe for 'Echo Dnia', a regional portal covering
# the Swietokrzyskie, Radom and Podkarpacie areas.
# NOTE(review): indentation in this chunk was flattened by the extraction
# tool; only comments were added below, the code lines are untouched.
class EchoDnia(BasicNewsRecipe):
title = u'Echo Dnia'
__author__ = 'fenuks'
description = u'Echo Dnia - portal regionalny świętokrzyskiego radomskiego i podkarpackiego. Najnowsze wiadomości z Twojego regionu, galerie, video, mp3.'
category = 'newspaper'
language = 'pl'
encoding = 'iso-8859-2'
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
# Base site URL, also used by get_cover_url() and append_page().
INDEX = 'http://www.echodnia.eu'
masthead_url = INDEX + '/images/top_logo.png'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds = True
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
# Strip inline "read also" cross-promotion links (various Polish phrasings)
# from article bodies before conversion.
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
# Keep only the main article / cover / photo-story containers...
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
# ...and drop sharing widgets, polls, galleries and other page chrome.
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
dict(attrs={'class':'articleFunctions'})]
# One RSS feed per regional/topical section of the site.
feeds = [(u'Wszystkie', u'http://www.echodnia.eu/rss.xml'),
(u'Świętokrzyskie', u'http://www.echodnia.eu/swietokrzyskie.xml'),
(u'Radomskie', u'http://www.echodnia.eu/radomskie.xml'),
(u'Podkarpackie', u'http://www.echodnia.eu/podkarpackie.xml'),
(u'Sport \u015bwi\u0119tokrzyski', u'http://www.echodnia.eu/sport_swi.xml'),
(u'Sport radomski', u'http://www.echodnia.eu/sport_rad.xml'),
(u'Sport podkarpacki', u'http://www.echodnia.eu/sport_pod.xml'),
(u'Pi\u0142ka no\u017cna', u'http://www.echodnia.eu/pilka.xml'),
(u'Praca', u'http://www.echodnia.eu/praca.xml'),
(u'Dom', u'http://www.echodnia.eu/dom.xml'),
(u'Auto', u'http://www.echodnia.eu/auto.xml'),
(u'Zdrowie', u'http://www.echodnia.eu/zdrowie.xml')]
# Follow the JEDYNKI (front pages) section to today's cover scan.
def get_cover_url(self):
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
soup = self.index_to_soup(nexturl)
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
# cover_url was just assigned, so the getattr default always resolves to it.
return getattr(self, 'cover_url', self.cover_url)
# Stitch multi-page photo stories into one document: read the page count
# from the 'photoNavigationPages' span ("current/total"), derive the page
# URL prefix from the 'next' link (trailing page number sliced off), drop
# the navigation widgets, then append each page's photo container, meta
# and text fragments in order.
def append_page(self, soup, appendtag):
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
if tag:
number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
r.extract()
for nr in range(2, number+1):
soup2 = self.index_to_soup(baseurl + str(nr))
pagetext = soup2.find(id='photoContainer')
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoMeta'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoStoryText'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
# Hook: merge any follow-up pages into the body before conversion.
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -1,8 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__author__ = 'Mori'
__version__ = 'v. 0.1'
''' '''
blog.eclicto.pl blog.eclicto.pl
''' '''
@ -11,39 +9,39 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re import re
class BlogeClictoRecipe(BasicNewsRecipe): class BlogeClictoRecipe(BasicNewsRecipe):
__author__ = 'Mori' __author__ = 'Mori, Tomasz Długosz'
language = 'pl' language = 'pl'
title = u'Blog eClicto' title = u'Blog eClicto'
publisher = u'Blog eClicto' publisher = u'Blog eClicto'
description = u'Blog o e-papierze i e-bookach' description = u'Blog o e-papierze i e-bookach'
max_articles_per_feed = 100 max_articles_per_feed = 100
cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif' cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif'
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
encoding = 'utf-8' encoding = 'utf-8'
extra_css = ''' extra_css = '''
img{float: left; padding-right: 10px; padding-bottom: 5px;} img{float: left; padding-right: 10px; padding-bottom: 5px;}
''' '''
feeds = [ feeds = [
(u'Blog eClicto', u'http://blog.eclicto.pl/feed/') (u'Blog eClicto', u'http://blog.eclicto.pl/feed/')
] ]
remove_tags = [ remove_tags = [
dict(name = 'span', attrs = {'id' : 'tags'}) dict(name = 'div', attrs = {'class' : 'social_bookmark'}),
] ]
remove_tags_after = [ keep_only_tags = [
dict(name = 'div', attrs = {'class' : 'post'}) dict(name = 'div', attrs = {'class' : 'post'})
] ]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[ [
(r'\s*</', lambda match: '</'), (r'\s*</', lambda match: '</'),
] ]
] ]

View File

@ -4,6 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class eioba(BasicNewsRecipe): class eioba(BasicNewsRecipe):
title = u'eioba' title = u'eioba'
__author__ = 'fenuks' __author__ = 'fenuks'
description = u'eioba.pl - daj się przeczytać!'
cover_url = 'http://www.eioba.org/lay/logo_pl_v3.png' cover_url = 'http://www.eioba.org/lay/logo_pl_v3.png'
language = 'pl' language = 'pl'
oldest_article = 7 oldest_article = 7

View File

@ -5,7 +5,7 @@ class Elektroda(BasicNewsRecipe):
title = u'Elektroda' title = u'Elektroda'
oldest_article = 8 oldest_article = 8
__author__ = 'fenuks' __author__ = 'fenuks'
description = 'Elektroda.pl' description = 'Międzynarodowy portal elektroniczny udostępniający bogate zasoby z dziedziny elektroniki oraz forum dyskusyjne.'
cover_url = 'http://demotywatory.elektroda.pl/Thunderpic/logo.gif' cover_url = 'http://demotywatory.elektroda.pl/Thunderpic/logo.gif'
category = 'electronics' category = 'electronics'
language = 'pl' language = 'pl'

View File

@ -12,6 +12,7 @@ class eMuzyka(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_attributes = ['style']
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})] keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})] remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')] feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]

View File

@ -22,14 +22,14 @@ class f1ultra(BasicNewsRecipe):
remove_tags.append(dict(name = 'hr', attrs = {'size' : '2'})) remove_tags.append(dict(name = 'hr', attrs = {'size' : '2'}))
preprocess_regexps = [(re.compile(r'align="left"'), lambda match: ''), preprocess_regexps = [(re.compile(r'align="left"'), lambda match: ''),
(re.compile(r'align="right"'), lambda match: ''), (re.compile(r'align="right"'), lambda match: ''),
(re.compile(r'width=\"*\"'), lambda match: ''), (re.compile(r'width=\"*\"'), lambda match: ''),
(re.compile(r'\<table .*?\>'), lambda match: '')] (re.compile(r'\<table .*?\>'), lambda match: '')]
extra_css = '''.contentheading { font-size: 1.4em; font-weight: bold; } extra_css = '''.contentheading { font-size: 1.4em; font-weight: bold; }
img { display: block; clear: both;} img { display: block; clear: both;}
''' '''
remove_attributes = ['width','height','position','float','padding-left','padding-right','padding','text-align'] remove_attributes = ['width','height','position','float','padding-left','padding-right','padding','text-align']
feeds = [(u'F1 Ultra', u'http://www.f1ultra.pl/index.php?option=com_rd_rss&id=1&Itemid=245')] feeds = [(u'F1 Ultra', u'http://www.f1ultra.pl/index.php?option=com_rd_rss&id=1&Itemid=245')]

View File

@ -4,21 +4,21 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
class FilmWebPl(BasicNewsRecipe): class FilmWebPl(BasicNewsRecipe):
title = u'FilmWeb' title = u'FilmWeb'
__author__ = 'fenuks' __author__ = 'fenuks'
description = 'FilmWeb - biggest polish movie site' description = 'Filmweb.pl - Filmy takie jak Ty Filmweb to największy i najczęściej odwiedzany polski serwis filmowy. Największa baza filmów, seriali i aktorów, repertuar kin i tv, ...'
cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png' cover_url = 'http://gfx.filmweb.pl/n/logo-filmweb-bevel.jpg'
category = 'movies' category = 'movies'
language = 'pl' language = 'pl'
index='http://www.filmweb.pl' index = 'http://www.filmweb.pl'
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets= True no_stylesheets = True
remove_empty_feeds=True remove_empty_feeds = True
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] remove_tags = [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
remove_attributes = ['style',] remove_attributes = ['style',]
keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] keep_only_tags = [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
(u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
(u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'), (u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'),

View File

@ -13,7 +13,7 @@ class FocusRecipe(BasicNewsRecipe):
title = u'Focus' title = u'Focus'
publisher = u'Gruner + Jahr Polska' publisher = u'Gruner + Jahr Polska'
category = u'News' category = u'News'
description = u'Newspaper' description = u'Focus.pl - pierwszy w Polsce portal społecznościowy dla miłośników nauki. Tematyka: nauka, historia, cywilizacja, technika, przyroda, sport, gadżety'
category = 'magazine' category = 'magazine'
cover_url = '' cover_url = ''
remove_empty_feeds = True remove_empty_feeds = True

View File

@ -3,6 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Fotoblogia_pl(BasicNewsRecipe): class Fotoblogia_pl(BasicNewsRecipe):
title = u'Fotoblogia.pl' title = u'Fotoblogia.pl'
__author__ = 'fenuks' __author__ = 'fenuks'
description = u'Jeden z największych polskich blogów o fotografii.'
category = 'photography' category = 'photography'
language = 'pl' language = 'pl'
masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg' masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg'
@ -11,6 +12,6 @@ class Fotoblogia_pl(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
keep_only_tags=[dict(name='div', attrs={'class':'post-view post-standard'})] keep_only_tags=[dict(name='div', attrs={'class':['post-view post-standard', 'photo-container']})]
remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})] remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})]
feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')] feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')]

View File

@ -18,6 +18,7 @@ class FrazPC(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
use_embedded_content = False use_embedded_content = False
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True
cover_url='http://www.frazpc.pl/images/logo.png' cover_url='http://www.frazpc.pl/images/logo.png'
feeds = [ feeds = [
(u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'), (u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'),

View File

@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class GazetaKrakowska(BasicNewsRecipe):
    """Calibre recipe for 'Gazeta Krakowska', the Krakow regional daily of
    the polskatimes.pl group.

    Review fixes versus the original:
      * ``skip_ad_pages`` no longer assumes ``soup.title`` exists — pages
        without a <title> tag previously raised ``TypeError`` on the ``in``
        test against ``None``.
      * ``get_cover_url`` returns ``self.cover_url`` directly; the original
        ``getattr(self, 'cover_url', self.cover_url)`` was redundant because
        the attribute is assigned on the preceding line.
    """

    title = u'Gazeta Krakowska'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Gazeta Krakowska. Najnowsze Wiadomości Kraków. Informacje Kraków. Czytaj!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/gazetakrakowska.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}

    # Everything after the dz.png image (presumably a footer marker) is cut,
    # along with related-article boxes and "read more" links.
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [dict(id='mat-podobne'),
                   dict(name='a', attrs={'class': 'czytajDalej'}),
                   dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})]

    feeds = [
        (u'Fakty24', u'http://gazetakrakowska.feedsportal.com/c/32980/f/533770/index.rss?201302'),
        (u'Krak\xf3w', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_krakow.xml?201302'),
        (u'Tarn\xf3w', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_tarnow.xml?201302'),
        (u'Nowy S\u0105cz', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_nsacz.xml?201302'),
        (u'Ma\u0142. Zach.', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_malzach.xml?201302'),
        (u'Podhale', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_podhale.xml?201302'),
        (u'Sport', u'http://gazetakrakowska.feedsportal.com/c/32980/f/533771/index.rss?201302'),
        (u'Kultura', u'http://gazetakrakowska.feedsportal.com/c/32980/f/533772/index.rss?201302'),
        (u'Opinie', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_opinie.xml?201302'),
        (u'Magazyn', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_magazyn.xml?201302'),
    ]

    def print_version(self, url):
        """Map an article URL to its printer-friendly version."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        """Follow the first link off interstitial ad pages.

        Ad pages are recognised by 'Advertisement' in their <title>.  Guard
        against title-less pages (``soup.title`` is ``None``), which would
        otherwise crash the ``in`` test.  Returning ``None`` means "not an
        ad page, keep this document".
        """
        if soup.title is not None and 'Advertisement' in soup.title:
            nexturl = soup.find('a')['href']
            return self.index_to_soup(nexturl, raw=True)

    def get_cover_url(self):
        """Fetch today's cover scan from prasa24.pl and cache it."""
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/gazeta-krakowska/')
        self.cover_url = soup.find(id='pojemnik').img['src']
        return self.cover_url

View File

@ -0,0 +1,64 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
# Calibre news-download recipe for 'Gazeta Lubuska', the regional portal of
# the Lubusz voivodeship.
# NOTE(review): indentation in this chunk was flattened by the extraction
# tool; only comments were added below, the code lines are untouched.
class GazetaLubuska(BasicNewsRecipe):
title = u'Gazeta Lubuska'
__author__ = 'fenuks'
description = u'Gazeta Lubuska - portal regionalny województwa lubuskiego.'
category = 'newspaper'
language = 'pl'
encoding = 'iso-8859-2'
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
# Base site URL, also used by get_cover_url() and append_page().
INDEX = 'http://www.gazetalubuska.pl'
masthead_url = INDEX + '/images/top_logo.png'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds = True
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
# Strip inline "read also" cross-promotion links (various Polish phrasings)
# from article bodies before conversion.
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
# Keep only the main article / cover / photo-story containers...
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
# ...and drop sharing widgets, polls, galleries and other page chrome.
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
dict(attrs={'class':'articleFunctions'})]
# One RSS feed per town/section of the site (single long line kept as-is).
feeds = [(u'Wszystkie', u'http://www.gazetalubuska.pl/rss.xml'), (u'Dreznenko', u'http://www.gazetalubuska.pl/drezdenko.xml'), (u'G\u0142og\xf3w', u'http://www.gazetalubuska.pl/glogow.xml'), (u'Gorz\xf3w Wielkopolski', u'http://www.gazetalubuska.pl/gorzow-wielkopolski.xml'), (u'Gubin', u'http://www.gazetalubuska.pl/gubin.xml'), (u'Kostrzyn', u'http://www.gazetalubuska.pl/kostrzyn.xml'), (u'Krosno Odrza\u0144skie', u'http://www.gazetalubuska.pl/krosno-odrzanskie.xml'), (u'Lubsko', u'http://www.gazetalubuska.pl/lubsko.xml'), (u'Mi\u0119dzych\xf3d', u'http://www.gazetalubuska.pl/miedzychod.xml'), (u'Mi\u0119dzyrzecz', u'http://www.gazetalubuska.pl/miedzyrzecz.xml'), (u'Nowa S\xf3l', u'http://www.gazetalubuska.pl/nowa-sol.xml'), (u'S\u0142ubice', u'http://www.gazetalubuska.pl/slubice.xml'), (u'Strzelce Kraje\u0144skie', u'http://www.gazetalubuska.pl/strzelce-krajenskie.xml'), (u'Sulech\xf3w', u'http://www.gazetalubuska.pl/sulechow.xml'), (u'Sul\u0119cin', u'http://www.gazetalubuska.pl/sulecin.xml'), (u'\u015awi\u0119bodzin', u'http://www.gazetalubuska.pl/swiebodzin.xml'), (u'Wolsztyn', u'http://www.gazetalubuska.pl/wolsztyn.xml'), (u'Wschowa', u'http://www.gazetalubuska.pl/wschowa.xml'), (u'Zielona G\xf3ra', u'http://www.gazetalubuska.pl/zielona-gora.xml'), (u'\u017baga\u0144', u'http://www.gazetalubuska.pl/zagan.xml'), (u'\u017bary', u'http://www.gazetalubuska.pl/zary.xml'), (u'Sport', u'http://www.gazetalubuska.pl/sport.xml'), (u'Auto', u'http://www.gazetalubuska.pl/auto.xml'), (u'Dom', u'http://www.gazetalubuska.pl/dom.xml'), (u'Praca', u'http://www.gazetalubuska.pl/praca.xml'), (u'Zdrowie', u'http://www.gazetalubuska.pl/zdrowie.xml')]
# Follow the JEDYNKI (front pages) section to today's cover scan.
def get_cover_url(self):
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
soup = self.index_to_soup(nexturl)
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
# cover_url was just assigned, so the getattr default always resolves to it.
return getattr(self, 'cover_url', self.cover_url)
# Stitch multi-page photo stories into one document: read the page count
# from the 'photoNavigationPages' span ("current/total"), derive the page
# URL prefix from the 'next' link (trailing page number sliced off), drop
# the navigation widgets, then append each page's photo container, meta
# and text fragments in order.
def append_page(self, soup, appendtag):
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
if tag:
number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
r.extract()
for nr in range(2, number+1):
soup2 = self.index_to_soup(baseurl + str(nr))
pagetext = soup2.find(id='photoContainer')
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoMeta'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoStoryText'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
# Hook: merge any follow-up pages into the body before conversion.
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -1,102 +1,91 @@
#!/usr/bin/env python
# # Przed uzyciem przeczytaj komentarz w sekcji "feeds"
__license__ = 'GPL v3'
__copyright__ = u'2010, Richard z forum.eksiazki.org'
'''pomorska.pl'''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class GazetaPomorska(BasicNewsRecipe): class GazetaPomorska(BasicNewsRecipe):
title = u'Gazeta Pomorska' title = u'Gazeta Pomorska'
publisher = u'Gazeta Pomorska' __author__ = 'Richard z forum.eksiazki.org, fenuks'
description = u'Kujawy i Pomorze - wiadomo\u015bci' description = u'Gazeta Pomorska - portal regionalny'
category = 'newspaper'
language = 'pl' language = 'pl'
__author__ = u'Richard z forum.eksiazki.org' encoding = 'iso-8859-2'
# # (dziekuje t3d z forum.eksiazki.org za testy) extra_css = 'ul {list-style: none; padding:0; margin:0;}'
oldest_article = 2 INDEX = 'http://www.pomorska.pl'
max_articles_per_feed = 20 masthead_url = INDEX + '/images/top_logo.png'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds = True
no_stylesheets = True no_stylesheets = True
remove_javascript = True ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [
(re.compile(r'<a href="http://maps.google[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'[<Bb >]*Poznaj opinie[^<]*[</Bb >]*[^<]*<a href[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'[<Bb >]*Przeczytaj[^<]*[</Bb >]*[^<]*<a href[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'[<Bb >]*Wi.cej informacji[^<]*[</Bb >]*[^<]*<a href[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'<a href[^>]*>[<Bb >]*Wideo[^<]*[</Bb >]*[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'<a href[^>]*>[<Bb >]*KLIKNIJ TUTAJ[^<]*[</Bb >]*[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: '')
]
feeds = [ preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
# # Tutaj jest wymieniona lista kategorii jakie mozemy otrzymywac z Gazety (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
# # Pomorskiej, po jednej kategorii w wierszu. Jesli na poczatku danego wiersza
# # znajduje sie jeden znak "#", oznacza to ze kategoria jest zakomentowana
# # i nie bedziemy jej otrzymywac. Jesli chcemy ja otrzymywac nalezy usunac
# # znak # z jej wiersza.
# # Jesli subskrybujemy wiecej niz jedna kategorie, na koncu wiersza z kazda
# # kategoria musi sie znajdowac niezakomentowany przecinek, z wyjatkiem
# # ostatniego wiersza - ma byc bez przecinka na koncu.
# # Rekomendowane opcje wyboru kategorii:
# # 1. PomorskaRSS - wiadomosci kazdego typu, lub
# # 2. Region + wybrane miasta, lub
# # 3. Wiadomosci tematyczne.
# # Lista kategorii:
# # PomorskaRSS - wiadomosci kazdego typu, zakomentuj znakiem "#" keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
# # przed odkomentowaniem wiadomosci wybranego typu: remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
(u'PomorskaRSS', u'http://www.pomorska.pl/rss.xml') 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
dict(attrs={'class':'articleFunctions'})]
# # wiadomosci z regionu nie przypisane do okreslonego miasta: feeds = [(u'Wszystkie', u'http://www.pomorska.pl/rss.xml'),
# (u'Region', u'http://www.pomorska.pl/region.xml'), (u'Region', u'http://www.pomorska.pl/region.xml'),
(u'Bydgoszcz', u'http://www.pomorska.pl/bydgoszcz.xml'),
(u'Nakło', u'http://www.pomorska.pl/naklo.xml'),
(u'Koronowo', u'http://www.pomorska.pl/koronowo.xml'),
(u'Solec Kujawski', u'http://www.pomorska.pl/soleckujawski.xml'),
(u'Grudziądz', u'http://www.pomorska.pl/grudziadz.xml'),
(u'Inowrocław', u'http://www.pomorska.pl/inowroclaw.xml'),
(u'Toruń', u'http://www.pomorska.pl/torun.xml'),
(u'Włocławek', u'http://www.pomorska.pl/wloclawek.xml'),
(u'Aleksandrów Kujawski', u'http://www.pomorska.pl/aleksandrow.xml'),
(u'Brodnica', u'http://www.pomorska.pl/brodnica.xml'),
(u'Chełmno', u'http://www.pomorska.pl/chelmno.xml'),
(u'Chojnice', u'http://www.pomorska.pl/chojnice.xml'),
(u'Ciechocinek', u'http://www.pomorska.pl/ciechocinek.xml'),
(u'Golub-Dobrzyń', u'http://www.pomorska.pl/golubdobrzyn.xml'),
(u'Mogilno', u'http://www.pomorska.pl/mogilno.xml'),
(u'Radziejów', u'http://www.pomorska.pl/radziejow.xml'),
(u'Rypin', u'http://www.pomorska.pl/rypin.xml'),
(u'Sępólno', u'http://www.pomorska.pl/sepolno.xml'),
(u'Świecie', u'http://www.pomorska.pl/swiecie.xml'),
(u'Tuchola', u'http://www.pomorska.pl/tuchola.xml'),
(u'Żnin', u'http://www.pomorska.pl/znin.xml'),
(u'Sport', u'http://www.pomorska.pl/sport.xml'),
(u'Zdrowie', u'http://www.pomorska.pl/zdrowie.xml'),
(u'Auto', u'http://www.pomorska.pl/moto.xml'),
(u'Dom', u'http://www.pomorska.pl/dom.xml'),
#(u'Reporta\u017c', u'http://www.pomorska.pl/reportaz.xml'),
(u'Gospodarka', u'http://www.pomorska.pl/gospodarka.xml')]
# # wiadomosci przypisane do miast: def get_cover_url(self):
# (u'Bydgoszcz', u'http://www.pomorska.pl/bydgoszcz.xml'), soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
# (u'Nak\u0142o', u'http://www.pomorska.pl/naklo.xml'), nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
# (u'Koronowo', u'http://www.pomorska.pl/koronowo.xml'), soup = self.index_to_soup(nexturl)
# (u'Solec Kujawski', u'http://www.pomorska.pl/soleckujawski.xml'), self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
# (u'Grudzi\u0105dz', u'http://www.pomorska.pl/grudziadz.xml'), return getattr(self, 'cover_url', self.cover_url)
# (u'Inowroc\u0142aw', u'http://www.pomorska.pl/inowroclaw.xml'),
# (u'Toru\u0144', u'http://www.pomorska.pl/torun.xml'),
# (u'W\u0142oc\u0142awek', u'http://www.pomorska.pl/wloclawek.xml'),
# (u'Aleksandr\u00f3w Kujawski', u'http://www.pomorska.pl/aleksandrow.xml'),
# (u'Brodnica', u'http://www.pomorska.pl/brodnica.xml'),
# (u'Che\u0142mno', u'http://www.pomorska.pl/chelmno.xml'),
# (u'Chojnice', u'http://www.pomorska.pl/chojnice.xml'),
# (u'Ciechocinek', u'http://www.pomorska.pl/ciechocinek.xml'),
# (u'Golub Dobrzy\u0144', u'http://www.pomorska.pl/golubdobrzyn.xml'),
# (u'Mogilno', u'http://www.pomorska.pl/mogilno.xml'),
# (u'Radziej\u00f3w', u'http://www.pomorska.pl/radziejow.xml'),
# (u'Rypin', u'http://www.pomorska.pl/rypin.xml'),
# (u'S\u0119p\u00f3lno', u'http://www.pomorska.pl/sepolno.xml'),
# (u'\u015awiecie', u'http://www.pomorska.pl/swiecie.xml'),
# (u'Tuchola', u'http://www.pomorska.pl/tuchola.xml'),
# (u'\u017bnin', u'http://www.pomorska.pl/znin.xml')
# # wiadomosci tematyczne (redundancja z region/miasta): def append_page(self, soup, appendtag):
# (u'Sport', u'http://www.pomorska.pl/sport.xml'), tag = soup.find('span', attrs={'class':'photoNavigationPages'})
# (u'Zdrowie', u'http://www.pomorska.pl/zdrowie.xml'), if tag:
# (u'Auto', u'http://www.pomorska.pl/moto.xml'), number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
# (u'Dom', u'http://www.pomorska.pl/dom.xml'), baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
# (u'Reporta\u017c', u'http://www.pomorska.pl/reportaz.xml'),
# (u'Gospodarka', u'http://www.pomorska.pl/gospodarka.xml')
]
keep_only_tags = [dict(name='div', attrs={'id':'article'})] for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
r.extract()
for nr in range(2, number+1):
soup2 = self.index_to_soup(baseurl + str(nr))
pagetext = soup2.find(id='photoContainer')
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoMeta'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoStoryText'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
remove_tags = [ def preprocess_html(self, soup):
dict(name='p', attrs={'id':'articleTags'}), self.append_page(soup, soup.body)
dict(name='div', attrs={'id':'articleEpaper'}), return soup
dict(name='div', attrs={'id':'articleConnections'}),
dict(name='div', attrs={'class':'articleFacts'}),
dict(name='div', attrs={'id':'articleExternalLink'}),
dict(name='div', attrs={'id':'articleMultimedia'}),
dict(name='div', attrs={'id':'articleGalleries'}),
dict(name='div', attrs={'id':'articleAlarm'}),
dict(name='div', attrs={'id':'adholder_srodek1'}),
dict(name='div', attrs={'id':'articleVideo'}),
dict(name='a', attrs={'name':'fb_share'})]
extra_css = '''h1 { font-size: 1.4em; }
h2 { font-size: 1.0em; }'''

View File

@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class GazetaWroclawska(BasicNewsRecipe):
    """Recipe for Gazeta Wroclawska, a Polish regional daily."""
    title = u'Gazeta Wroc\u0142awska'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Gazeta Wrocławska. Najnowsze Wiadomości Wrocław, Informacje Wrocław. Czytaj!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/gazetawroclawska.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class': 'czytajDalej'}),
        dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'}),
    ]
    feeds = [
        (u'Fakty24', u'http://gazetawroclawska.feedsportal.com/c/32980/f/533775/index.rss?201302'),
        (u'Region', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_region.xml?201302'),
        (u'Kultura', u'http://gazetawroclawska.feedsportal.com/c/32980/f/533777/index.rss?201302'),
        (u'Sport', u'http://gazetawroclawska.feedsportal.com/c/32980/f/533776/index.rss?201302'),
        (u'Z archiwum', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_zarchiwum.xml?201302'),
        (u'M\xf3j reporter', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_mojreporter.xml?201302'),
        (u'Historia', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_historia.xml?201302'),
        (u'Listy do redakcji', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_listydoredakcji.xml?201302'),
        (u'Na drogach', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_nadrogach.xml?201302'),
    ]

    def print_version(self, url):
        """Point the downloader at the printer-friendly page variant."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        # Interstitial ad pages carry 'Advertisement' in their title; follow
        # the first link through to the real article.
        if 'Advertisement' in soup.title:
            target = soup.find('a')['href']
            return self.index_to_soup(target, raw=True)

    def get_cover_url(self):
        """Scrape the current issue cover image from prasa24.pl."""
        kiosk_soup = self.index_to_soup('http://www.prasa24.pl/gazeta/gazeta-wroclawska/')
        self.cover_url = kiosk_soup.find(id='pojemnik').img['src']
        return self.cover_url

View File

@ -0,0 +1,63 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GazetaWspolczesna(BasicNewsRecipe):
title = u'Gazeta Wsp\xf3\u0142czesna'
__author__ = 'fenuks'
description = u'Gazeta Współczesna - portal regionalny.'
category = 'newspaper'
language = 'pl'
encoding = 'iso-8859-2'
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
INDEX = 'http://www.wspolczesna.pl'
masthead_url = INDEX + '/images/top_logo.png'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds = True
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
dict(attrs={'class':'articleFunctions'})]
feeds = [(u'Wszystkie', u'http://www.wspolczesna.pl/rss.xml'), (u'August\xf3w', u'http://www.wspolczesna.pl/augustow.xml'), (u'Bia\u0142ystok', u'http://www.wspolczesna.pl/bialystok.xml'), (u'Bielsk Podlaski', u'http://www.wspolczesna.pl/bielsk.xml'), (u'E\u0142k', u'http://www.wspolczesna.pl/elk.xml'), (u'Grajewo', u'http://www.wspolczesna.pl/grajewo.xml'), (u'Go\u0142dap', u'http://www.wspolczesna.pl/goldap.xml'), (u'Hajn\xf3wka', u'http://www.wspolczesna.pl/hajnowka.xml'), (u'Kolno', u'http://www.wspolczesna.pl/kolno.xml'), (u'\u0141om\u017ca', u'http://www.wspolczesna.pl/lomza.xml'), (u'Mo\u0144ki', u'http://www.wspolczesna.pl/monki.xml'), (u'Olecko', u'http://www.wspolczesna.pl/olecko.xml'), (u'Ostro\u0142\u0119ka', u'http://www.wspolczesna.pl/ostroleka.xml'), (u'Powiat Bia\u0142ostocki', u'http://www.wspolczesna.pl/powiat.xml'), (u'Sejny', u'http://www.wspolczesna.pl/sejny.xml'), (u'Siemiatycze', u'http://www.wspolczesna.pl/siemiatycze.xml'), (u'Sok\xf3\u0142ka', u'http://www.wspolczesna.pl/sokolka.xml'), (u'Suwa\u0142ki', u'http://www.wspolczesna.pl/suwalki.xml'), (u'Wysokie Mazowieckie', u'http://www.wspolczesna.pl/wysokie.xml'), (u'Zambr\xf3w', u'http://www.wspolczesna.pl/zambrow.xml'), (u'Sport', u'http://www.wspolczesna.pl/sport.xml'), (u'Praca', u'http://www.wspolczesna.pl/praca.xml'), (u'Dom', u'http://www.wspolczesna.pl/dom.xml'), (u'Auto', u'http://www.wspolczesna.pl/auto.xml'), (u'Zdrowie', u'http://www.wspolczesna.pl/zdrowie.xml')]
def get_cover_url(self):
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
soup = self.index_to_soup(nexturl)
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
return getattr(self, 'cover_url', self.cover_url)
def append_page(self, soup, appendtag):
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
if tag:
number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
r.extract()
for nr in range(2, number+1):
soup2 = self.index_to_soup(baseurl + str(nr))
pagetext = soup2.find(id='photoContainer')
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoMeta'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoStoryText'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -6,7 +6,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
title = u'Gazeta.pl' title = u'Gazeta.pl'
__author__ = 'fenuks, Artur Stachecki' __author__ = 'fenuks, Artur Stachecki'
language = 'pl' language = 'pl'
description = 'news from gazeta.pl' description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
category = 'newspaper' category = 'newspaper'
publication_type = 'newspaper' publication_type = 'newspaper'
masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'

83
recipes/gcn.recipe Normal file
View File

@ -0,0 +1,83 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GCN(BasicNewsRecipe):
title = u'Gazeta Codziennej Nowiny'
__author__ = 'fenuks'
description = u'nowiny24.pl - portal regionalny województwa podkarpackiego.'
category = 'newspaper'
language = 'pl'
encoding = 'iso-8859-2'
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
INDEX = 'http://www.nowiny24.pl'
masthead_url = INDEX + '/images/top_logo.png'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds = True
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
dict(attrs={'class':'articleFunctions'})]
feeds = [(u'Wszystkie', u'http://www.nowiny24.pl/rss.xml'),
(u'Podkarpacie', u'http://www.nowiny24.pl/podkarpacie.xml'),
(u'Bieszczady', u'http://www.nowiny24.pl/bieszczady.xml'),
(u'Rzeszów', u'http://www.nowiny24.pl/rzeszow.xml'),
(u'Przemyśl', u'http://www.nowiny24.pl/przemysl.xml'),
(u'Leżajsk', u'http://www.nowiny24.pl/lezajsk.xml'),
(u'Łańcut', u'http://www.nowiny24.pl/lancut.xml'),
(u'Dębica', u'http://www.nowiny24.pl/debica.xml'),
(u'Jarosław', u'http://www.nowiny24.pl/jaroslaw.xml'),
(u'Krosno', u'http://www.nowiny24.pl/krosno.xml'),
(u'Mielec', u'http://www.nowiny24.pl/mielec.xml'),
(u'Nisko', u'http://www.nowiny24.pl/nisko.xml'),
(u'Sanok', u'http://www.nowiny24.pl/sanok.xml'),
(u'Stalowa Wola', u'http://www.nowiny24.pl/stalowawola.xml'),
(u'Tarnobrzeg', u'http://www.nowiny24.pl/tarnobrzeg.xml'),
(u'Sport', u'http://www.nowiny24.pl/sport.xml'),
(u'Dom', u'http://www.nowiny24.pl/dom.xml'),
(u'Auto', u'http://www.nowiny24.pl/auto.xml'),
(u'Praca', u'http://www.nowiny24.pl/praca.xml'),
(u'Zdrowie', u'http://www.nowiny24.pl/zdrowie.xml'),
(u'Wywiady', u'http://www.nowiny24.pl/wywiady.xml')]
def get_cover_url(self):
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
soup = self.index_to_soup(nexturl)
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
return getattr(self, 'cover_url', self.cover_url)
def append_page(self, soup, appendtag):
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
if tag:
number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
r.extract()
for nr in range(2, number+1):
soup2 = self.index_to_soup(baseurl + str(nr))
pagetext = soup2.find(id='photoContainer')
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoMeta'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoStoryText'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class GlosWielkopolski(BasicNewsRecipe):
    """Recipe for Glos Wielkopolski, the regional daily for Poznan."""
    title = u'G\u0142os Wielkopolski'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Głos Wielkopolski. Najnowsze Wiadomości Poznań. Czytaj Informacje Poznań!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/gloswielkopolski.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class': 'czytajDalej'}),
        dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'}),
    ]
    feeds = [
        (u'Wszystkie', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533779/index.rss?201302'),
        (u'Wiadomo\u015bci', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533780/index.rss?201302'),
        (u'Sport', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533781/index.rss?201302'),
        (u'Kultura', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533782/index.rss?201302'),
        (u'Porady', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_porady.xml?201302'),
        (u'Blogi', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_blogi.xml?201302'),
        (u'Nasze akcje', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_naszeakcje.xml?201302'),
        (u'Opinie', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_opinie.xml?201302'),
        (u'Magazyn', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_magazyn.xml?201302'),
    ]

    def print_version(self, url):
        """Point the downloader at the printer-friendly page variant."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        # Interstitial ad pages carry 'Advertisement' in their title; follow
        # the first link through to the real article.
        if 'Advertisement' in soup.title:
            target = soup.find('a')['href']
            return self.index_to_soup(target, raw=True)

    def get_cover_url(self):
        """Scrape the current issue cover image from prasa24.pl."""
        kiosk_soup = self.index_to_soup('http://www.prasa24.pl/gazeta/glos-wielkopolski/')
        self.cover_url = kiosk_soup.find(id='pojemnik').img['src']
        return self.cover_url

View File

@ -11,15 +11,14 @@ class Gram_pl(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
no_stylesheets= True no_stylesheets= True
remove_empty_feeds = True
#extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
keep_only_tags= [dict(id='articleModule')] keep_only_tags= [dict(id='articleModule')]
remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter']})] remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']})]
feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'), feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
(u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'), (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')
(u'Kolektyw- Indie Games', u'http://indie.gram.pl/feed/'), ]
#(u'Kolektyw- Moto Games', u'http://www.motogames.gram.pl/news.rss')
]
def parse_feeds (self): def parse_feeds (self):
feeds = BasicNewsRecipe.parse_feeds(self) feeds = BasicNewsRecipe.parse_feeds(self)

View File

@ -1,20 +1,23 @@
import time
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
class GryOnlinePl(BasicNewsRecipe): class GryOnlinePl(BasicNewsRecipe):
title = u'Gry-Online.pl' title = u'Gry-Online.pl'
__author__ = 'fenuks' __author__ = 'fenuks'
description = 'Gry-Online.pl - computer games' description = u'Wiadomości o grach, recenzje, zapowiedzi. Encyklopedia Gier zawiera opisy gier na PC, konsole Xbox360, PS3 i inne platformy.'
category = 'games' category = 'games'
language = 'pl' language = 'pl'
oldest_article = 13 oldest_article = 13
INDEX= 'http://www.gry-online.pl/' INDEX = 'http://www.gry-online.pl/'
masthead_url='http://www.gry-online.pl/im/gry-online-logo.png' masthead_url = 'http://www.gry-online.pl/im/gry-online-logo.png'
cover_url='http://www.gry-online.pl/im/gry-online-logo.png' cover_url = 'http://www.gry-online.pl/im/gry-online-logo.png'
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets= True no_stylesheets = True
keep_only_tags=[dict(name='div', attrs={'class':['gc660', 'gc660 S013']})] keep_only_tags = [dict(name='div', attrs={'class':['gc660', 'gc660 S013', 'news_endpage_tit', 'news_container', 'news']})]
remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})] remove_tags = [dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})]
feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')] feeds = [
(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'),
('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]
def append_page(self, soup, appendtag): def append_page(self, soup, appendtag):
@ -24,7 +27,14 @@ class GryOnlinePl(BasicNewsRecipe):
url_part = soup.find('link', attrs={'rel':'canonical'})['href'] url_part = soup.find('link', attrs={'rel':'canonical'})['href']
url_part = url_part[25:].rpartition('?')[0] url_part = url_part[25:].rpartition('?')[0]
for nexturl in nexturls[1:-1]: for nexturl in nexturls[1:-1]:
soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href']) finalurl = 'http://www.gry-online.pl/' + url_part + nexturl['href']
for i in range(10):
try:
soup2 = self.index_to_soup(finalurl)
break
except:
print 'retrying in 0.5s'
time.sleep(0.5)
pagetext = soup2.find(attrs={'class':'gc660'}) pagetext = soup2.find(attrs={'class':'gc660'})
for r in pagetext.findAll(name='header'): for r in pagetext.findAll(name='header'):
r.extract() r.extract()
@ -34,7 +44,42 @@ class GryOnlinePl(BasicNewsRecipe):
appendtag.insert(pos, pagetext) appendtag.insert(pos, pagetext)
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}): for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}):
r.extract() r.extract()
else:
tag = appendtag.find('div', attrs={'class':'S018stronyr'})
if tag:
nexturl = tag.a
url_part = soup.find('link', attrs={'rel':'canonical'})['href']
url_part = url_part[25:].rpartition('?')[0]
while tag:
end = tag.find(attrs={'class':'right left-dead'})
if end:
break
else:
nexturl = tag.a
finalurl = 'http://www.gry-online.pl/' + url_part + nexturl['href']
for i in range(10):
try:
soup2 = self.index_to_soup(finalurl)
break
except:
print 'retrying in 0.5s'
time.sleep(0.5)
tag = soup2.find('div', attrs={'class':'S018stronyr'})
pagetext = soup2.find(attrs={'class':'gc660'})
for r in pagetext.findAll(name='header'):
r.extract()
for r in pagetext.findAll(attrs={'itemprop':'description'}):
r.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony']}):
r.extract()
def image_url_processor(self, baseurl, url):
if url.startswith('..'):
return url[2:]
else:
return url
def preprocess_html(self, soup): def preprocess_html(self, soup):
self.append_page(soup, soup.body) self.append_page(soup, soup.body)

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
''' '''
harpers.org - paid subscription/ printed issue articles harpers.org - paid subscription/ printed issue articles
This recipe only get's article's published in text format This recipe only get's article's published in text format
@ -72,7 +72,8 @@ class Harpers_full(BasicNewsRecipe):
#go to the current issue #go to the current issue
soup1 = self.index_to_soup(currentIssue_url) soup1 = self.index_to_soup(currentIssue_url)
date = re.split('\s\|\s',self.tag_to_string(soup1.head.title.string))[0] currentIssue_title = self.tag_to_string(soup1.head.title.string)
date = re.split('\s\|\s',currentIssue_title)[0]
self.timefmt = u' [%s]'%date self.timefmt = u' [%s]'%date
#get cover #get cover
@ -84,27 +85,23 @@ class Harpers_full(BasicNewsRecipe):
count = 0 count = 0
for item in soup1.findAll('div', attrs={'class':'articleData'}): for item in soup1.findAll('div', attrs={'class':'articleData'}):
text_links = item.findAll('h2') text_links = item.findAll('h2')
for text_link in text_links: if text_links:
if count == 0: for text_link in text_links:
count = 1 if count == 0:
else: count = 1
url = text_link.a['href'] else:
title = text_link.a.contents[0] url = text_link.a['href']
date = strftime(' %B %Y') title = self.tag_to_string(text_link.a)
articles.append({ date = strftime(' %B %Y')
'title' :title articles.append({
,'date' :date 'title' :title
,'url' :url ,'date' :date
,'description':'' ,'url' :url
}) ,'description':''
return [(soup1.head.title.string, articles)] })
return [(currentIssue_title, articles)]
def print_version(self, url): def print_version(self, url):
return url + '?single=1' return url + '?single=1'
def cleanup(self):
soup = self.index_to_soup('http://harpers.org/')
signouturl=self.tag_to_string(soup.find('li', attrs={'class':'subLogOut'}).findNext('li').a['href'])
self.log(signouturl)
self.browser.open(signouturl)

View File

@ -8,7 +8,6 @@ hatalska.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import re
class hatalska(BasicNewsRecipe): class hatalska(BasicNewsRecipe):
title = u'Hatalska' title = u'Hatalska'

View File

@ -41,13 +41,16 @@ class TheHindu(BasicNewsRecipe):
if current_section and x.get('class', '') == 'tpaper': if current_section and x.get('class', '') == 'tpaper':
a = x.find('a', href=True) a = x.find('a', href=True)
if a is not None: if a is not None:
title = self.tag_to_string(a)
self.log('\tFound article:', title)
current_articles.append({'url':a['href']+'?css=print', current_articles.append({'url':a['href']+'?css=print',
'title':self.tag_to_string(a), 'date': '', 'title':title, 'date': '',
'description':''}) 'description':''})
if x.name == 'h3': if x.name == 'h3':
if current_section and current_articles: if current_section and current_articles:
feeds.append((current_section, current_articles)) feeds.append((current_section, current_articles))
current_section = self.tag_to_string(x) current_section = self.tag_to_string(x)
self.log('Found section:', current_section)
current_articles = [] current_articles = []
return feeds return feeds

Binary file not shown.

Before

Width:  |  Height:  |  Size: 389 B

After

Width:  |  Height:  |  Size: 887 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 391 B

After

Width:  |  Height:  |  Size: 772 B

BIN
recipes/icons/biweekly.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 603 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 837 B

After

Width:  |  Height:  |  Size: 364 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 603 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 865 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 461 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 481 B

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 414 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 431 B

BIN
recipes/icons/echo_dnia.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 760 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.6 KiB

After

Width:  |  Height:  |  Size: 946 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 762 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.4 KiB

After

Width:  |  Height:  |  Size: 2.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 991 B

After

Width:  |  Height:  |  Size: 737 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 398 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 470 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 921 B

BIN
recipes/icons/gcn.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 554 B

BIN
recipes/icons/gildia_pl.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 446 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 357 B

After

Width:  |  Height:  |  Size: 936 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.0 KiB

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 483 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 354 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.0 KiB

After

Width:  |  Height:  |  Size: 610 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.1 KiB

After

Width:  |  Height:  |  Size: 966 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 894 B

After

Width:  |  Height:  |  Size: 299 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 KiB

BIN
recipes/icons/nowy_obywatel.png Executable file

Binary file not shown.

After

Width:  |  Height:  |  Size: 480 B

BIN
recipes/icons/nto.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 416 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 881 B

After

Width:  |  Height:  |  Size: 926 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.8 KiB

After

Width:  |  Height:  |  Size: 1.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 605 B

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 834 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 537 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 856 B

View File

@ -35,5 +35,5 @@ class InteriaFakty(BasicNewsRecipe):
dict(name='span', attrs={'class':'keywords'})] dict(name='span', attrs={'class':'keywords'})]
extra_css = ''' extra_css = '''
h2 { font-size: 1.2em; } h2 { font-size: 1.2em; }
''' '''

55
recipes/jazzpress.recipe Normal file
View File

@ -0,0 +1,55 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = u'Łukasz Grąbczewski 2011-2013'
__version__ = '2.0'
import re, os
from calibre import walk
from calibre.utils.zipfile import ZipFile
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
class jazzpress(BasicNewsRecipe):
    """Recipe for JazzPRESS, a Polish online jazz magazine.

    The publisher distributes each issue as a ready-made EPUB, so instead of
    scraping articles this recipe downloads that EPUB, unpacks it and hands
    its OPF file to calibre's conversion pipeline via build_index().
    """
    __author__ = u'Łukasz Grąbczewski'
    title = 'JazzPRESS'
    language = 'pl'
    publisher = 'Fundacja Popularyzacji Muzyki Jazzowej EuroJAZZ'
    publication_type = 'magazine'
    description = u'Internetowa gazeta poświęcona muzyce improwizowanej'

    conversion_options = {
        'authors'                     : 'Fundacja Popularyzacji Muzyki Jazzowej EuroJAZZ'
        ,'publisher'                  : publisher
        ,'language'                   : language
        ,'preserve_cover_aspect_ratio': True
        ,'remove_first_image'         : True
    }

    def build_index(self):
        """Download the current issue's EPUB and return the path to its OPF.

        Returns the .opf path inside the unpacked EPUB, which calibre uses
        as the input of the conversion; returns None if no OPF is found.
        """
        browser = self.get_browser()
        browser.open('http://radiojazz.fm/')

        # Find the link to the current issue's EPUB.
        # Use a raw string: the old "'...\d\d\d\d\_epub'" literal contained
        # the invalid string escape '\_' and relied on '\d' surviving only by
        # accident. In regex syntax, '\_' and '_' match the same character.
        epublink = browser.find_link(url_regex=re.compile(r'e_jazzpress\d\d\d\d_epub'))

        # Download the ebook into a persistent temp file so it outlives
        # this method and can be read by the unpacking step below.
        self.report_progress(0, _('Downloading ePUB'))
        response = browser.follow_link(epublink)
        book_file = PersistentTemporaryFile(suffix='.epub')
        book_file.write(response.read())
        book_file.close()

        # Unpack the EPUB into the recipe's INPUT directory.
        self.report_progress(0.2, _('Converting to OEB'))
        oeb = self.output_dir + '/INPUT/'
        if not os.path.exists(oeb):
            os.makedirs(oeb)
        with ZipFile(book_file.name) as f:
            f.extractall(path=oeb)

        # The OPF file is the entry point for the conversion pipeline.
        for f in walk(oeb):
            if f.endswith('.opf'):
                return f

View File

@ -7,6 +7,10 @@ class AdvancedUserRecipe1295262156(BasicNewsRecipe):
oldest_article = 7 oldest_article = 7
language = 'de' language = 'de'
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True
auto_cleanup = True
encoding='iso-8859-1'
feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')] feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')]

View File

@ -7,7 +7,7 @@ class Konflikty(BasicNewsRecipe):
__author__ = 'fenuks' __author__ = 'fenuks'
cover_url = 'http://www.konflikty.pl/images/tapety_logo.jpg' cover_url = 'http://www.konflikty.pl/images/tapety_logo.jpg'
language = 'pl' language = 'pl'
description ='military news' description = u'Zbiór ciekawych artykułów historycznych, militarnych oraz recenzji książek, gier i filmów. Najświeższe informacje o lotnictwie, wojskach lądowych i polityce.'
category='military, history' category='military, history'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100

View File

@ -6,74 +6,75 @@ import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
class KopalniaWiedzy(BasicNewsRecipe): class KopalniaWiedzy(BasicNewsRecipe):
title = u'Kopalnia Wiedzy' title = u'Kopalnia Wiedzy'
publisher = u'Kopalnia Wiedzy' publisher = u'Kopalnia Wiedzy'
description = u'Ciekawostki ze świata nauki i techniki' description = u'Ciekawostki ze świata nauki i techniki'
encoding = 'utf-8' encoding = 'utf-8'
__author__ = 'Attis & Tomasz Długosz' __author__ = 'Attis & Tomasz Długosz'
language = 'pl' language = 'pl'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
INDEX = u'http://kopalniawiedzy.pl/' INDEX = u'http://kopalniawiedzy.pl/'
remove_javascript = True remove_javascript = True
no_stylesheets = True remove_empty_feeds = True
no_stylesheets = True
remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}] remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}]
remove_tags_after = dict(attrs={'class':'ad-square'}) remove_tags_after = dict(attrs={'class':'ad-square'})
keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})] keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})]
extra_css = '.topimage {margin-top: 30px}' extra_css = '.topimage {margin-top: 30px}'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'), (re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
lambda match: '<img class="topimage" ' + match.group(1) + '>' ), lambda match: '<img class="topimage" ' + match.group(1) + '>' ),
(re.compile(u'<br /><br />'), (re.compile(u'<br /><br />'),
lambda match: '<br\/>') lambda match: '<br\/>')
]
feeds = [
(u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
(u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
(u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
] ]
def is_link_wanted(self, url, tag): feeds = [
return tag['class'] == 'next' (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
(u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
(u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
]
def remove_beyond(self, tag, next): def is_link_wanted(self, url, tag):
while tag is not None and getattr(tag, 'name', None) != 'body': return tag['class'] == 'next'
after = getattr(tag, next)
while after is not None:
ns = getattr(tag, next)
after.extract()
after = ns
tag = tag.parent
def append_page(self, soup, appendtag, position): def remove_beyond(self, tag, next):
pager = soup.find('a',attrs={'class':'next'}) while tag is not None and getattr(tag, 'name', None) != 'body':
if pager: after = getattr(tag, next)
nexturl = self.INDEX + pager['href'] while after is not None:
soup2 = self.index_to_soup(nexturl) ns = getattr(tag, next)
texttag = soup2.find('div', attrs={'id':'articleContent'}) after.extract()
after = ns
tag = tag.parent
tag = texttag.find(attrs={'class':'pages'}) def append_page(self, soup, appendtag, position):
self.remove_beyond(tag, 'nextSibling') pager = soup.find('a',attrs={'class':'next'})
if pager:
nexturl = self.INDEX + pager['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'id':'articleContent'})
newpos = len(texttag.contents) tag = texttag.find(attrs={'class':'pages'})
self.append_page(soup2,texttag,newpos) self.remove_beyond(tag, 'nextSibling')
appendtag.insert(position,texttag) newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
appendtag.insert(position,texttag)
def preprocess_html(self, soup): def preprocess_html(self, soup):
self.append_page(soup, soup.body, 3) self.append_page(soup, soup.body, 3)
for item in soup.findAll('div',attrs={'class':'pages'}): for item in soup.findAll('div',attrs={'class':'pages'}):
item.extract() item.extract()
for item in soup.findAll('p', attrs={'class':'wykop'}): for item in soup.findAll('p', attrs={'class':'wykop'}):
item.extract() item.extract()
return soup return soup

View File

@ -24,17 +24,16 @@ class KorespondentPL(BasicNewsRecipe):
extra_css = '.naglowek {font-size: small}\n .tytul {font-size: x-large; padding-bottom: 10px; padding-top: 30px} \n .external {font-size: small}' extra_css = '.naglowek {font-size: small}\n .tytul {font-size: x-large; padding-bottom: 10px; padding-top: 30px} \n .external {font-size: small}'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<a href="index\.php.*>(.*)</a>'), (re.compile(u'<a href="index\.php.*>(.*)</a>'),
lambda match: match.group(1) ), lambda match: match.group(1) ),
(re.compile(u'<i>'), (re.compile(u'<i>'),
lambda match:'<i class="external">' ), lambda match:'<i class="external">' ),
(re.compile(u'<p></p>Więcej'), (re.compile(u'<p></p>Więcej'),
lambda match:'Więcej' ), lambda match:'Więcej' ),
(re.compile(u'target="_blank"'), (re.compile(u'target="_blank"'),
lambda match:'target="_blank" class="external"' ), lambda match:'target="_blank" class="external"' ),
(re.compile(u'<p align="center">\nPoczytaj inne teksty w <a href="http://www.korespondent.pl">Serwisie wolnorynkowym Korespondent.pl</a>.*</body>', re.DOTALL|re.IGNORECASE), (re.compile(u'<p align="center">\nPoczytaj inne teksty w <a href="http://www.korespondent.pl">Serwisie wolnorynkowym Korespondent.pl</a>.*</body>', re.DOTALL|re.IGNORECASE),
lambda match: '</div></body>'), lambda match: '</div></body>'),
] ]
feeds = [(u'Serwis informacyjny', u'http://korespondent.pl/rss.xml')] feeds = [(u'Serwis informacyjny', u'http://korespondent.pl/rss.xml')]

View File

@ -7,7 +7,7 @@ class Kosmonauta(BasicNewsRecipe):
description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.' description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.'
category = 'astronomy' category = 'astronomy'
language = 'pl' language = 'pl'
cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg' cover_url = 'http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
no_stylesheets = True no_stylesheets = True
INDEX = 'http://www.kosmonauta.net' INDEX = 'http://www.kosmonauta.net'
oldest_article = 7 oldest_article = 7
@ -24,6 +24,5 @@ class Kosmonauta(BasicNewsRecipe):
href = a['href'] href = a['href']
if not href.startswith('http'): if not href.startswith('http'):
a['href'] = self.INDEX + href a['href'] = self.INDEX + href
print '%%%%%%%%%%%%%%%%%%%%%%%%%', a['href']
return soup return soup

View File

@ -3,7 +3,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs
class KurierGalicyjski(BasicNewsRecipe): class KurierGalicyjski(BasicNewsRecipe):
title = u'Kurier Galicyjski' title = u'Kurier Galicyjski'
__author__ = 'fenuks' __author__ = 'fenuks'
#description = u'' description = u'Kurier Galicyjski - największa gazeta dla Polaków na Ukrainie. Bieżące wydarzenia z życia polskiej mniejszości, historia, kultura, polityka, reportaże.'
category = 'news' category = 'news'
language = 'pl' language = 'pl'
cover_url = 'http://www.duszki.pl/Kurier_galicyjski_bis2_small.gif' cover_url = 'http://www.duszki.pl/Kurier_galicyjski_bis2_small.gif'

View File

@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class KurierLubelski(BasicNewsRecipe):
    """Recipe for Kurier Lubelski, a regional Polish daily from Lublin."""
    title = u'Kurier Lubelski'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Kurier Lubelski. Najnowsze Wiadomości Lublin. Czytaj Informacje Lublin!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/kurierlubelski.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    remove_tags_after = dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})
    remove_tags = [dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})]
    feeds = [(u'Wiadomo\u015bci', u'http://kurierlubelski.feedsportal.com/c/32980/f/533785/index.rss?201302'), (u'Region', u'http://www.kurierlubelski.pl/rss/kurierlubelski_region.xml?201302'), (u'Sport', u'http://kurierlubelski.feedsportal.com/c/32980/f/533786/index.rss?201302'), (u'Kultura', u'http://kurierlubelski.feedsportal.com/c/32980/f/533787/index.rss?201302'), (u'Rozmaito\u015bci', u'http://www.kurierlubelski.pl/rss/kurierlubelski_rozmaitosci.xml?201302'), (u'Dom', u'http://www.kurierlubelski.pl/rss/kurierlubelski_dom.xml?201302'), (u'Serwisy', u'http://www.kurierlubelski.pl/rss/kurierlubelski_serwisy.xml?201302'), (u'Motofakty', u'http://www.kurierlubelski.pl/rss/kurierlubelski_motofakty.xml?201302'), (u'M\xf3j Reporter', u'http://www.kurierlubelski.pl/rss/kurierlubelski_mojreporter.xml?201302'), (u'Praca', u'http://www.kurierlubelski.pl/rss/kurierlubelski_praca.xml?201302')]

    def print_version(self, url):
        # The print layout of an article is the same URL with 'artykul'
        # swapped for 'drukuj'.
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        # Interstitial ad pages carry 'Advertisement' in their <title>;
        # follow the first link on such a page to reach the real article.
        # Guard against pages with no <title> at all: soup.title is None
        # there and the old unguarded membership test raised TypeError.
        if soup.title is not None and 'Advertisement' in soup.title:
            nexturl = soup.find('a')['href']
            return self.index_to_soup(nexturl, raw=True)

    def get_cover_url(self):
        """Scrape the current cover image from the prasa24.pl catalogue page."""
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/kurier-lubelski/')
        self.cover_url = soup.find(id='pojemnik').img['src']
        # cover_url was assigned just above, so return it directly; the old
        # getattr(self, 'cover_url', self.cover_url) was a no-op whose
        # default was eagerly evaluated anyway.
        return self.cover_url

View File

@ -0,0 +1,78 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class KurierPoranny(BasicNewsRecipe):
title = u'Kurier Poranny'
__author__ = 'fenuks'
description = u'Kurier Poranny | poranny.pl - portal miejski Białegostoku,informacje,wydarzenia'
category = 'newspaper'
language = 'pl'
encoding = 'iso-8859-2'
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
INDEX = 'http://www.poranny.pl'
masthead_url = INDEX + '/images/top_logo.png'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds = True
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
dict(attrs={'class':'articleFunctions'})]
feeds = [(u'Wszystkie', u'http://www.poranny.pl/rss.xml'),
(u'Białystok', u'http://www.poranny.pl/bialystok.xml'),
(u'Bielsk Podlaski', u'http://www.poranny.pl/bielskpodlaski.xml'),
(u'Czarna Białostocka', u'http://www.poranny.pl/czarnabialostocka.xml'),
(u'Hajnówka', u'http://www.poranny.pl/hajnowka.xml'),
(u'Łapy', u'http://www.poranny.pl/lapy.xml'),
(u'Sokółka', u'http://www.poranny.pl/sokolka.xml'),
(u'Supraśl', u'http://www.poranny.pl/suprasl.xml'),
(u'Wasilków', u'http://www.poranny.pl/wasilkow.xml'),
(u'Sport', u'http://www.poranny.pl/sport.xml'),
(u'Praca', u'http://www.poranny.pl/praca.xml'),
(u'Kultura', u'http://www.poranny.pl/kultura.xml'),
(u'Dom', u'http://www.poranny.pl/dom.xml'),
(u'Auto', u'http://www.poranny.pl/auto.xml'),
(u'Polityka', u'http://www.poranny.pl/polityka.xml')]
def get_cover_url(self):
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
soup = self.index_to_soup(nexturl)
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
return getattr(self, 'cover_url', self.cover_url)
def append_page(self, soup, appendtag):
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
if tag:
number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
r.extract()
for nr in range(2, number+1):
soup2 = self.index_to_soup(baseurl + str(nr))
pagetext = soup2.find(id='photoContainer')
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoMeta'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoStoryText'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -0,0 +1,27 @@
from calibre.web.feeds.news import BasicNewsRecipe
class KurierSzczecinski(BasicNewsRecipe):
    # Recipe for Kurier Szczecinski (24kurier.pl), a regional daily covering
    # Szczecin and West Pomerania. Purely declarative: all fetching and
    # cleanup is driven by BasicNewsRecipe from the settings below.
    title = u'Kurier Szczeci\u0144ski'
    __author__ = 'fenuks'
    description = u'24Kurier jest portalem Kuriera Szczecińskiego. Zawiera aktualności ze Szczecina oraz wiadomości regionalne z województwa zachodniopomorskiego.'
    category = 'newspaper'
    #publication_type = ''
    language = 'pl'
    #encoding = ''
    #extra_css = ''
    # Site logo used as the cover; no per-issue cover scraping is done.
    cover_url = 'http://www.24kurier.pl/Administracja/Img/24kurier_logo-copy-po-zapis'
    #masthead_url = ''
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    # Drop inline styling so the ebook stylesheet applies cleanly.
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}
    # Keep only the article container; strip icons, ads and the comment
    # widget (and everything after the comments).
    keep_only_tags = [dict(attrs={'class':'section'})]
    remove_tags = [dict(attrs={'class':['Ikonki', 'rek', 'artComments']})]
    remove_tags_after = dict(attrs={'class':'artComments'})
    #remove_tags_before = dict()
    # One RSS feed per section/town, all served by the same endpoint and
    # distinguished by the 'dzial' query parameter.
    feeds = [(u'Aktualno\u015bci', u'http://www.24kurier.pl/cmspages/articles_rss.aspx'), (u'Kraj', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kraj'), (u'\u015awiat', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=swiat'), (u'Sport', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=sport'), (u'Kultura', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kultura'), (u'Gospodarka', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=gospodarka'), (u'Nauka', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=nauka'), (u'Region', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=region'), (u'Szczecin', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=szczecin'), (u'Bia\u0142ogard', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=bialogard'), (u'Choszczno', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=choszczno'), (u'Drawsko', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=drawsko'), (u'Goleni\xf3w', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=goleniow'), (u'Gryfice', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=gryfice'), (u'Gryfino', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=gryfino'), (u'Kamie\u0144 Pomorski', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kamien'), (u'Ko\u0142obrzeg', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kolobrzeg'), (u'Koszalin', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=koszalin'), (u'\u0141obez', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=lobez'), (u'My\u015blib\xf3rz', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=mysliborz'), (u'Police', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=police'), (u'Pyrzyce', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=pyrzyce'), (u'S\u0142awno', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=slawno'), (u'Stargard',
    u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=stargard'), (u'Szczecinek', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=szczecinek'), (u'\u015awidwin', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=swidwin'), (u'\u015awinouj\u015bcie', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=swinoujscie'), (u'Wa\u0142cz', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=walcz')]

View File

@ -19,7 +19,7 @@ class Kyungyhang(BasicNewsRecipe):
keep_only_tags = [ keep_only_tags = [
dict(name='div', attrs ={'class':['article_title_wrap']}), dict(name='div', attrs ={'class':['article_title_wrap']}),
dict(name='div', attrs ={'class':['article_txt']}) dict(name='span', attrs ={'class':['article_txt']})
] ]
remove_tags_after = dict(id={'sub_bottom'}) remove_tags_after = dict(id={'sub_bottom'})

27
recipes/lamebook.recipe Normal file
View File

@ -0,0 +1,27 @@
from calibre.web.feeds.news import BasicNewsRecipe
class LamebookRecipe(BasicNewsRecipe):
    # Recipe for Lamebook, a humour blog of funny Facebook screenshots,
    # fetched through its Feedburner RSS feed. Purely declarative: all
    # fetching and cleanup is driven by BasicNewsRecipe settings.
    title = 'Lamebook'
    __author__ = 'atordo'
    description = 'Funny Facebook Statuses, Fails, LOLs and More - The Original'
    cover_url = 'http://www.lamebook.com/wp-content/themes/lamebook/images/h1-new2.png'
    oldest_article = 7
    max_articles_per_feed = 50
    # Manual keep/remove rules below are used instead of auto cleanup.
    auto_cleanup = False
    no_stylesheets = True
    remove_javascript = True
    language = 'en'
    use_embedded_content = False
    publication_type = 'blog'

    # Keep only the post body and its comment list.
    keep_only_tags = [
        dict(name='div', attrs={'class':'entry'})
        ,dict(name='ol', attrs={'class':'commentlist'})
    ]

    # Drop floated share/ad containers, matched by their exact inline style.
    remove_tags = [
        dict(name='div', attrs={'style':['clear: left; float: left; margin: 0 15px 0 0;'
            ,'float: left; margin: 0 15px 0 0;']})
    ]

    feeds = [('Lamebook', 'http://feeds.feedburner.com/Lamebook')]

Some files were not shown because too many files have changed in this diff Show More