diff --git a/resources/recipes/esenja.recipe b/resources/recipes/esenja.recipe
new file mode 100644
index 0000000000..b8b94ad66e
--- /dev/null
+++ b/resources/recipes/esenja.recipe
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Esensja(BasicNewsRecipe):
+
+ title = u'Esensja'
+ __author__ = 'matek09'
+ description = 'Monthly magazine'
+ encoding = 'utf-8'
+ no_stylesheets = True
+ language = 'pl'
+ remove_javascript = True
+ HREF = '0'
+
+ #keep_only_tags =[]
+ #keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'}))
+ remove_tags_before = dict(name = 'div', attrs = {'class' : 't-title'})
+ remove_tags_after = dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'})
+
+ remove_tags =[]
+ remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_top.gif'}))
+ remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}))
+
+ extra_css = '''
+ .t-title {font-size: x-large; font-weight: bold; text-align: left}
+ .t-author {font-size: x-small; text-align: left}
+ .t-title2 {font-size: x-small; font-style: italic; text-align: left}
+ .text {font-size: small; text-align: left}
+ .annot-ref {font-style: italic; text-align: left}
+ '''
+
+ preprocess_regexps = [(re.compile(r'alt="[^"]*"'),
+ lambda match: '')]
+
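+ # The index is built by walking the issue's table of contents: find the
+ # newest issue link in the archive, then iterate over 'chapter',
+ # 'subchapter' and 'n-title' tags, starting a new feed whenever a new
+ # (sub)chapter cell begins and following 't-title2 nextpage' links so
+ # multi-page articles are added page by page.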
+ def parse_index(self):
+ soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
+ a = soup.find('a', attrs={'href' : re.compile('.*/index.html')})
+ year = a['href'].split('/')[0]
+ month = a['href'].split('/')[1]
+ self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
+ soup = self.index_to_soup(self.HREF + '01.html')
+ self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg'
+ feeds = []
+ intro = soup.find('div', attrs={'class' : 'n-title'})
+ introduction = {'title' : self.tag_to_string(intro.a),
+ 'url' : self.HREF + intro.a['href'],
+ 'date' : '',
+ 'description' : ''}
+ chapter = 'Wprowadzenie'
+ subchapter = ''
+ articles = []
+ articles.append(introduction)
+ for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
+ if tag.name == 'td':
+ if len(articles) > 0:
+ section = chapter
+ if len(subchapter) > 0:
+ section += ' - ' + subchapter
+ feeds.append((section, articles))
+ articles = []
+ if tag['class'] == 'chapter':
+ chapter = self.tag_to_string(tag).capitalize()
+ subchapter = ''
+ else:
+ subchapter = self.tag_to_string(tag)
+ continue
+ articles.append({'title' : self.tag_to_string(tag.a), 'url' : self.HREF + tag.a['href'], 'date' : '', 'description' : ''})
+
+ page_soup = self.index_to_soup(self.HREF + tag.a['href'])
+ i = 1
+ while True:
+ div = page_soup.find('div', attrs={'class' : 't-title2 nextpage'})
+ if div is not None:
+ page_soup = self.index_to_soup(self.HREF + div.a['href'])
+ articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. ' + str(i), 'url' : self.HREF + div.a['href'], 'date' : '', 'description' : ''})
+ i += 1
+ else:
+ break
+
+ if len(articles) > 0:
+ section = chapter
+ if len(subchapter) > 0:
+ section += ' - ' + subchapter
+ feeds.append((section, articles))
+ return feeds
diff --git a/resources/recipes/histmag.recipe b/resources/recipes/histmag.recipe
new file mode 100644
index 0000000000..38956e7995
--- /dev/null
+++ b/resources/recipes/histmag.recipe
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Histmag(BasicNewsRecipe):
+
+ title = u'Histmag'
+ __author__ = 'matek09'
+ description = u"Artykuly historyczne i publicystyczne"
+ encoding = 'utf-8'
+ no_stylesheets = True
+ language = 'pl'
+ remove_javascript = True
+ #max_articles_per_feed = 1
+ remove_tags_before = dict(name = 'div', attrs = {'id' : 'article'})
+ remove_tags_after = dict(name = 'h2', attrs = {'class' : 'komentarze'})
+ #keep_only_tags =[]
+ #keep_only_tags.append(dict(name = 'h2'))
+ #keep_only_tags.append(dict(name = 'p'))
+
+ remove_tags =[]
+ remove_tags.append(dict(name = 'p', attrs = {'class' : 'podpis'}))
+ remove_tags.append(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
+ remove_tags.append(dict(name = 'img', attrs = {'src' : 'style/buttons/wesprzyjnas-1.jpg'}))
+
+ preprocess_regexps = [(re.compile(r''), lambda match: '<br>'),
+ (re.compile(r''), lambda match: '<br>')]
+ extra_css = '''
+ .left {font-size: x-small}
+ .right {font-size: x-small}
+ '''
+
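+ # Each 'div.text' block on a listing page carries the article link in its
+ # h3, a paragraph starting with the date (split off at '|'), and a
+ # description paragraph.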
+ def find_articles(self, soup):
+ articles = []
+ for div in soup.findAll('div', attrs={'class' : 'text'}):
+ articles.append({
+ 'title' : self.tag_to_string(div.h3.a),
+ 'url' : 'http://www.histmag.org/' + div.h3.a['href'],
+ 'date' : self.tag_to_string(div.next('p')).split('|')[0],
+ 'description' : self.tag_to_string(div.next('p', podpis=False)),
+ })
+ return articles
+
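+ # Three archive listings are scraped into separate feeds: historical
+ # articles (arc=4), opinion pieces (arc=5) and events (arc=1).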
+ def parse_index(self):
+ soup = self.index_to_soup('http://histmag.org/?arc=4&dx=0')
+ feeds = []
+ feeds.append((u"Artykuly historyczne", self.find_articles(soup)))
+ soup = self.index_to_soup('http://histmag.org/?arc=5&dx=0')
+ feeds.append((u"Artykuly publicystyczne", self.find_articles(soup)))
+ soup = self.index_to_soup('http://histmag.org/?arc=1&dx=0')
+ feeds.append((u"Wydarzenia", self.find_articles(soup)))
+
+ return feeds
diff --git a/resources/recipes/newsweek_polska.recipe b/resources/recipes/newsweek_polska.recipe
index 31dd8ccddd..4227a88026 100644
--- a/resources/recipes/newsweek_polska.recipe
+++ b/resources/recipes/newsweek_polska.recipe
@@ -1,19 +1,22 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com'
from calibre.web.feeds.news import BasicNewsRecipe
class Newsweek(BasicNewsRecipe):
- EDITION = 0
+ FIND_LAST_FULL_ISSUE = True
+ EDITION = '0'
+ EXCLUDE_LOCKED = True
+ LOCKED_ICO = 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif'
title = u'Newsweek Polska'
- __author__ = 'Mateusz Kielar'
+ __author__ = 'matek09'
description = 'Weekly magazine'
encoding = 'utf-8'
no_stylesheets = True
- language = 'en'
+ language = 'pl'
remove_javascript = True
keep_only_tags =[]
@@ -33,34 +36,54 @@ class Newsweek(BasicNewsRecipe):
def print_version(self, url):
return url.replace("http://www.newsweek.pl/artykuly/wydanie/" + str(self.EDITION), "http://www.newsweek.pl/artykuly") + '/print'
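+ # An article link is considered locked when the image that follows it is
+ # the padlock icon; an issue is considered full (free to read) when at
+ # most one padlock icon appears on its contents page.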
+ def is_locked(self, a):
+ return a.findNext('img')['src'] == self.LOCKED_ICO
+
+ def is_full(self, issue_soup):
+ return len(issue_soup.findAll('img', attrs={'src' : self.LOCKED_ICO})) <= 1
+
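+ # Walk the issue-cover frames backwards, issue by issue, until one whose
+ # contents page passes is_full() is found, and remember its EDITION id.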
def find_last_full_issue(self):
- page = self.index_to_soup('http://www.newsweek.pl/Frames/IssueCover.aspx')
- issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
- page = self.index_to_soup(issue)
- issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
- page = self.index_to_soup(issue)
- self.EDITION = page.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+ frame_url = 'http://www.newsweek.pl/Frames/IssueCover.aspx'
+ while True:
+ frame_soup = self.index_to_soup(frame_url)
+ self.EDITION = frame_soup.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+ issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
+ if self.is_full(issue_soup):
+ break
+ frame_url = 'http://www.newsweek.pl/Frames/' + frame_soup.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
+
+
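+ # With FIND_LAST_FULL_ISSUE enabled the newest fully unlocked issue is
+ # used; otherwise the preset EDITION is fetched directly.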
def parse_index(self):
- self.find_last_full_issue()
- soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + str(self.EDITION))
+ if self.FIND_LAST_FULL_ISSUE:
+ self.find_last_full_issue()
+ soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
img = soup.find('img', id="ctl00_C1_PaperIsssueView_IssueImage", src=True)
self.cover_url = img['src']
feeds = []
parent = soup.find(id='content-left-big')
for txt in parent.findAll(attrs={'class':'txt_normal_red strong'}):
- section = self.tag_to_string(txt).capitalize()
articles = list(self.find_articles(txt))
- feeds.append((section, articles))
+ if len(articles) > 0:
+ section = self.tag_to_string(txt).capitalize()
+ feeds.append((section, articles))
return feeds
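+ # Article links follow the section header until the next 'div' element;
+ # locked articles can be skipped when an incomplete issue is being fetched.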
def find_articles(self, txt):
for a in txt.findAllNext( attrs={'class':['strong','hr']}):
if a.name in "div":
break
+ if not self.FIND_LAST_FULL_ISSUE and self.EXCLUDE_LOCKED and self.is_locked(a):
+ continue
yield {
'title' : self.tag_to_string(a),
- 'url' : 'http://www.newsweek.pl'+a['href'],
+ 'url' : 'http://www.newsweek.pl' + a['href'],
'date' : '',
'description' : ''
}
diff --git a/resources/recipes/polityka.recipe b/resources/recipes/polityka.recipe
index ab31e148aa..16ccae6085 100644
--- a/resources/recipes/polityka.recipe
+++ b/resources/recipes/polityka.recipe
@@ -1,18 +1,18 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com'
from calibre.web.feeds.news import BasicNewsRecipe
class Polityka(BasicNewsRecipe):
title = u'Polityka'
- __author__ = 'Mateusz Kielar'
+ __author__ = 'matek09'
description = 'Weekly magazine. Last archive issue'
encoding = 'utf-8'
no_stylesheets = True
- language = 'en'
+ language = 'pl'
remove_javascript = True
remove_tags_before = dict(dict(name = 'h2', attrs = {'class' : 'box_nag'}))
@@ -48,7 +48,6 @@ class Polityka(BasicNewsRecipe):
for div in box.findAll('div', attrs={'class': 'list_tresc'}):
article_page = self.index_to_soup('http://archiwum.polityka.pl' + div.a['href'],)
section = self.tag_to_string(article_page.find('h2', attrs = {'class' : 'box_nag'})).split('/')[0].lstrip().rstrip()
- print section
if not articles.has_key(section):
articles[section] = []
articles[section].append( {
diff --git a/resources/recipes/wprost.recipe b/resources/recipes/wprost.recipe
new file mode 100644
index 0000000000..b317571981
--- /dev/null
+++ b/resources/recipes/wprost.recipe
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Wprost(BasicNewsRecipe):
+ EDITION = 0
+ FIND_LAST_FULL_ISSUE = True
+ EXCLUDE_LOCKED = True
+ ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif'
+
+ title = u'Wprost'
+ __author__ = 'matek09'
+ description = 'Weekly magazine'
+ encoding = 'ISO-8859-2'
+ no_stylesheets = True
+ language = 'pl'
+ remove_javascript = True
+
+ remove_tags_before = dict(name = 'div', attrs = {'id' : 'print-layer'})
+ remove_tags_after = dict(name = 'div', attrs = {'id' : 'print-layer'})
+
+ '''keep_only_tags =[]
+ keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))'''
+
+ preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
+ (re.compile(r'display: block;'), lambda match: '')]
+
+
+ remove_tags =[]
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'}))
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'}))
+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'}))
+
+
+ extra_css = '''
+ .div-header {font-size: x-small; font-weight: bold}
+ '''
+ #h2 {font-size: x-large; font-weight: bold}
+ def is_blocked(self, a):
+ return a.findNextSibling('img') is not None
+
+
+
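+ # Pick the newest issue from the archive page; with FIND_LAST_FULL_ISSUE
+ # the first contents link after the last padlock icon is taken, i.e. the
+ # newest issue that is no longer partially blocked.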
+ def find_last_issue(self):
+ soup = self.index_to_soup('http://www.wprost.pl/archiwum/')
+ if self.FIND_LAST_FULL_ISSUE:
+ ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED})
+ a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+ else:
+ a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+ self.EDITION = a['href'].replace('/tygodnik/?I=', '')
+ self.cover_url = a.img['src']
+
+
+
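+ # Every red section header block on the contents page becomes one feed;
+ # sections that end up with no readable articles are dropped.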
+ def parse_index(self):
+ self.find_last_issue()
+ soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION)
+ feeds = []
+ for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}):
+ articles = list(self.find_articles(main_block))
+ if len(articles) > 0:
+ section = self.tag_to_string(main_block)
+ feeds.append((section, articles))
+ return feeds
+
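+ # Collect links following the section block until the next table cell,
+ # optionally skipping entries marked with the blocked icon.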
+ def find_articles(self, main_block):
+ for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}):
+ if a.name == 'td':
+ break
+ if self.EXCLUDE_LOCKED and self.is_blocked(a):
+ continue
+ yield {
+ 'title' : self.tag_to_string(a),
+ 'url' : 'http://www.wprost.pl' + a['href'],
+ 'date' : '',
+ 'description' : ''
+ }