diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe
index 50a980dc92..43342a9b28 100644
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@@ -6,42 +6,20 @@ class Adventure_zone(BasicNewsRecipe):
description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.'
category = 'games'
language = 'pl'
+ BASEURL = 'http://www.adventure-zone.info/fusion/'
no_stylesheets = True
+ extra_css = '.image {float: left; margin-right: 5px;}'
oldest_article = 20
max_articles_per_feed = 100
cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png'
- index = 'http://www.adventure-zone.info/fusion/'
+ remove_attributes = ['style']
use_embedded_content = False
- preprocess_regexps = [(re.compile(r"
Komentarze | ", re.IGNORECASE), lambda m: ''),
-                          (re.compile(r'</?table.*?>'), lambda match: ''),
-                          (re.compile(r'</?tbody.*?>'), lambda match: '')]
- remove_tags_before = dict(name='td', attrs={'class':'main-bg'})
- remove_tags = [dict(name='img', attrs={'alt':'Drukuj'})]
- remove_tags_after = dict(id='comments')
- extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; } img.news-category {float: left; margin-right: 5px;}'
- feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
-
- '''def get_cover_url(self):
- soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
- cover=soup.find(id='box_OstatninumerAZ')
- self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src']
- return getattr(self, 'cover_url', self.cover_url)'''
-
- def populate_article_metadata(self, article, soup, first):
- result = re.search('(.+) - Adventure Zone', soup.title.string)
- if result:
- result = result.group(1)
- else:
- result = soup.body.find('strong')
- if result:
- result = result.string
- if result:
- result = result.replace('&', '&')
- result = result.replace(''', '’')
- article.title = result
+ keep_only_tags = [dict(attrs={'class':'content'})]
+ remove_tags = [dict(attrs={'class':'footer'})]
+ feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')]
def skip_ad_pages(self, soup):
- skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
+ skip_tag = soup.body.find(attrs={'class':'content'})
skip_tag = skip_tag.findAll(name='a')
title = soup.title.string.lower()
if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)):
@@ -49,20 +27,10 @@ class Adventure_zone(BasicNewsRecipe):
if r.strong and r.strong.string:
word=r.strong.string.lower()
if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
- return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
+ return self.index_to_soup(self.BASEURL+r['href'], raw=True)
def preprocess_html(self, soup):
- footer=soup.find(attrs={'class':'news-footer middle-border'})
- r = soup.find(name='td', attrs={'class':'capmain'})
- if r:
- r.name='h1'
- for item in soup.findAll(name=['tr', 'td']):
- item.name='div'
- if footer and len(footer('a'))>=2:
- footer('a')[1].extract()
- for item in soup.findAll(style=True):
- del item['style']
- for a in soup('a'):
- if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
- a['href']=self.index + a['href']
- return soup
+ for link in soup.findAll('a', href=True):
+ if not link['href'].startswith('http'):
+ link['href'] = self.BASEURL + link['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/astroflesz.recipe b/recipes/astroflesz.recipe
index 676aedfd3a..902f99c2c8 100644
--- a/recipes/astroflesz.recipe
+++ b/recipes/astroflesz.recipe
@@ -13,6 +13,7 @@ class Astroflesz(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
+ remove_empty_feeds = True
remove_attributes = ['style']
keep_only_tags = [dict(id="k2Container")]
remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
diff --git a/recipes/gosc_niedzielny.recipe b/recipes/gosc_niedzielny.recipe
index 11beb076f5..65e6e1704c 100644
--- a/recipes/gosc_niedzielny.recipe
+++ b/recipes/gosc_niedzielny.recipe
@@ -6,12 +6,10 @@ __copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \
2013, Tomasz Długosz, tomek3d@gmail.com'
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ptempfile import PersistentTemporaryFile
-from datetime import date
import re
+from lxml import html
class GN(BasicNewsRecipe):
- EDITION = 0
__author__ = 'Piotr Kontek, Tomasz Długosz'
title = u'Gość Niedzielny'
@@ -20,83 +18,23 @@ class GN(BasicNewsRecipe):
no_stylesheets = True
language = 'pl'
remove_javascript = True
- temp_files = []
- articles_are_obfuscated = True
+ def find_last_issue(self):
+ raw = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True)
+ doc = html.fromstring(raw)
+ page = doc.xpath('//div[@class="c"]//div[@class="search-result"]/div[1]/div[2]/h1//a/@href')
- def get_obfuscated_article(self, url):
- br = self.get_browser()
- br.open(url)
- source = br.response().read()
- page = self.index_to_soup(source)
-
- main_section = page.find('div',attrs={'class':'txt doc_prnt_prv'})
-
- title = main_section.find('h2')
- info = main_section.find('div', attrs={'class' : 'cf doc_info'})
- authors = info.find(attrs={'class':'l'})
- article = str(main_section.find('p', attrs={'class' : 'doc_lead'}))
- first = True
- for p in main_section.findAll('p', attrs={'class':None}, recursive=False):
- if first and p.find('img') != None:
-                article += '<p>'
- article += str(p.find('img')).replace('src="/files/','src="http://www.gosc.pl/files/')
-                article += '</p>'
- for s in p.findAll('span'):
- article += self.tag_to_string(s)
-                article += '<br/>'
- else:
- article += str(p).replace('src="/files/','src="http://www.gosc.pl/files/')
- first = False
- limiter = main_section.find('p', attrs={'class' : 'limiter'})
- if limiter:
- article += str(limiter)
-
- html = unicode(title)
- #sometimes authors are not filled in:
- if authors:
- html += unicode(authors) + unicode(article)
- else:
- html += unicode(article)
-
- self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
- self.temp_files[-1].write(html)
- self.temp_files[-1].close()
- return self.temp_files[-1].name
-
- def find_last_issue(self, year):
- soup = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/rok/' + str(year))
-
- #szukam zdjęcia i linka do poprzedniego pełnego numeru
- first = True
- for d in soup.findAll('div', attrs={'class':'l release_preview_l'}):
- img = d.find('img')
- if img != None:
- a = img.parent
- self.EDITION = a['href']
- #this was preventing kindles from moving old issues to 'Back Issues' category:
- #self.title = img['alt']
- self.cover_url = 'http://www.gosc.pl' + img['src']
- if year != date.today().year or not first:
- break
- first = False
+ return page[1]
def parse_index(self):
- year = date.today().year
- self.find_last_issue(year)
- ##jeśli to pierwszy numer w roku trzeba pobrać poprzedni rok
- if self.EDITION == 0:
- self.find_last_issue(year-1)
- soup = self.index_to_soup('http://www.gosc.pl' + self.EDITION)
+ soup = self.index_to_soup('http://gosc.pl' + self.find_last_issue())
feeds = []
#wstepniak
a = soup.find('div',attrs={'class':'release-wp-b'}).find('a')
articles = [
{'title' : self.tag_to_string(a),
- 'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/'),
- 'date' : '',
- 'description' : ''}
- ]
+ 'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/')
+ }]
feeds.append((u'Wstępniak',articles))
#kategorie
for addr in soup.findAll('a',attrs={'href':re.compile('kategoria')}):
@@ -113,16 +51,46 @@ class GN(BasicNewsRecipe):
art = a.find('a')
yield {
'title' : self.tag_to_string(art),
- 'url' : 'http://www.gosc.pl' + art['href'].replace('/doc/','/doc_pr/'),
- 'date' : '',
- 'description' : ''
+ 'url' : 'http://www.gosc.pl' + art['href']
}
for a in main_block.findAll('div', attrs={'class':'sr-document'}):
art = a.find('a')
yield {
'title' : self.tag_to_string(art),
- 'url' : 'http://www.gosc.pl' + art['href'].replace('/doc/','/doc_pr/'),
- 'date' : '',
- 'description' : ''
+ 'url' : 'http://www.gosc.pl' + art['href']
}
+ def append_page(self, soup, appendtag):
+ chpage= appendtag.find(attrs={'class':'pgr_nrs'})
+ if chpage:
+ for page in chpage.findAll('a'):
+ soup2 = self.index_to_soup('http://gosc.pl' + page['href'])
+ pagetext = soup2.find(attrs={'class':'intextAd'})
+ pos = len(appendtag.contents)
+ appendtag.insert(pos, pagetext)
+
+ def preprocess_html(self, soup):
+ self.append_page(soup, soup.body)
+ '''
+ for image_div in soup.findAll(attrs={'class':'doc_image'}):
+ link =
+ if 'm.jpg' in image['src']:
+ image['src'] = image['src'].replace('m.jpg', '.jpg')
+ '''
+ return soup
+
+ keep_only_tags = [
+ dict(name='div', attrs={'class':'cf txt'})
+ ]
+
+ remove_tags = [
+ dict(name='p', attrs={'class':['r tr', 'l l-2', 'wykop']}),
+ dict(name='div', attrs={'class':['doc_actions', 'pgr', 'fr1_cl']}),
+ dict(name='div', attrs={'id':'vote'})
+ ]
+
+ extra_css = '''
+ h1 {font-size:150%}
+ div#doc_image {font-style:italic; font-size:70%}
+ p.limiter {font-size:150%; font-weight: bold}
+ '''
diff --git a/recipes/histmag.recipe b/recipes/histmag.recipe
index 0009580e49..9e6ca111a7 100644
--- a/recipes/histmag.recipe
+++ b/recipes/histmag.recipe
@@ -13,11 +13,12 @@ class Histmag(BasicNewsRecipe):
__author__ = 'matek09'
description = u"Artykuly historyczne i publicystyczne"
encoding = 'utf-8'
+ extra_css = '''.center img {display: block;}'''
     #preprocess_regexps = [(re.compile(r''), lambda match: '<br/>'), (re.compile(r''), lambda match: '<br/>')]
no_stylesheets = True
language = 'pl'
remove_javascript = True
keep_only_tags=[dict(id='article')]
- remove_tags=[dict(name = 'p', attrs = {'class' : 'article-tags'})]
+ remove_tags=[dict(name = 'p', attrs = {'class' : 'article-tags'}), dict(attrs={'class':'twitter-share-button'})]
feeds = [(u'Wszystkie', u'http://histmag.org/rss/wszystkie.xml'), (u'Wydarzenia', u'http://histmag.org/rss/wydarzenia.xml'), (u'Recenzje', u'http://histmag.org/rss/recenzje.xml'), (u'Artykuły historyczne', u'http://histmag.org/rss/historia.xml'), (u'Publicystyka', u'http://histmag.org/rss/publicystyka.xml')]
diff --git a/recipes/icons/geopolityka.png b/recipes/icons/geopolityka.png
new file mode 100644
index 0000000000..7bef643edb
Binary files /dev/null and b/recipes/icons/geopolityka.png differ
diff --git a/recipes/icons/gs24_pl.png b/recipes/icons/gs24_pl.png
new file mode 100644
index 0000000000..fc2a4950cb
Binary files /dev/null and b/recipes/icons/gs24_pl.png differ
diff --git a/recipes/icons/homopedia_pl.png b/recipes/icons/homopedia_pl.png
new file mode 100644
index 0000000000..2de09f2730
Binary files /dev/null and b/recipes/icons/homopedia_pl.png differ
diff --git a/recipes/icons/pc_lab.png b/recipes/icons/pc_lab.png
new file mode 100644
index 0000000000..03971fc57a
Binary files /dev/null and b/recipes/icons/pc_lab.png differ
diff --git a/recipes/icons/polityka.png b/recipes/icons/polityka.png
new file mode 100644
index 0000000000..482408dc7b
Binary files /dev/null and b/recipes/icons/polityka.png differ
diff --git a/recipes/icons/rynek_zdrowia.png b/recipes/icons/rynek_zdrowia.png
new file mode 100644
index 0000000000..76fcf3fb98
Binary files /dev/null and b/recipes/icons/rynek_zdrowia.png differ
diff --git a/recipes/osnews_pl.recipe b/recipes/osnews_pl.recipe
index 455f005a7e..7251f31827 100644
--- a/recipes/osnews_pl.recipe
+++ b/recipes/osnews_pl.recipe
@@ -20,7 +20,7 @@ class OSNewsRecipe(BasicNewsRecipe):
remove_javascript = True
encoding = 'utf-8'
use_embedded_content = False;
-
+ remove_empty_feeds = True
oldest_article = 7
max_articles_per_feed = 100
cover_url='http://osnews.pl/wp-content/themes/osnews/img/logo.png'
@@ -31,22 +31,18 @@ class OSNewsRecipe(BasicNewsRecipe):
'''
feeds = [
- (u'OSNews.pl', u'http://feeds.feedburner.com/OSnewspl')
+ (u'Niusy', u'http://feeds.feedburner.com/OSnewspl'),
+ (u'Wylęgarnia', u'http://feeds.feedburner.com/osnewspl_nowe')
]
keep_only_tags = [
- dict(name = 'a', attrs = {'class' : 'news-heading'}),
- dict(name = 'div', attrs = {'class' : 'newsinformations'}),
- dict(name = 'div', attrs = {'id' : 'news-content'})
+ dict(name = 'div', attrs = {'id' : 'content'})
]
remove_tags = [
- dict(name = 'div', attrs = {'class' : 'sociable'}),
- dict(name = 'div', attrs = {'class' : 'post_prev'}),
- dict(name = 'div', attrs = {'class' : 'post_next'}),
- dict(name = 'div', attrs = {'class' : 'clr'}),
- dict(name = 'div', attrs = {'class' : 'tw_button'}),
- dict(name = 'div', attrs = {'style' : 'width:56px;height:60px;float:left;margin-right:10px'})
+ dict(name = 'div', attrs = {'class' : ['newstags', 'tw_button', 'post_prev']}),
+ dict(name = 'div', attrs = {'id' : 'newspage_upinfo'}),
]
- preprocess_regexps = [(re.compile(u'Komentarze: \(?[0-9]+\)? ?Komentarze: \(?[0-9]+\)? ?'), lambda match: '')]