diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe
index 2224937f3c..b02460695e 100644
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@@ -11,7 +11,7 @@ class Adventure_zone(BasicNewsRecipe):
max_articles_per_feed = 100
cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png'
index='http://www.adventure-zone.info/fusion/'
- use_embedded_content=False
+ use_embedded_content = False
preprocess_regexps = [(re.compile(r"Komentarze | ", re.IGNORECASE), lambda m: ''),
(re.compile(r'</?table.*?>'), lambda match: ''),
(re.compile(r'</?tbody.*?>'), lambda match: '')]
@@ -21,7 +21,7 @@ class Adventure_zone(BasicNewsRecipe):
extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
- def parse_feeds (self):
+ '''def parse_feeds (self):
feeds = BasicNewsRecipe.parse_feeds(self)
soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
tag=soup.find(name='channel')
@@ -34,7 +34,7 @@ class Adventure_zone(BasicNewsRecipe):
for feed in feeds:
for article in feed.articles[:]:
article.title=titles[feed.articles.index(article)]
- return feeds
+ return feeds'''
'''def get_cover_url(self):
@@ -42,16 +42,25 @@ class Adventure_zone(BasicNewsRecipe):
cover=soup.find(id='box_OstatninumerAZ')
self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src']
return getattr(self, 'cover_url', self.cover_url)'''
-
+ def populate_article_metadata(self, article, soup, first):
+ result = re.search('(.+) - Adventure Zone', soup.title.string)
+ if result:
+ article.title = result.group(1)
+ else:
+ result = soup.body.find('strong')
+ if result:
+ article.title = result.string
def skip_ad_pages(self, soup):
skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
skip_tag = skip_tag.findAll(name='a')
- for r in skip_tag:
- if r.strong:
- word=r.strong.string.lower()
- if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
- return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
+ title = soup.title.string.lower()
+ if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)):
+ for r in skip_tag:
+ if r.strong and r.strong.string:
+ word=r.strong.string.lower()
+ if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
+ return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
def preprocess_html(self, soup):
footer=soup.find(attrs={'class':'news-footer middle-border'})
diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe
index 4ed59614e7..a04f267ca3 100644
--- a/recipes/bash_org_pl.recipe
+++ b/recipes/bash_org_pl.recipe
@@ -35,8 +35,8 @@ class Bash_org_pl(BasicNewsRecipe):
soup=self.index_to_soup(u'http://bash.org.pl/random/')
#date=soup.find('div', attrs={'class':'right'}).string
url=soup.find('a', attrs={'class':'qid click'})
- title=url.string
- url='http://bash.org.pl' +url['href']
+ title=''
+ url='http://bash.org.pl/random/'
articles.append({'title' : title,
'url' : url,
'date' : '',
@@ -44,6 +44,8 @@ class Bash_org_pl(BasicNewsRecipe):
})
return articles
+ def populate_article_metadata(self, article, soup, first):
+ article.title = soup.find(attrs={'class':'qid click'}).string
def parse_index(self):
feeds = []
diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe
index 2b0933b58d..21d3b607d2 100644
--- a/recipes/ekologia_pl.recipe
+++ b/recipes/ekologia_pl.recipe
@@ -15,7 +15,8 @@ class EkologiaPl(BasicNewsRecipe):
no_stylesheets = True
remove_empty_feeds = True
use_embedded_content = False
- remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj']})]
+ remove_attrs = ['style']
+ remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})]
feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')]
diff --git a/recipes/informacje_usa.recipe b/recipes/informacje_usa.recipe
index ac31134103..692dcdc07e 100644
--- a/recipes/informacje_usa.recipe
+++ b/recipes/informacje_usa.recipe
@@ -1,5 +1,4 @@
from calibre.web.feeds.news import BasicNewsRecipe
-import re
class Informacje_USA(BasicNewsRecipe):
title = u'Informacje USA'
oldest_article = 7
@@ -8,11 +7,10 @@ class Informacje_USA(BasicNewsRecipe):
description = u'portal wiadomości amerykańskich'
category = 'news'
language = 'pl'
- masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
- cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
+ cover_url='http://www.informacjeusa.com/wp-content/uploads/2013/01/V3BANNER420-90new.jpg'
no_stylesheets = True
-    preprocess_regexps = [(re.compile(ur'Zobacz:.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'Podobne', re.IGNORECASE), lambda m: '')]
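diff --git a/recipes/mlody_technik_pl.recipe b/recipes/mlody_technik_pl.recipe
--- a/recipes/mlody_technik_pl.recipe
+++ b/recipes/mlody_technik_pl.recipe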
oldest_article = 7
max_articles_per_feed = 100
- #keep_only_tags=[dict(id='container')]
- feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')]
+ remove_empty_feeds = True
+ use_embedded_content = False
+ keep_only_tags = [dict(id='content')]
+ remove_tags = [dict(attrs={'class':'st-related-posts'})]
+ remove_tags_after = dict(attrs={'class':'entry-content clearfix'})
+ feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'),
+ (u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'),
+ (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'),
+ (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'),
+ (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'),
+ (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'),
+ (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'),
+ (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')]
diff --git a/recipes/pc_lab.recipe b/recipes/pc_lab.recipe
index c4b33b8416..7a6038bd65 100644
--- a/recipes/pc_lab.recipe
+++ b/recipes/pc_lab.recipe
@@ -1,5 +1,4 @@
#!/usr/bin/env python
-
from calibre.web.feeds.recipes import BasicNewsRecipe
class PCLab(BasicNewsRecipe):
@@ -8,12 +7,13 @@ class PCLab(BasicNewsRecipe):
__author__ = 'ravcio - rlelusz[at]gmail.com'
description = u"Articles from PC Lab website"
language = 'pl'
- oldest_article = 30.0
+ oldest_article = 30
max_articles_per_feed = 100
recursions = 0
encoding = 'iso-8859-2'
no_stylesheets = True
remove_javascript = True
+ remove_empty_feeds = True
use_embedded_content = False
keep_only_tags = [
@@ -21,50 +21,54 @@ class PCLab(BasicNewsRecipe):
]
remove_tags = [
- dict(name='div', attrs={'class':['chapters']})
- ,dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']})
+ dict(name='div', attrs={'class':['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']})
]
- remove_tags_after = [
- dict(name='div', attrs={'class':['navigation']})
- ]
-
#links to RSS feeds
- feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ]
+ feeds = [
+ (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'),
+ (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'),
+ (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml')
+ ]
#load second and subsequent page content
# in: soup - full page with 'next' button
# out: appendtag - tag to which new page is to be added
def append_page(self, soup, appendtag):
# find the 'Next' button
- pager = soup.find('div', attrs={'class':'next'})
-
+ pager = soup.find('div', attrs={'class':'navigation'})
if pager:
+ a = pager.find('a')
+ if 'news' in a['href']:
+ pager = None
+ else:
+ pager = pager.find('div', attrs={'class':'next'})
+
+ while pager:
#search for 'a' element with link to next page (exit if not found)
a = pager.find('a')
-            if a:
-                nexturl = a['href']
-                soup2 = self.index_to_soup('http://pclab.pl/' + nexturl)
-
-                pagetext_substance = soup2.find('div', attrs={'class':'substance'})
-                pagetext = pagetext_substance.find('div', attrs={'class':'data'})
-                pagetext.extract()
-
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-                pos = len(appendtag.contents)
-
-                self.append_page(soup2, appendtag)
+            nexturl = a['href']
+            soup2 = self.index_to_soup('http://pclab.pl' + nexturl)
+            pager = soup2.find('div', attrs={'class':'next'})
+            pagetext = soup2.find('div', attrs={'class':'substance'})
+            pagetext = pagetext.find('div', attrs={'class':'data'})
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+            pos = len(appendtag.contents)
+        pager = soup.find('div', attrs={'class':'navigation'})
+        if pager:
+            pager.extract()
def preprocess_html(self, soup):
-
# soup.body contains no title and no navigator, they are in soup
self.append_page(soup, soup.body)
-
+ for link in soup.findAll('a'):
+ href = link.get('href', None)
+ if href and href.startswith('/'):
+ link['href'] = 'http://pclab.pl' + href
# finally remove some tags
- tags = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']})
- [tag.extract() for tag in tags]
+ #for r in soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']})
return soup
diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe
index 678ee5c640..b593d6b837 100644
--- a/recipes/spiders_web_pl.recipe
+++ b/recipes/spiders_web_pl.recipe
@@ -5,11 +5,14 @@ class SpidersWeb(BasicNewsRecipe):
oldest_article = 7
__author__ = 'fenuks'
description = u''
- cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg'
+ cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png'
category = 'IT, WEB'
language = 'pl'
no_stylesheets = True
+ remove_javascript = True
+ use_embedded_content = False
max_articles_per_feed = 100
- keep_only_tags=[dict(id='Post')]
- remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']}), dict(id='Author-Column')]
+ keep_only_tags=[dict(id='start')]
+ remove_tags_after = dict(attrs={'class':'padding20'})
+ remove_tags=[dict(name='div', attrs={'class':['padding border-bottom', 'padding20', 'padding border-top']})]
feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')]
diff --git a/recipes/wprost.recipe b/recipes/wprost.recipe
index 2adac1e113..90dde251ca 100644
--- a/recipes/wprost.recipe
+++ b/recipes/wprost.recipe
@@ -10,89 +10,89 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re
class Wprost(BasicNewsRecipe):
- EDITION = 0
- FIND_LAST_FULL_ISSUE = True
- EXCLUDE_LOCKED = True
- ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png'
+ EDITION = 0
+ FIND_LAST_FULL_ISSUE = True
+ EXCLUDE_LOCKED = True
+ ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png'
+ title = u'Wprost'
+ __author__ = 'matek09'
+ description = 'Weekly magazine'
+ encoding = 'ISO-8859-2'
+ no_stylesheets = True
+ language = 'pl'
+ remove_javascript = True
+ recursions = 0
+ remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+ remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+ '''
+ keep_only_tags =[]
+ keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))
+ '''
- title = u'Wprost'
- __author__ = 'matek09'
- description = 'Weekly magazine'
- encoding = 'ISO-8859-2'
- no_stylesheets = True
- language = 'pl'
- remove_javascript = True
- recursions = 0
-
- remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
- remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
-
- '''keep_only_tags =[]
- keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
- keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
- keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
- keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))'''
-
- preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
+ preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
(re.compile(r'display: block;'), lambda match: ''),
(re.compile(r'\<td\>\<tr\> | \<\/table\>'), lambda match: ''),
(re.compile(r'\<table .*?\>'), lambda match: ''),
(re.compile(r'\<tr\>'), lambda match: ''),
(re.compile(r'\<td .*?\>'), lambda match: ''),
- (re.compile(r'\ |