Update various Polish recipes

Kovid Goyal 2013-02-16 21:14:46 +05:30
parent 9556e47b51
commit 60b4f4fd01
8 changed files with 153 additions and 124 deletions

View File

@@ -11,7 +11,7 @@ class Adventure_zone(BasicNewsRecipe):
     max_articles_per_feed = 100
     cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png'
     index='http://www.adventure-zone.info/fusion/'
-    use_embedded_content=False
+    use_embedded_content = False
     preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: ''),
                           (re.compile(r'</?table.*?>'), lambda match: ''),
                           (re.compile(r'</?tbody.*?>'), lambda match: '')]
@@ -21,7 +21,7 @@ class Adventure_zone(BasicNewsRecipe):
     extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
     feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
-    def parse_feeds (self):
+    '''def parse_feeds (self):
         feeds = BasicNewsRecipe.parse_feeds(self)
         soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
         tag=soup.find(name='channel')
@@ -34,7 +34,7 @@ class Adventure_zone(BasicNewsRecipe):
         for feed in feeds:
             for article in feed.articles[:]:
                 article.title=titles[feed.articles.index(article)]
-        return feeds
+        return feeds'''
     '''def get_cover_url(self):
@@ -42,16 +42,25 @@ class Adventure_zone(BasicNewsRecipe):
         cover=soup.find(id='box_OstatninumerAZ')
         self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src']
         return getattr(self, 'cover_url', self.cover_url)'''
+    def populate_article_metadata(self, article, soup, first):
+        result = re.search('(.+) - Adventure Zone', soup.title.string)
+        if result:
+            article.title = result.group(1)
+        else:
+            result = soup.body.find('strong')
+            if result:
+                article.title = result.string
     def skip_ad_pages(self, soup):
         skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
         skip_tag = skip_tag.findAll(name='a')
-        for r in skip_tag:
-            if r.strong:
-                word=r.strong.string.lower()
-                if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
-                    return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
+        title = soup.title.string.lower()
+        if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)):
+            for r in skip_tag:
+                if r.strong and r.strong.string:
+                    word=r.strong.string.lower()
+                    if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
+                        return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
     def preprocess_html(self, soup):
         footer=soup.find(attrs={'class':'news-footer middle-border'})

View File

@@ -35,8 +35,8 @@ class Bash_org_pl(BasicNewsRecipe):
         soup=self.index_to_soup(u'http://bash.org.pl/random/')
         #date=soup.find('div', attrs={'class':'right'}).string
         url=soup.find('a', attrs={'class':'qid click'})
-        title=url.string
-        url='http://bash.org.pl' +url['href']
+        title=''
+        url='http://bash.org.pl/random/'
         articles.append({'title' : title,
                          'url' : url,
                          'date' : '',
@@ -44,6 +44,8 @@
                          })
         return articles
+    def populate_article_metadata(self, article, soup, first):
+        article.title = soup.find(attrs={'class':'qid click'}).string
     def parse_index(self):
         feeds = []
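Note: both recipes above now set the final article title at download time through calibre's populate_article_metadata(article, soup, first) hook, which BasicNewsRecipe calls for every downloaded article with its parsed soup. This is why Bash_org_pl can queue http://bash.org.pl/random/ with an empty title and still end up with the quote id as the title. A minimal sketch of the pattern follows; the feed URL and the h1 selector are placeholders, not taken from this commit:

    from calibre.web.feeds.news import BasicNewsRecipe

    class Example(BasicNewsRecipe):
        title = 'Example'
        feeds = [('News', 'http://example.com/feed')]  # placeholder feed

        def populate_article_metadata(self, article, soup, first):
            # Prefer the headline found in the downloaded page over the
            # (possibly empty or generic) title taken from the RSS entry.
            tag = soup.find('h1')  # placeholder selector
            if tag and tag.string:
                article.title = tag.string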

View File

@@ -15,7 +15,8 @@ class EkologiaPl(BasicNewsRecipe):
     no_stylesheets = True
     remove_empty_feeds = True
     use_embedded_content = False
-    remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj']})]
+    remove_attrs = ['style']
+    remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})]
     feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')]

View File

@@ -1,5 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-import re
 class Informacje_USA(BasicNewsRecipe):
     title = u'Informacje USA'
     oldest_article = 7
@@ -8,11 +7,10 @@ class Informacje_USA(BasicNewsRecipe):
     description = u'portal wiadomości amerykańskich'
     category = 'news'
     language = 'pl'
-    masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
-    cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
+    cover_url='http://www.informacjeusa.com/wp-content/uploads/2013/01/V3BANNER420-90new.jpg'
     no_stylesheets = True
-    preprocess_regexps = [(re.compile(ur'<p>Zobacz:.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><a href=".*?Zobacz także:.*?</a></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><p>Zobacz też:.*?</a></p>', re.DOTALL), lambda match: '')]
-    keep_only_tags=[dict(name='div', attrs={'class':'box box-single'})]
-    remove_tags_after= dict(attrs={'class':'tags'})
-    remove_tags= [dict(attrs={'class':['postmetadata', 'tags', 'banner']}), dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})]
+    use_embedded_content = False
+    keep_only_tags=[dict(id='post-area')]
+    remove_tags_after= dict(id='content-area')
+    remove_tags= [dict(attrs={'class':['breadcrumb']}), dict(id=['social-box', 'social-box-vert'])]
     feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')]

View File

@@ -1,5 +1,5 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class Mlody_technik(BasicNewsRecipe):
     title = u'Młody technik'
@@ -9,7 +9,19 @@ class Mlody_technik(BasicNewsRecipe):
     language = 'pl'
     cover_url='http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg'
     no_stylesheets = True
+    preprocess_regexps = [(re.compile(r"<h4>Podobne</h4>", re.IGNORECASE), lambda m: '')]
     oldest_article = 7
     max_articles_per_feed = 100
-    #keep_only_tags=[dict(id='container')]
-    feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')]
+    remove_empty_feeds = True
+    use_embedded_content = False
+    keep_only_tags = [dict(id='content')]
+    remove_tags = [dict(attrs={'class':'st-related-posts'})]
+    remove_tags_after = dict(attrs={'class':'entry-content clearfix'})
+    feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'),
+             (u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'),
+             (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'),
+             (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'),
+             (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'),
+             (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'),
+             (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'),
+             (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')]

View File

@@ -1,5 +1,4 @@
 #!/usr/bin/env python
 from calibre.web.feeds.recipes import BasicNewsRecipe
 class PCLab(BasicNewsRecipe):
@@ -8,12 +7,13 @@ class PCLab(BasicNewsRecipe):
     __author__ = 'ravcio - rlelusz[at]gmail.com'
     description = u"Articles from PC Lab website"
     language = 'pl'
-    oldest_article = 30.0
+    oldest_article = 30
     max_articles_per_feed = 100
     recursions = 0
     encoding = 'iso-8859-2'
     no_stylesheets = True
     remove_javascript = True
+    remove_empty_feeds = True
     use_embedded_content = False
     keep_only_tags = [
@@ -21,50 +21,54 @@ class PCLab(BasicNewsRecipe):
     ]
     remove_tags = [
-        dict(name='div', attrs={'class':['chapters']})
-        ,dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']})
+        dict(name='div', attrs={'class':['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']})
     ]
+    remove_tags_after = [
+        dict(name='div', attrs={'class':['navigation']})
+    ]
     #links to RSS feeds
-    feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ]
+    feeds = [
+        (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'),
+        (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'),
+        (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml')
+    ]
     #load second and subsequent page content
     # in: soup - full page with 'next' button
     # out: appendtag - tag to which new page is to be added
     def append_page(self, soup, appendtag):
         # find the 'Next' button
-        pager = soup.find('div', attrs={'class':'next'})
+        pager = soup.find('div', attrs={'class':'navigation'})
         if pager:
+            a = pager.find('a')
+            if 'news' in a['href']:
+                pager = None
+            else:
+                pager = pager.find('div', attrs={'class':'next'})
+        while pager:
             #search for 'a' element with link to next page (exit if not found)
             a = pager.find('a')
-            if a:
-                nexturl = a['href']
-                soup2 = self.index_to_soup('http://pclab.pl/' + nexturl)
-                pagetext_substance = soup2.find('div', attrs={'class':'substance'})
-                pagetext = pagetext_substance.find('div', attrs={'class':'data'})
-                pagetext.extract()
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-                pos = len(appendtag.contents)
-                self.append_page(soup2, appendtag)
+            nexturl = a['href']
+            soup2 = self.index_to_soup('http://pclab.pl' + nexturl)
+            pager = soup2.find('div', attrs={'class':'next'})
+            pagetext = soup2.find('div', attrs={'class':'substance'})
+            pagetext = pagetext.find('div', attrs={'class':'data'})
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+            pos = len(appendtag.contents)
+        pager = soup.find('div', attrs={'class':'navigation'})
+        if pager:
+            pager.extract()
     def preprocess_html(self, soup):
         # soup.body contains no title and no navigator, they are in soup
         self.append_page(soup, soup.body)
+        for link in soup.findAll('a'):
+            href = link.get('href', None)
+            if href and href.startswith('/'):
+                link['href'] = 'http://pclab.pl' + href
         # finally remove some tags
-        tags = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']})
-        [tag.extract() for tag in tags]
+        #for r in soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']})
         return soup
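Note: the reworked append_page() above stitches PCLab's multi-page articles together with a while loop instead of recursion: it inspects the 'navigation'/'next' blocks, skips pagination for news items, fetches each following page with index_to_soup(), and appends that page's 'substance'/'data' div to the article body; preprocess_html() then rewrites relative links to absolute ones so they survive conversion. A condensed sketch of the same loop, as a method on the recipe class, with the selectors the diff relies on and error handling omitted:

    def append_page(self, soup, appendtag):
        # Follow the 'next' link until the last page of the article is reached.
        pager = soup.find('div', attrs={'class': 'next'})
        while pager:
            a = pager.find('a')
            if a is None:
                break
            soup2 = self.index_to_soup('http://pclab.pl' + a['href'])
            pager = soup2.find('div', attrs={'class': 'next'})
            content = soup2.find('div', attrs={'class': 'substance'})
            content = content.find('div', attrs={'class': 'data'})
            # Append the next page's content block to the end of the article body.
            appendtag.insert(len(appendtag.contents), content)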

View File

@@ -5,11 +5,14 @@ class SpidersWeb(BasicNewsRecipe):
     oldest_article = 7
     __author__ = 'fenuks'
     description = u''
-    cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg'
+    cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png'
     category = 'IT, WEB'
     language = 'pl'
     no_stylesheers=True
+    remove_javascript = True
+    use_embedded_content = False
     max_articles_per_feed = 100
-    keep_only_tags=[dict(id='Post')]
-    remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']}), dict(id='Author-Column')]
+    keep_only_tags=[dict(id='start')]
+    remove_tags_after = dict(attrs={'class':'padding20'})
+    remove_tags=[dict(name='div', attrs={'class':['padding border-bottom', 'padding20', 'padding border-top']})]
     feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')]

View File

@@ -10,89 +10,89 @@ from calibre.web.feeds.news import BasicNewsRecipe
 import re
 class Wprost(BasicNewsRecipe):
     EDITION = 0
     FIND_LAST_FULL_ISSUE = True
     EXCLUDE_LOCKED = True
     ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png'
     title = u'Wprost'
     __author__ = 'matek09'
     description = 'Weekly magazine'
     encoding = 'ISO-8859-2'
     no_stylesheets = True
     language = 'pl'
     remove_javascript = True
     recursions = 0
     remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
     remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
-    '''keep_only_tags =[]
+    '''
+    keep_only_tags =[]
     keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
     keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
     keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))'''
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))
+    '''
     preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
         (re.compile(r'display: block;'), lambda match: ''),
         (re.compile(r'\<td\>\<tr\>\<\/table\>'), lambda match: ''),
         (re.compile(r'\<table .*?\>'), lambda match: ''),
         (re.compile(r'\<tr>'), lambda match: ''),
         (re.compile(r'\<td .*?\>'), lambda match: ''),
         (re.compile(r'\<div id="footer"\>.*?\</footer\>'), lambda match: '')]
     remove_tags =[]
     remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'}))
     remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'}))
     remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'}))
-    extra_css = '''
-    .div-header {font-size: x-small; font-weight: bold}
-    '''
+    extra_css = '''.div-header {font-size: x-small; font-weight: bold}'''
     #h2 {font-size: x-large; font-weight: bold}
     def is_blocked(self, a):
         if a.findNextSibling('img') is None:
             return False
         else:
             return True
     def find_last_issue(self):
         soup = self.index_to_soup('http://www.wprost.pl/archiwum/')
         a = 0
         if self.FIND_LAST_FULL_ISSUE:
             ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED})
             a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile(r'Spis *', re.IGNORECASE | re.DOTALL)})
         else:
             a = soup.find('a', attrs={'title' : re.compile(r'Spis *', re.IGNORECASE | re.DOTALL)})
         self.EDITION = a['href'].replace('/tygodnik/?I=', '')
         self.EDITION_SHORT = a['href'].replace('/tygodnik/?I=15', '')
         self.cover_url = a.img['src']
     def parse_index(self):
         self.find_last_issue()
         soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION)
         feeds = []
-        for main_block in soup.findAll(attrs={'id': 'content-main-column-element-content'}):
-            articles = list(self.find_articles(main_block))
+        headers = soup.findAll(attrs={'class':'block-header block-header-left mtop20 mbottom20'})
+        articles_list = soup.findAll(attrs={'class':'standard-box'})
+        for i in range(len(headers)):
+            articles = self.find_articles(articles_list[i])
             if len(articles) > 0:
-                section = self.tag_to_string(main_block.find('h3'))
+                section = headers[i].find('a').string
                 feeds.append((section, articles))
         return feeds
     def find_articles(self, main_block):
+        articles = []
         for a in main_block.findAll('a'):
             if a.name in "td":
                 break
-            if self.EXCLUDE_LOCKED & self.is_blocked(a):
+            if self.EXCLUDE_LOCKED and self.is_blocked(a):
                 continue
-            yield {
+            articles.append({
                 'title' : self.tag_to_string(a),
                 'url' : 'http://www.wprost.pl' + a['href'],
                 'date' : '',
                 'description' : ''
-            }
+                })
+        return articles
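Note: a quick way to sanity-check any of the recipes touched in this commit is calibre's recipe test mode (assuming a local calibre install with ebook-convert on the PATH; the file name below is a placeholder for whichever recipe is being edited):

    ebook-convert my_recipe.recipe out.epub --test -vv

--test limits the download to a couple of articles per feed and -vv prints the recipe's log, so broken selectors such as renamed CSS classes show up immediately.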