From 60b4f4fd0148370dabd113419a134ea887bc59fc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 16 Feb 2013 21:14:46 +0530 Subject: [PATCH] Update various Polish recipes --- recipes/adventure_zone_pl.recipe | 27 ++++-- recipes/bash_org_pl.recipe | 6 +- recipes/ekologia_pl.recipe | 3 +- recipes/informacje_usa.recipe | 12 ++- recipes/mlody_technik_pl.recipe | 18 +++- recipes/pc_lab.recipe | 60 +++++++------ recipes/spiders_web_pl.recipe | 9 +- recipes/wprost.recipe | 142 +++++++++++++++---------------- 8 files changed, 153 insertions(+), 124 deletions(-) diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 2224937f3c..b02460695e 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -11,7 +11,7 @@ class Adventure_zone(BasicNewsRecipe): max_articles_per_feed = 100 cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png' index='http://www.adventure-zone.info/fusion/' - use_embedded_content=False + use_embedded_content = False preprocess_regexps = [(re.compile(r"Komentarze", re.IGNORECASE), lambda m: ''), (re.compile(r''), lambda match: ''), (re.compile(r''), lambda match: '')] @@ -21,7 +21,7 @@ class Adventure_zone(BasicNewsRecipe): extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }' feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')] - def parse_feeds (self): + '''def parse_feeds (self): feeds = BasicNewsRecipe.parse_feeds(self) soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php') tag=soup.find(name='channel') @@ -34,7 +34,7 @@ class Adventure_zone(BasicNewsRecipe): for feed in feeds: for article in feed.articles[:]: article.title=titles[feed.articles.index(article)] - return feeds + return feeds''' '''def get_cover_url(self): @@ -42,16 +42,25 @@ class Adventure_zone(BasicNewsRecipe): cover=soup.find(id='box_OstatninumerAZ') self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src'] return getattr(self, 'cover_url', self.cover_url)''' - + def populate_article_metadata(self, article, soup, first): + result = re.search('(.+) - Adventure Zone', soup.title.string) + if result: + article.title = result.group(1) + else: + result = soup.body.find('strong') + if result: + article.title = result.string def skip_ad_pages(self, soup): skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'}) skip_tag = skip_tag.findAll(name='a') - for r in skip_tag: - if r.strong: - word=r.strong.string.lower() - if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): - return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) + title = soup.title.string.lower() + if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): + for r in skip_tag: + if r.strong and r.strong.string: + word=r.strong.string.lower() + if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): + return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) def preprocess_html(self, soup): footer=soup.find(attrs={'class':'news-footer middle-border'}) diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe index 4ed59614e7..a04f267ca3 100644 --- a/recipes/bash_org_pl.recipe +++ b/recipes/bash_org_pl.recipe @@ -35,8 +35,8 @@ class Bash_org_pl(BasicNewsRecipe): soup=self.index_to_soup(u'http://bash.org.pl/random/') #date=soup.find('div', attrs={'class':'right'}).string url=soup.find('a', attrs={'class':'qid click'}) - title=url.string - url='http://bash.org.pl' +url['href'] + title='' + url='http://bash.org.pl/random/' articles.append({'title' : title, 'url' : url, 'date' : '', @@ -44,6 +44,8 @@ class Bash_org_pl(BasicNewsRecipe): }) return articles + def populate_article_metadata(self, article, soup, first): + article.title = soup.find(attrs={'class':'qid click'}).string def parse_index(self): feeds = [] diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe index 2b0933b58d..21d3b607d2 100644 --- a/recipes/ekologia_pl.recipe +++ b/recipes/ekologia_pl.recipe @@ -15,7 +15,8 @@ class EkologiaPl(BasicNewsRecipe): no_stylesheets = True remove_empty_feeds = True use_embedded_content = False - remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj']})] + remove_attrs = ['style'] + remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})] feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')] diff --git a/recipes/informacje_usa.recipe b/recipes/informacje_usa.recipe index ac31134103..692dcdc07e 100644 --- a/recipes/informacje_usa.recipe +++ b/recipes/informacje_usa.recipe @@ -1,5 +1,4 @@ from calibre.web.feeds.news import BasicNewsRecipe -import re class Informacje_USA(BasicNewsRecipe): title = u'Informacje USA' oldest_article = 7 @@ -8,11 +7,10 @@ class Informacje_USA(BasicNewsRecipe): description = u'portal wiadomości amerykańskich' category = 'news' language = 'pl' - masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' - cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' + cover_url='http://www.informacjeusa.com/wp-content/uploads/2013/01/V3BANNER420-90new.jpg' no_stylesheets = True - preprocess_regexps = [(re.compile(ur'

Zobacz:.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

Podobne", re.IGNORECASE), lambda m: '')] oldest_article = 7 max_articles_per_feed = 100 - #keep_only_tags=[dict(id='container')] - feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')] + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id='content')] + remove_tags = [dict(attrs={'class':'st-related-posts'})] + remove_tags_after = dict(attrs={'class':'entry-content clearfix'}) + feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'), + (u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'), + (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'), + (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'), + (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'), + (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'), + (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'), + (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')] diff --git a/recipes/pc_lab.recipe b/recipes/pc_lab.recipe index c4b33b8416..7a6038bd65 100644 --- a/recipes/pc_lab.recipe +++ b/recipes/pc_lab.recipe @@ -1,5 +1,4 @@ #!/usr/bin/env python - from calibre.web.feeds.recipes import BasicNewsRecipe class PCLab(BasicNewsRecipe): @@ -8,12 +7,13 @@ class PCLab(BasicNewsRecipe): __author__ = 'ravcio - rlelusz[at]gmail.com' description = u"Articles from PC Lab website" language = 'pl' - oldest_article = 30.0 + oldest_article = 30 max_articles_per_feed = 100 recursions = 0 encoding = 'iso-8859-2' no_stylesheets = True remove_javascript = True + remove_empty_feeds = True use_embedded_content = False keep_only_tags = [ @@ -21,50 +21,54 @@ class PCLab(BasicNewsRecipe): ] remove_tags = [ - dict(name='div', attrs={'class':['chapters']}) - ,dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']}) + dict(name='div', attrs={'class':['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']}) ] - remove_tags_after = [ - dict(name='div', attrs={'class':['navigation']}) - ] - #links to RSS feeds - feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ] + feeds = [ + (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'), + (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'), + (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml') + ] #load second and subsequent page content # in: soup - full page with 'next' button # out: appendtag - tag to which new page is to be added def append_page(self, soup, appendtag): # find the 'Next' button - pager = soup.find('div', attrs={'class':'next'}) - + pager = soup.find('div', attrs={'class':'navigation'}) if pager: + a = pager.find('a') + if 'news' in a['href']: + pager = None + else: + pager = pager.find('div', attrs={'class':'next'}) + + while pager: #search for 'a' element with link to next page (exit if not found) a = pager.find('a') - if a: - nexturl = a['href'] + nexturl = a['href'] + soup2 = self.index_to_soup('http://pclab.pl' + nexturl) + pager = soup2.find('div', attrs={'class':'next'}) + pagetext = soup2.find('div', attrs={'class':'substance'}) + pagetext = pagetext.find('div', attrs={'class':'data'}) - soup2 = self.index_to_soup('http://pclab.pl/' + nexturl) - - pagetext_substance = soup2.find('div', attrs={'class':'substance'}) - pagetext = pagetext_substance.find('div', attrs={'class':'data'}) - pagetext.extract() - - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - pos = len(appendtag.contents) - - self.append_page(soup2, appendtag) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pos = len(appendtag.contents) + pager = soup.find('div', attrs={'class':'navigation'}) + if pager: + pager.extract() def preprocess_html(self, soup): - # soup.body contains no title and no navigator, they are in soup self.append_page(soup, soup.body) - + for link in soup.findAll('a'): + href = link.get('href', None) + if href and href.startswith('/'): + link['href'] = 'http://pclab.pl' + href # finally remove some tags - tags = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) - [tag.extract() for tag in tags] + #for r in soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) return soup diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe index 678ee5c640..b593d6b837 100644 --- a/recipes/spiders_web_pl.recipe +++ b/recipes/spiders_web_pl.recipe @@ -5,11 +5,14 @@ class SpidersWeb(BasicNewsRecipe): oldest_article = 7 __author__ = 'fenuks' description = u'' - cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg' + cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png' category = 'IT, WEB' language = 'pl' no_stylesheers=True + remove_javascript = True + use_embedded_content = False max_articles_per_feed = 100 - keep_only_tags=[dict(id='Post')] - remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']}), dict(id='Author-Column')] + keep_only_tags=[dict(id='start')] + remove_tags_after = dict(attrs={'class':'padding20'}) + remove_tags=[dict(name='div', attrs={'class':['padding border-bottom', 'padding20', 'padding border-top']})] feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')] diff --git a/recipes/wprost.recipe b/recipes/wprost.recipe index 2adac1e113..90dde251ca 100644 --- a/recipes/wprost.recipe +++ b/recipes/wprost.recipe @@ -10,89 +10,89 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class Wprost(BasicNewsRecipe): - EDITION = 0 - FIND_LAST_FULL_ISSUE = True - EXCLUDE_LOCKED = True - ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' + EDITION = 0 + FIND_LAST_FULL_ISSUE = True + EXCLUDE_LOCKED = True + ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' + title = u'Wprost' + __author__ = 'matek09' + description = 'Weekly magazine' + encoding = 'ISO-8859-2' + no_stylesheets = True + language = 'pl' + remove_javascript = True + recursions = 0 + remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + ''' + keep_only_tags =[] + keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'})) + ''' - title = u'Wprost' - __author__ = 'matek09' - description = 'Weekly magazine' - encoding = 'ISO-8859-2' - no_stylesheets = True - language = 'pl' - remove_javascript = True - recursions = 0 - - remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - - '''keep_only_tags =[] - keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))''' - - preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), + preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), (re.compile(r'display: block;'), lambda match: ''), (re.compile(r'\\\<\/table\>'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\
'), lambda match: ''), - (re.compile(r'\