...

2026-01-07 20:50:20 -05:00 · 2011-12-17 21:32:28 +05:30 · 2011-12-17 21:32:28 +05:30 · e8e4246860
commit e8e4246860
parent a85c0e5ffb
6 changed files with 81 additions and 54 deletions
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@ -1,19 +1,38 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
+import re
 class Adventure_zone(BasicNewsRecipe):
    title          = u'Adventure Zone'
    __author__        = 'fenuks'
    description   = 'Adventure zone - adventure games from A to Z'
    category       = 'games'
    language       = 'pl'
-    oldest_article = 15
-    max_articles_per_feed = 100
    no_stylesheets = True
+    oldest_article = 20
+    max_articles_per_feed = 100
+    use_embedded_content=False
+    preprocess_regexps     = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
    remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
-    remove_tags_after= dict(name='td', attrs={'class':'main-body middle-border'})
+    remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})]
+    remove_tags_after= dict(id='comments')
    extra_css              = '.main-bg{text-align: left;}  td.capmain{ font-size: 22px; }'
    feeds          = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]

+    def parse_feeds(self):
+      """Parse the feeds, then overwrite each article title with the <item> title read from the raw RSS."""
+      feeds = BasicNewsRecipe.parse_feeds(self)
+      soup = self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
+      tag = soup.find(name='channel')
+      # <image> elements are detached from the channel before the items are collected.
+      for r in tag.findAll(name='image'):
+          r.extract()
+      titles = [i.title.string for i in tag.findAll(name='item')]
+      for feed in feeds:
+        # Pair by position; zip() stops at the shorter sequence, so a short title list cannot raise IndexError.
+        for article, title in zip(feed.articles, titles):
+            article.title = title
+      return feeds
+
+
    def get_cover_url(self):
        soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
        cover=soup.find(id='box_OstatninumerAZ')
@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe):


    def skip_ad_pages(self, soup):
-        skip_tag = soup.body.findAll(name='a')
-        if skip_tag is not None:
-            for r in skip_tag:
-                 if 'articles.php?' in r['href']:
-                     if r.strong is not None:
-                         word=r.strong.string
-                         if ('zapowied' or 'recenzj') in word:
-                             return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True)
-        else:
-            None
-
-    def print_version(self, url):
-        return url.replace('news.php?readmore', 'print.php?type=N&item_id')
-
+        skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})  # limit the link scan to the main-bg cell
+        skip_tag = skip_tag.findAll(name='a')  # every anchor inside that cell
+        for r in skip_tag:
+           if r.strong:  # only anchors that wrap a <strong> element are considered
+                 word=r.strong.string
+                 if word and (('zapowied' in word) or ('recenzj' in word)  or ('solucj' in word)):  # word stems matched as substrings of the bold link text
+                   return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)  # +7 skips 'article', so 'item' + '_id...' rebuilds the item_id query; returns the raw print page for the first match
--- a/recipes/astro_news_pl.recipe
+++ b/recipes/astro_news_pl.recipe
@ -1,5 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
 class AstroNEWS(BasicNewsRecipe):
    title          = u'AstroNEWS'
    __author__        = 'fenuks'
@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe):
    language       = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
-    auto_cleanup = True
+    #extra_css= 'table {text-align: left;}'
+    no_stylesheets=True
    cover_url='http://news.astronet.pl/img/logo_news.jpg'
-   # no_stylesheets= True
+    remove_tags=[dict(name='hr')]
    feeds          = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]

    def print_version(self, url):
        return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')

+    def preprocess_html(self, soup):
+        for item in soup.findAll(align=True):  # every tag that carries an align attribute
+            del item['align']  # drop the attribute, keep the tag and its contents
+        return soup  # the same soup object, modified in place
--- a/recipes/focus_pl.recipe
+++ b/recipes/focus_pl.recipe
@ -12,8 +12,9 @@ class Focus_pl(BasicNewsRecipe):
    cover_url=''
    remove_empty_feeds= True
    no_stylesheets=True
-    remove_tags_before=dict(name='div', attrs={'class':'h2 h2f'})
-    remove_tags_after=dict(name='div', attrs={'class':'clear'})
+    #remove_tags_before=dict(name='div', attrs={'class':'h2 h2f'})
+    #remove_tags_after=dict(name='div', attrs={'class':'clear'})
+    keep_only_tags=[dict(name='div', attrs={'class':['h2 h2f', 'news-left', 'news-right']})]
    feeds          = [(u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
 	(u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
 	(u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
@ -23,35 +24,33 @@ class Focus_pl(BasicNewsRecipe):
 	(u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
 	(u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
 	(u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),
-
-
-
-]
+           ]

    def skip_ad_pages(self, soup):
-          tag=soup.find(name='a')
-          if tag:
-            new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True)
-            return new_soup
+          if 'Advertisement' in (getattr(soup.title, 'string', None) or ''):  # substring test on the title text; a missing <title> or a title without a single string counts as "not an ad page"
+              tag=soup.find(name='a')  # first link on the interstitial page
+              if tag:
+                 new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True)  # fetch the 'do-druku/1/' variant of the linked URL as raw markup
+                 return new_soup

    def append_page(self, appendtag):
-        tag=appendtag.find(name='div', attrs={'class':'arrows'})
-        if tag:
-            nexturl='http://www.focus.pl/'+tag.a['href']
-            for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
-                rem.extract()
-            while nexturl:
-                 soup2=self.index_to_soup(nexturl)
-                 nexturl=None
-                 pagetext=soup2.find(name='div', attrs={'class':'txt'})
-                 tag=pagetext.find(name='div', attrs={'class':'arrows'})
-                 for r in tag.findAll(name='a'):
-                     if u'Następne' in r.string:
-                         nexturl='http://www.focus.pl/'+r['href']
-                 for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
-                     rem.extract()
-                 pos = len(appendtag.contents)
-                 appendtag.insert(pos, pagetext)
+            tag=appendtag.find(name='div', attrs={'class':'arrows'})  # pagination block; without it nothing is appended
+            if tag:
+                nexturl='http://www.focus.pl/'+tag.a['href']  # first link inside the pagination block
+                for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
+                    rem.extract()
+                while nexturl:
+                     soup2=self.index_to_soup(nexturl)
+                     nexturl=None  # stays None, ending the loop, unless a next-page link is found below
+                     pagetext=soup2.find(name='div', attrs={'class':'txt'})
+                     tag=pagetext.find(name='div', attrs={'class':'arrows'})
+                     for r in (tag.findAll(name='a') if tag is not None else []):  # a page without a pagination block ends the walk instead of raising
+                         if r.string and u'Następne' in r.string:  # r.string is None for anchors with nested markup
+                             nexturl='http://www.focus.pl/'+r['href']
+                     for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
+                         rem.extract()
+                     pos = len(appendtag.contents)
+                     appendtag.insert(pos, pagetext)  # append this page's text after everything collected so far

    def get_cover_url(self):
        soup=self.index_to_soup('http://www.focus.pl/magazyn/')
--- a/recipes/naczytniki.recipe
+++ b/recipes/naczytniki.recipe
@ -7,6 +7,7 @@ class naczytniki(BasicNewsRecipe):
    language       = 'pl'
    description ='everything about e-readers'
    category='readers'
+    no_stylesheets=True
    oldest_article = 7
    max_articles_per_feed = 100
    remove_tags_after= dict(name='div', attrs={'class':'sociable'})
--- a/recipes/nowa_fantastyka.recipe
+++ b/recipes/nowa_fantastyka.recipe
@ -1,20 +1,21 @@
 # -*- coding: utf-8 -*-
 from calibre.web.feeds.news import BasicNewsRecipe
-
 class Nowa_Fantastyka(BasicNewsRecipe):
    title          = u'Nowa Fantastyka'
    oldest_article = 7
    __author__        = 'fenuks'
    language       = 'pl'
+    encoding='latin2'
    description ='site for fantasy readers'
    category='fantasy'
    max_articles_per_feed = 100
    INDEX='http://www.fantastyka.pl/'
+    no_stylesheets=True
+    needs_subscription = 'optional'
    remove_tags_before=dict(attrs={'class':'belka1-tlo-md'})
    #remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'})
    remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'})
-    remove_tags=[dict(attrs={'class':'avatar2'})]
-    feeds          = []
+    remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})]

    def find_articles(self, url):
        articles = []
@ -45,3 +46,13 @@ class Nowa_Fantastyka(BasicNewsRecipe):
        cover=soup.find(name='img', attrs={'class':'okladka'})
        self.cover_url=self.INDEX+ cover['src']
        return getattr(self, 'cover_url', self.cover_url)
+
+    def get_browser(self):  # build the browser and, when both credentials are set, submit them through the first form on the front page
+        br = BasicNewsRecipe.get_browser(self)  # called through the class, so self must be passed explicitly
+        if self.username is not None and self.password is not None:
+            br.open('http://www.fantastyka.pl/')
+            br.select_form(nr=0)  # the first form on the page is used as the login form
+            br['login']   = self.username
+            br['pass'] = self.password
+            br.submit()
+        return br
--- a/recipes/spiders_web_pl.recipe
+++ b/recipes/spiders_web_pl.recipe
@ -8,8 +8,8 @@ class SpidersWeb(BasicNewsRecipe):
    cover_url      = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg'
    category       = 'IT, WEB'
    language       = 'pl'
+    no_stylesheets=True  # must match the BasicNewsRecipe option name exactly; any other attribute name is just an unused class attribute
    max_articles_per_feed = 100
-    remove_tags_before=dict(name="h1", attrs={'class':'Title'})
-    remove_tags_after=dict(name="div", attrs={'class':'Text'})
-    remove_tags=[dict(name='div', attrs={'class':['Tags', 'CommentCount FloatL', 'Show FloatL']})]
+    keep_only_tags=[dict(id='Post')]
+    remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']})]
    feeds          = [(u'Wpisy', u'http://www.spidersweb.pl/feed')]