Update Focus (PL)

2025-12-18 02:55:11 -05:00 · 2013-10-04 15:43:09 +05:30 · 2013-10-04 15:43:09 +05:30 · 9a9daa7908
commit 9a9daa7908
parent 002886b0ba
1 changed files with 41 additions and 75 deletions
--- a/recipes/focus_pl.recipe
+++ b/recipes/focus_pl.recipe
@@ -1,85 +1,51 @@
-#!/usr/bin/env python
+from calibre.web.feeds.recipes import BasicNewsRecipe
 __license__ = 'GPL v3'
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
-class FocusRecipe(BasicNewsRecipe):
+class NYTimes(BasicNewsRecipe):
-    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
+    title = 'Focus'
    __author__ = 'Krittika Goyal'
    language = 'pl'
-    version = 1
+    description = 'Polish scientific monthly magazine'
-
+    timefmt = ' [%d %b, %Y]'
-    title = u'Focus'
+    needs_subscription = False
    publisher = u'Gruner + Jahr Polska'
    category = u'News'
    description = u'Focus.pl - pierwszy w Polsce portal społecznościowy dla miłośników nauki. Tematyka: nauka, historia, cywilizacja, technika, przyroda, sport, gadżety'
    category = 'magazine'
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100000
    recursions = 0
    no_stylesheets = True
-    remove_javascript = True
+    keep_only_tags = dict(name='article', attrs={'class': 'content'})
-    encoding = 'utf-8'
+    remove_tags_after = dict(name='div', attrs={'class': 'inner_article'})
-    # Seems to work best, but YMMV
+    remove_tags = [
-    simultaneous_downloads = 5
+        dict(name='div', attrs={'class': ['social_btns']}),
    r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'id': 'cll'}))
    remove_tags = []
    remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'}))
    remove_tags.append(dict(name='div', attrs={'class': 'txb'}))
    remove_tags.append(dict(name='div', attrs={'class': 'h2'}))
    remove_tags.append(dict(name='ul', attrs={'class': 'txu'}))
    remove_tags.append(dict(name='div', attrs={'class': 'ulc'}))
    extra_css = '''
                    body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
                    h1{text-align: left;}
                    h2{font-size: medium; font-weight: bold;}
                    p.lead {font-weight: bold; text-align: left;}
                    .authordate {font-size: small; color: #696969;}
                    .fot{font-size: x-small; color: #666666;}
                    '''
    feeds = [
        ('Nauka', 'http://www.focus.pl/nauka/rss/'),
        ('Historia', 'http://www.focus.pl/historia/rss/'),
        ('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'),
        ('Sport', 'http://www.focus.pl/sport/rss/'),
        ('Technika', 'http://www.focus.pl/technika/rss/'),
        ('Przyroda', 'http://www.focus.pl/przyroda/rss/'),
        ('Technologie', 'http://www.focus.pl/gadzety/rss/')
    ]
-    def skip_ad_pages(self, soup):
+    # TO GET ARTICLE TOC
-        if ('advertisement' in soup.find('title').string.lower()):
+    def nejm_get_index(self):
-            href = soup.find('a').get('href')
+            return self.index_to_soup('http://www.focus.pl/')
            return self.index_to_soup(href, raw=True)
        else:
            return None
-    def get_cover_url(self):
+    # To parse article toc
-        soup = self.index_to_soup('http://www.focus.pl/magazyn/')
+    def parse_index(self):
-        tag = soup.find(name='div', attrs={'class': 'clr fl'})
+            soup = self.nejm_get_index()
        if tag:
            self.cover_url = 'http://www.focus.pl/' + tag.a['href']
            return getattr(self, 'cover_url', self.cover_url)
-    def print_version(self, url):
+            toc = soup.find('div', id='wrapper')
-        if url.count('focus.pl.feedsportal.com'):
+
-            u = url.find('focus0Bpl')
+            articles = []
-            u = 'http://www.focus.pl/' + url[u + 11:]
+            feeds = []
-            u = u.replace('0C', '/')
+            section_title = 'Focus Articles'
-            u = u.replace('A', '')
+            for x in toc.findAll(True):
-            u = u.replace('0E', '-')
+                if x.name == 'h1':
-            u = u.replace('/nc/1//story01.htm', '/do-druku/1')
+                    # Article found
-        else:
+                    a = x.find('a')
-            u = url.replace('/nc/1', '/do-druku/1')
+                    if a is None:
-        return u
+                        continue
                    title = self.tag_to_string(a)
                    url = a.get('href', False)
                    if not url or not title:
                        continue
                    # if url.startswith('story'):
                    url = 'http://www.focus.pl' + url
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    articles.append({'title': title, 'url': url,
                                     'description': '', 'date': ''})
            feeds.append((section_title, articles))
            return feeds