mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Autosport by MrStefan and various new Polish recipes by Artur Stachecki
This commit is contained in:
commit
6b69085a39
42
.bzrignore
42
.bzrignore
@ -39,3 +39,45 @@ recipes/.git
|
||||
recipes/.gitignore
|
||||
recipes/README
|
||||
recipes/katalog_egazeciarz.recipe
|
||||
recipes/tv_axnscifi.recipe
|
||||
recipes/tv_comedycentral.recipe
|
||||
recipes/tv_discoveryscience.recipe
|
||||
recipes/tv_foxlife.recipe
|
||||
recipes/tv_fox.recipe
|
||||
recipes/tv_hbo.recipe
|
||||
recipes/tv_kinopolska.recipe
|
||||
recipes/tv_nationalgeographic.recipe
|
||||
recipes/tv_polsat2.recipe
|
||||
recipes/tv_polsat.recipe
|
||||
recipes/tv_tv4.recipe
|
||||
recipes/tv_tvn7.recipe
|
||||
recipes/tv_tvn.recipe
|
||||
recipes/tv_tvp1.recipe
|
||||
recipes/tv_tvp2.recipe
|
||||
recipes/tv_tvphd.recipe
|
||||
recipes/tv_tvphistoria.recipe
|
||||
recipes/tv_tvpkultura.recipe
|
||||
recipes/tv_tvppolonia.recipe
|
||||
recipes/tv_tvpuls.recipe
|
||||
recipes/tv_viasathistory.recipe
|
||||
recipes/icons/tv_axnscifi.png
|
||||
recipes/icons/tv_comedycentral.png
|
||||
recipes/icons/tv_discoveryscience.png
|
||||
recipes/icons/tv_foxlife.png
|
||||
recipes/icons/tv_fox.png
|
||||
recipes/icons/tv_hbo.png
|
||||
recipes/icons/tv_kinopolska.png
|
||||
recipes/icons/tv_nationalgeographic.png
|
||||
recipes/icons/tv_polsat2.png
|
||||
recipes/icons/tv_polsat.png
|
||||
recipes/icons/tv_tv4.png
|
||||
recipes/icons/tv_tvn7.png
|
||||
recipes/icons/tv_tvn.png
|
||||
recipes/icons/tv_tvp1.png
|
||||
recipes/icons/tv_tvp2.png
|
||||
recipes/icons/tv_tvphd.png
|
||||
recipes/icons/tv_tvphistoria.png
|
||||
recipes/icons/tv_tvpkultura.png
|
||||
recipes/icons/tv_tvppolonia.png
|
||||
recipes/icons/tv_tvpuls.png
|
||||
recipes/icons/tv_viasathistory.png
|
||||
|
30
recipes/autosport.recipe
Normal file
30
recipes/autosport.recipe
Normal file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'MrStefan <mrstefaan@gmail.com>'
|
||||
|
||||
'''
|
||||
www.autosport.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class autosport(BasicNewsRecipe):
    """Daily Formula 1 and motorsport news recipe for www.autosport.com."""
    title = u'Autosport'
    __author__ = 'MrStefan <mrstefaan@gmail.com>'
    language = 'en_GB'
    description = u'Daily Formula 1 and motorsport news from the leading weekly motor racing magazine. The authority on Formula 1, F1, MotoGP, GP2, Champ Car, Le Mans...'
    masthead_url = 'http://cdn.images.autosport.com/asdotcom.gif'
    remove_empty_feeds = True
    oldest_article = 1  # news-heavy site: fetch only the last day
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True

    # Keep only the headline, author, date and body paragraphs of each article.
    # (Literal list instead of the original repeated .append calls.)
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'news_headline'}),
        dict(name='td', attrs={'class': 'news_article_author'}),
        dict(name='td', attrs={'class': 'news_article_date'}),
        dict(name='p'),
    ]

    feeds = [(u'ALL NEWS', u'http://www.autosport.com/rss/allnews.xml')]
|
28
recipes/blognexto.recipe
Normal file
28
recipes/blognexto.recipe
Normal file
@ -0,0 +1,28 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class blognexto(BasicNewsRecipe):
    """Recipe for BLOG.NEXTO.pl - Polish blog about e-publishing."""
    title = 'BLOG.NEXTO.pl'
    __author__ = 'MrStefan <mrstefaan@gmail.com>'
    language = 'pl'
    description = 'o e-publikacjach prawie wszystko'
    masthead_url = 'http://blog.nexto.pl/wp-content/uploads/2012/04/logo-blog-nexto.pl_.jpg'
    remove_empty_feeds = True
    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True

    # Keep the main article container only.
    # (Literal lists instead of the original repeated .append calls.)
    keep_only_tags = [
        dict(name='div', attrs={'id': 'content'}),
    ]

    # Strip social widgets, navigation and comment machinery from the body.
    remove_tags = [
        dict(name='div', attrs={'class': 'comment-cloud'}),
        dict(name='p', attrs={'class': 'post-date1'}),
        dict(name='div', attrs={'class': 'fb-like'}),
        dict(name='div', attrs={'class': 'tags'}),
        dict(name='div', attrs={'class': 'postnavi'}),
        dict(name='div', attrs={'class': 'commments-box'}),
        dict(name='div', attrs={'id': 'respond'}),
    ]

    feeds = [('Artykuly', 'http://feeds.feedburner.com/blognexto')]
|
140
recipes/brewiarz.recipe
Normal file
140
recipes/brewiarz.recipe
Normal file
@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import datetime, re
|
||||
|
||||
|
||||
class brewiarz(BasicNewsRecipe):
    """Recipe for brewiarz.pl - the Polish Liturgy of the Hours (breviary).

    The site has no RSS feed, so the index is built by scraping the page
    for each requested day; site URLs encode the month as a Roman numeral.
    """
    title = u'Brewiarz'
    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    description = u'Serwis poświęcony Liturgii Godzin (brewiarzowi) - formie codziennej modlitwy Kościoła katolickiego.'
    masthead_url = 'http://brewiarz.pl/images/logo2.gif'
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    publication_type = 'newspaper'
    next_days = 1  # number of consecutive days (starting today) to fetch

    def parse_index(self):
        """Build one feed per day, falling back to per-sector sub-pages."""
        # Month number -> lowercase Roman numeral, as used in the site's URLs.
        dec2rom_dict = {"01": "i", "02": "ii", "03": "iii", "04": "iv",
                        "05": "v", "06": "vi", "07": "vii", "08": "viii",
                        "09": "ix", "10": "x", "11": "xi", "12": "xii"}

        # English weekday name (from strftime) -> Polish feed title prefix.
        weekday_dict = {"Sunday": "Niedziela", "Monday": "Poniedziałek", "Tuesday": "Wtorek",
                        "Wednesday": "Środa", "Thursday": "Czwartek", "Friday": "Piątek", "Saturday": "Sobota"}

        now = datetime.datetime.now()

        feeds = []
        for i in range(0, self.next_days):
            url_date = now + datetime.timedelta(days=i)
            url_date_month = url_date.strftime("%m")
            url_date_month_roman = dec2rom_dict[url_date_month]
            url_date_day = url_date.strftime("%d")
            url_date_year = url_date.strftime("%Y")[2:]  # two-digit year
            url_date_weekday = url_date.strftime("%A")
            url_date_weekday_pl = weekday_dict[url_date_weekday]

            url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/index.php3"
            articles = self.parse_pages(url)
            if articles:
                title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." + url_date_year
                feeds.append((title, articles))
            else:
                # No article list on the day's index page: the day is split
                # into sector sub-pages, so scrape each of them separately.
                sectors = self.get_sectors(url)
                for subpage in sectors:
                    title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." + url_date_year + " - " + subpage.string
                    url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/" + subpage['href']
                    print(url)
                    articles = self.parse_pages(url)
                    if articles:
                        feeds.append((title, articles))
        return feeds

    def get_sectors(self, url):
        """Return the sector <a> links from a day's index page."""
        sectors = []
        soup = self.index_to_soup(url)
        sectors_table = soup.find(name='table', attrs={'width': '490'})
        sector_links = sectors_table.findAll(name='a')
        for sector_links_modified in sector_links:
            link_parent_text = sector_links_modified.findParent(name='div').text
            if link_parent_text:
                # BUG FIX: link_parent_text is already the string produced by
                # .text above; the original accessed link_parent_text.text,
                # which raised AttributeError on every non-empty parent.
                sector_links_modified.text = link_parent_text
            sectors.append(sector_links_modified)
        return sectors

    def parse_pages(self, url):
        """Parse a day (or sector) page into a list of article dicts.

        Returns None when the page lacks the 'www' navigation box, which
        signals to parse_index that it must fall back to sector sub-pages.
        """
        current_articles = []
        soup = self.index_to_soup(url)
        www = soup.find(attrs={'class': 'www'})
        if www:
            box_title = www.find(text='Teksty LG')
            article_box_parent = box_title.findParent('ul')
            article_box_sibling = article_box_parent.findNextSibling('ul')
            for li in article_box_sibling.findAll('li'):
                link = li.find(name='a')
                ol = link.findNextSibling(name='ol')
                if ol:
                    # An <ol> after the link holds grouped sub-articles;
                    # emit one entry per sub-link, titled "group - part".
                    sublinks = ol.findAll(name='a')
                    for sublink in sublinks:
                        link_title = self.tag_to_string(link) + " - " + self.tag_to_string(sublink)
                        # Request the printer-friendly version of the page.
                        link_url_print = re.sub('php3', 'php3?kr=_druk&wr=lg&', sublink['href'])
                        # url[:-10] drops the trailing 'index.php3' component.
                        link_url = url[:-10] + link_url_print
                        current_articles.append({'title': link_title,
                                                 'url': link_url, 'description': '', 'date': ''})
                else:
                    if link.findParent(name='ol'):
                        # Already emitted as a sub-article of an earlier link.
                        continue
                    else:
                        link_title = self.tag_to_string(link)
                        link_url_print = re.sub('php3', 'php3?kr=_druk&wr=lg&', link['href'])
                        link_url = url[:-10] + link_url_print
                        current_articles.append({'title': link_title,
                                                 'url': link_url, 'description': '', 'date': ''})
            return current_articles
        else:
            return None

    def preprocess_html(self, soup):
        """Strip site chrome (header, footer, vestment color) and extra divs."""
        footer = soup.find(name='a', attrs={'href': 'http://brewiarz.pl'})
        footer_parent = footer.findParent('div')
        footer_parent.extract()

        header = soup.find(text='http://brewiarz.pl')
        header_parent = header.findParent('div')
        header_parent.extract()

        subheader = soup.find(text='Kolor szat:').findParent('div')
        subheader.extract()

        color = soup.find('b')
        color.extract()

        cleaned = self.strip_tags(soup)

        # Drop leftover layout divs by position (findAll snapshots the list,
        # so the indices refer to the pre-extraction document order).
        div = cleaned.findAll(name='div')
        div[1].extract()
        div[2].extract()
        div[3].extract()

        return cleaned

    def strip_tags(self, soup_dirty):
        """Unwrap every tag not in VALID_TAGS, keeping its children in place."""
        VALID_TAGS = ['p', 'div', 'br', 'b', 'a', 'title', 'head', 'html', 'body']

        for tag in soup_dirty.findAll(True):
            if tag.name not in VALID_TAGS:
                # Locate the tag's position among its parent's children so
                # its contents can be re-inserted at the same spot.
                for i, x in enumerate(tag.parent.contents):
                    if x == tag:
                        break
                else:
                    # FIX: was a Python 2 print statement; a formatted
                    # single-argument print() is portable and prints the same.
                    print("Can't find %s in %s" % (tag, tag.parent))
                    continue
                for r in reversed(tag.contents):
                    tag.parent.insert(i, r)
                tag.extract()

        return soup_dirty
|
@ -6,7 +6,6 @@ class Dobreprogramy_pl(BasicNewsRecipe):
|
||||
__author__ = 'fenuks'
|
||||
__licence__ ='GPL v3'
|
||||
category = 'IT'
|
||||
language = 'pl'
|
||||
masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png'
|
||||
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
|
||||
description = u'Aktualności i blogi z dobreprogramy.pl'
|
||||
|
BIN
recipes/icons/autosport.png
Normal file
BIN
recipes/icons/autosport.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 415 B |
BIN
recipes/icons/blognexto.png
Normal file
BIN
recipes/icons/blognexto.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 699 B |
BIN
recipes/icons/brewiarz.png
Normal file
BIN
recipes/icons/brewiarz.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 982 B |
BIN
recipes/icons/naszdziennik.png
Normal file
BIN
recipes/icons/naszdziennik.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 698 B |
BIN
recipes/icons/wprost.png
Normal file
BIN
recipes/icons/wprost.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.7 KiB |
Loading…
x
Reference in New Issue
Block a user