From a487b6ca00b213872442ad155dd146d3c5c64d1d Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 21 Jan 2012 17:10:06 +0530
Subject: [PATCH] Gazeta Wyborcza by ravcio. Fixes #919546 (new recipe (GW Duzy Format - lang: polish))
---
 recipes/wyborcza_duzy_format.recipe | 133 ++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 recipes/wyborcza_duzy_format.recipe

diff --git a/recipes/wyborcza_duzy_format.recipe b/recipes/wyborcza_duzy_format.recipe
new file mode 100644
index 0000000000..30b0cfe418
--- /dev/null
+++ b/recipes/wyborcza_duzy_format.recipe
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class GazetaWyborczaDuzyForma(BasicNewsRecipe):
+    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
+    title = u"Gazeta Wyborcza Duzy Format"
+    __author__ = 'ravcio - rlelusz[at]gmail.com'
+    description = u"Articles from Gazeta's website"
+    language = 'pl'
+    max_articles_per_feed = 50  # you can increase it, even up to maybe 600; it should still work
+    recursions = 0
+    encoding = 'iso-8859-2'
+    no_stylesheets = True
+    remove_javascript = True
+    use_embedded_content = False
+
+    keep_only_tags = [
+        dict(name='div', attrs={'id': ['k1']})
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'class': ['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
+        ,dict(name='div', attrs={'id': ['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
+        ,dict(name='ul', attrs={'id': ['articleToolbar']})
+        ,dict(name='img', attrs={'class': ['brand']})
+        ,dict(name='h5', attrs={'class': ['author']})
+        ,dict(name='h6', attrs={'class': ['date']})
+        ,dict(name='p', attrs={'class': ['txt_upl']})
+    ]
+
+    remove_tags_after = [
+        dict(name='div', attrs={'id': ['Str']})  # page-number navigation
+    ]
+
+    def load_article_links(self, url, count):
+        print '--- load_article_links', url, count
+
+        # page with links to articles
+        soup = self.index_to_soup(url)
+
+        # container holding the list of articles
+        art_list = soup.find('div', attrs={'class': 'GWdalt'})
+
+        # single articles (link, title, ...)
+        links = art_list.findAll('div', attrs={'class': ['GWdaltE']})
+
+        if len(links) < count:
+            # follow the 'next' link to load more articles
+            pages_nav = art_list.find('div', attrs={'class': 'pages'})
+            next_page = pages_nav.find('a', attrs={'class': 'next'})
+            if next_page:
+                print 'next=', next_page['href']
+                url = 'http://wyborcza.pl' + next_page['href']
+                # e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
+                older_links = self.load_article_links(url, count - len(links))
+                links.extend(older_links)
+
+        return links
+
+    # produce the list of articles to download
+    def parse_index(self):
+        print '--- parse_index'
+
+        max_articles = 8000
+        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
+
+        key = 'Uncategorized'
+        articles = {key: []}
+
+        for div_art in links:
+            div_date = div_art.find('div', attrs={'class': 'kL'})
+            div = div_art.find('div', attrs={'class': 'kR'})
+
+            a = div.find('a', href=True)
+
+            url = a['href']
+            title = a.string
+            pubdate = div_date.string.strip()
+            summary = div.find('span', attrs={'class': 'lead'})
+
+            # drop the embedded link inside the lead, keep only its text
+            desc = summary.find('a', href=True)
+            if desc:
+                desc.extract()
+
+            description = self.tag_to_string(summary, use_alt=False).strip()
+
+            feed = key
+            if feed not in articles:
+                articles[feed] = []
+
+            if description != '':  # skip articles that are only pictures
+                articles[feed].append(
+                    dict(title=title, url=url, date=pubdate,
+                         description=description, content=''))
+
+        ans = [(key, articles[key])]
+        return ans
+
+    def append_page(self, soup, appendtag, position):
+        pager = soup.find('div', attrs={'id': 'Str'})
+        if pager:
+            # look for an 'a' element whose text contains 'nast' ("next"); if none, stop
+            nexturl = None
+            for elem in pager.findAll('a'):
+                if elem.string and 'nast' in elem.string:
+                    nexturl = elem['href']
+
+            if nexturl:
+                soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
+
+                texttag = soup2.find('div', attrs={'id': 'artykul'})
+
+                newpos = len(texttag.contents)
+                self.append_page(soup2, texttag, newpos)
+                texttag.extract()
+                appendtag.insert(position, texttag)
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body, 3)
+
+        # finally, remove the pager tags
+        pager = soup.find('div', attrs={'id': 'Str'})
+        if pager:
+            pager.extract()
+
+        pager = soup.find('div', attrs={'class': 'tylko_int'})
+        if pager:
+            pager.extract()
+
+        return soup
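
Testing note: a recipe file like this can be previewed from the command line with calibre's ebook-convert tool. A minimal sketch, assuming calibre is installed and the recipe is saved as wyborcza_duzy_format.recipe (the output filename here is arbitrary):

    ebook-convert wyborcza_duzy_format.recipe output.epub --test -vv

The --test flag limits the download to a couple of articles per feed, which keeps the edit/debug cycle short while still exercising keep_only_tags, remove_tags, and the pagination logic against the live site.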