mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
remove DF as it is paywalled
This commit is contained in:
parent 673f8aa9ad
commit 59e81ea077
@@ -1,144 +0,0 @@
#!/usr/bin/env python

from calibre.web.feeds.recipes import BasicNewsRecipe

class GazetaWyborczaDuzyForma(BasicNewsRecipe):
    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
    title = u"Gazeta Wyborcza Duzy Format"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from Gazeta's website"
    language = 'pl'
    max_articles_per_feed = 50  # you can increase it even up to maybe 600, should still work
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

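    # keep only the div with id 'k1' (presumably the article body container);
    # the remove_tags rules below strip leftover chrome such as toolbars,
    # ad slots, and related-content boxes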
    keep_only_tags = [
        dict(name='div', attrs={'id': ['k1']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']}),
        dict(name='div', attrs={'id': ['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']}),
        dict(name='ul', attrs={'id': ['articleToolbar']}),
        dict(name='img', attrs={'class': ['brand']}),
        dict(name='h5', attrs={'class': ['author']}),
        dict(name='h6', attrs={'class': ['date']}),
        dict(name='p', attrs={'class': ['txt_upl']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id': ['Str']})  # page number navigator
    ]

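    # Collect article entry divs from the paginated index, following the
    # 'next' link recursively until `count` links are gathered or the
    # pages run out.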
    def load_article_links(self, url, count):
        print('--- load_article_links', url, count)

        # page with links to articles
        soup = self.index_to_soup(url)

        # table with articles
        articles_list = soup.find('div', attrs={'class': 'GWdalt'})

        # single articles (link, title, ...)
        links = articles_list.findAll('div', attrs={'class': ['GWdaltE']})

        if len(links) < count:
            # load links to more articles...

            # find the link to the next index page
            pages_nav = articles_list.find('div', attrs={'class': 'pages'})
            next_link = pages_nav.find('a', attrs={'class': 'next'})
            if next_link:
                print('next=', next_link['href'])
                url = 'http://wyborcza.pl' + next_link['href']
                # e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'

                older_links = self.load_article_links(url, count - len(links))
                links.extend(older_links)

        return links

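    # parse_index() is the calibre entry point for non-RSS recipes; it must
    # return a list of (feed title, list of article dicts) tuples.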
    # produce the list of articles to download
    def parse_index(self):
        print('--- parse_index')

        max_articles = 8000
        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)

        ans = []
        articles = {}

        key = 'Uncategorized'
        articles[key] = []

        for div_art in links:
            div_date = div_art.find('div', attrs={'class': 'kL'})
            div = div_art.find('div', attrs={'class': 'kR'})

            a = div.find('a', href=True)

            url = a['href']
            title = a.string
            description = ''
            pubdate = div_date.string.strip()
            summary = div.find('span', attrs={'class': 'lead'})

            # drop the embedded link from the summary before converting it to text
            desc = summary.find('a', href=True)
            if desc:
                desc.extract()

            description = self.tag_to_string(summary, use_alt=False)
            description = description.strip()

            feed = key if key is not None else 'Duzy Format'

            if feed not in articles:
                articles[feed] = []

            if description != '':  # skip picture-only articles
                articles[feed].append(
                    dict(title=title, url=url, date=pubdate,
                         description=description,
                         content=''))

        ans = [(key, articles[key])]
        return ans

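    # Multi-page articles: follow the pager link whose text contains 'nast'
    # (short for 'nastepna', Polish for 'next'), recurse to pick up any
    # further pages, and splice the fetched article text into the current soup.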
    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            # look for an 'a' element whose text contains 'nast' (if not found, exit)
            anchors = pager.findAll('a')

            for elem in anchors:
                if elem.string and 'nast' in elem.string:
                    nexturl = elem['href']

                    soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)

                    texttag = soup2.find('div', attrs={'id': 'artykul'})

                    # recurse first so later pages are appended in order,
                    # then graft the fetched text into the current document
                    newpos = len(texttag.contents)
                    self.append_page(soup2, texttag, newpos)
                    texttag.extract()
                    appendtag.insert(position, texttag)

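    # preprocess_html() is called with the soup of each downloaded article;
    # stitch in the remaining pages, then drop the pager and the
    # 'tylko_int' notice block.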
    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)

        # finally remove some tags
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            pager.extract()

        pager = soup.find('div', attrs={'class': 'tylko_int'})
        if pager:
            pager.extract()

        return soup