mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Gazeta Wyborcza by ravcio. Fixes #919546 (new recipe (GW Duzy Format - lang: polish ))
This commit is contained in:
parent 0d807994b7
commit a487b6ca00
144
recipes/wyborcza_duzy_format.recipe
Normal file
@@ -0,0 +1,144 @@
#!/usr/bin/env python

from calibre.web.feeds.recipes import BasicNewsRecipe


class GazetaWyborczaDuzyForma(BasicNewsRecipe):

    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
    title = u"Gazeta Wyborcza Duzy Format"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from Gazeta's website"
    language = 'pl'
    max_articles_per_feed = 50  # you can increase this even up to about 600; it should still work
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'id': ['k1']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
        ,dict(name='div', attrs={'id': ['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
        ,dict(name='ul', attrs={'id': ['articleToolbar']})
        ,dict(name='img', attrs={'class': ['brand']})
        ,dict(name='h5', attrs={'class': ['author']})
        ,dict(name='h6', attrs={'class': ['date']})
        ,dict(name='p', attrs={'class': ['txt_upl']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id': ['Str']})  # page-number navigator
    ]

    def load_article_links(self, url, count):
        print '--- load_article_links', url, count

        # index page with links to articles
        soup = self.index_to_soup(url)

        # container holding the article list
        main_div = soup.find('div', attrs={'class': 'GWdalt'})

        # individual article entries (link, title, ...)
        links = main_div.findAll('div', attrs={'class': ['GWdaltE']})

        if len(links) < count:
            # not enough articles yet: follow the 'next' page link and load more
            pages_nav = main_div.find('div', attrs={'class': 'pages'})
            next_link = pages_nav.find('a', attrs={'class': 'next'})
            if next_link:
                print 'next=', next_link['href']
                url = 'http://wyborcza.pl' + next_link['href']
                # e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'

                older_links = self.load_article_links(url, count - len(links))
                links.extend(older_links)

        return links

    # produce the list of articles to download;
    # parse_index() must return a list of (feed title, list of article dicts) tuples
    def parse_index(self):
        print '--- parse_index'

        max_articles = 8000
        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)

        ans = []
        key = 'Uncategorized'
        articles = {}
        articles[key] = []

        for div_art in links:
            div_date = div_art.find('div', attrs={'class': 'kL'})
            div = div_art.find('div', attrs={'class': 'kR'})

            a = div.find('a', href=True)

            url = a['href']
            title = a.string
            description = ''
            pubdate = div_date.string.strip()
            summary = div.find('span', attrs={'class': 'lead'})

            desc = summary.find('a', href=True)
            if desc:
                desc.extract()

            description = self.tag_to_string(summary, use_alt=False)
            description = description.strip()

            feed = key if key is not None else 'Duzy Format'

            if feed not in articles:
                articles[feed] = []

            if description != '':  # skip picture-only articles
                articles[feed].append(
                    dict(title=title, url=url, date=pubdate,
                         description=description,
                         content=''))

        ans = [(key, articles[key])]
        return ans

    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            # look for an 'a' element whose text contains 'nast' (next page); stop if none is found
            nav_links = pager.findAll('a')

            for elem in nav_links:
                if elem.string and 'nast' in elem.string:
                    nexturl = elem['href']

                    soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)

                    texttag = soup2.find('div', attrs={'id': 'artykul'})

                    # recurse into the following page, then graft its text into the current article
                    newpos = len(texttag.contents)
                    self.append_page(soup2, texttag, newpos)
                    texttag.extract()
                    appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)

        # finally remove leftover navigation tags
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            pager.extract()

        pager = soup.find('div', attrs={'class': 'tylko_int'})
        if pager:
            pager.extract()

        return soup
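A quick way to try a recipe like this locally is calibre's ebook-convert tool, which can build an e-book straight from a .recipe file; the --test switch limits the download to a couple of articles per feed and -vv raises verbosity. This is a sketch only, assuming the file above is saved as wyborcza_duzy_format.recipe in the current directory:

    ebook-convert wyborcza_duzy_format.recipe output.epub --test -vv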