From a487b6ca00b213872442ad155dd146d3c5c64d1d Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 21 Jan 2012 17:10:06 +0530
Subject: [PATCH] Gazeta Wyborcza by ravcio. Fixes #919546 (new recipe (GW Duzy Format - lang: polish))
---
 recipes/wyborcza_duzy_format.recipe | 133 ++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 recipes/wyborcza_duzy_format.recipe

diff --git a/recipes/wyborcza_duzy_format.recipe b/recipes/wyborcza_duzy_format.recipe
new file mode 100644
index 0000000000..30b0cfe418
--- /dev/null
+++ b/recipes/wyborcza_duzy_format.recipe
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class GazetaWyborczaDuzyForma(BasicNewsRecipe):
+    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
+    title = u"Gazeta Wyborcza Duzy Format"
+    __author__ = 'ravcio - rlelusz[at]gmail.com'
+    description = u"Articles from Gazeta's website"
+    language = 'pl'
+    max_articles_per_feed = 50  # you can increase it, even up to maybe 600; it should still work
+    recursions = 0
+    encoding = 'iso-8859-2'
+    no_stylesheets = True
+    remove_javascript = True
+    use_embedded_content = False
+
+    keep_only_tags = [
+        dict(name='div', attrs={'id': ['k1']})
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'class': ['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
+        ,dict(name='div', attrs={'id': ['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
+        ,dict(name='ul', attrs={'id': ['articleToolbar']})
+        ,dict(name='img', attrs={'class': ['brand']})
+        ,dict(name='h5', attrs={'class': ['author']})
+        ,dict(name='h6', attrs={'class': ['date']})
+        ,dict(name='p', attrs={'class': ['txt_upl']})
+    ]
+
+    remove_tags_after = [
+        dict(name='div', attrs={'id': ['Str']})  # page-number navigation
+    ]
+
+    def load_article_links(self, url, count):
+        print '--- load_article_links', url, count
+
+        # page with links to articles
+        soup = self.index_to_soup(url)
+
+        # container holding the list of articles
+        art_list = soup.find('div', attrs={'class': 'GWdalt'})
+
+        # single articles (link, title, ...)
+        links = art_list.findAll('div', attrs={'class': ['GWdaltE']})
+
+        if len(links) < count:
+            # follow the 'next' link to load more articles
+            pages_nav = art_list.find('div', attrs={'class': 'pages'})
+            next_page = pages_nav.find('a', attrs={'class': 'next'})
+            if next_page:
+                print 'next=', next_page['href']
+                url = 'http://wyborcza.pl' + next_page['href']
+                # e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
+                older_links = self.load_article_links(url, count - len(links))
+                links.extend(older_links)
+
+        return links
+
+    # produce the list of articles to download
+    def parse_index(self):
+        print '--- parse_index'
+
+        max_articles = 8000
+        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
+
+        key = 'Uncategorized'
+        articles = {key: []}
+
+        for div_art in links:
+            div_date = div_art.find('div', attrs={'class': 'kL'})
+            div = div_art.find('div', attrs={'class': 'kR'})
+
+            a = div.find('a', href=True)
+
+            url = a['href']
+            title = a.string
+            pubdate = div_date.string.strip()
+            summary = div.find('span', attrs={'class': 'lead'})
+
+            # drop the embedded link inside the lead, keep only its text
+            desc = summary.find('a', href=True)
+            if desc:
+                desc.extract()
+
+            description = self.tag_to_string(summary, use_alt=False).strip()
+
+            feed = key
+            if feed not in articles:
+                articles[feed] = []
+
+            if description != '':  # skip articles that are only pictures
+                articles[feed].append(
+                    dict(title=title, url=url, date=pubdate,
+                         description=description, content=''))
+
+        ans = [(key, articles[key])]
+        return ans
+
+    def append_page(self, soup, appendtag, position):
+        pager = soup.find('div', attrs={'id': 'Str'})
+        if pager:
+            # look for an 'a' element whose text contains 'nast' ("next"); if none, stop
+            nexturl = None
+            for elem in pager.findAll('a'):
+                if elem.string and 'nast' in elem.string:
+                    nexturl = elem['href']
+
+            if nexturl:
+                soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
+
+                texttag = soup2.find('div', attrs={'id': 'artykul'})
+
+                newpos = len(texttag.contents)
+                self.append_page(soup2, texttag, newpos)
+                texttag.extract()
+                appendtag.insert(position, texttag)
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body, 3)
+
+        # finally, remove the pager tags
+        pager = soup.find('div', attrs={'id': 'Str'})
+        if pager:
+            pager.extract()
+
+        pager = soup.find('div', attrs={'class': 'tylko_int'})
+        if pager:
+            pager.extract()
+
+        return soup
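
Testing note: a recipe file like this can be previewed from the command line with calibre's ebook-convert tool. A minimal sketch, assuming calibre is installed and the recipe is saved as wyborcza_duzy_format.recipe (the output filename here is arbitrary):

    ebook-convert wyborcza_duzy_format.recipe output.epub --test -vv

The --test flag limits the download to a couple of articles per feed, which keeps the edit/debug cycle short while still exercising keep_only_tags, remove_tags, and the pagination logic against the live site.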