From 59e81ea0777761ec2a81d3f74f9a7915d12cf9bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?=
Date: Thu, 4 Apr 2013 21:13:33 +0200
Subject: [PATCH] remove DF as it is paywalled

---
 recipes/wyborcza_duzy_format.recipe | 144 ----------------------------
 1 file changed, 144 deletions(-)
 delete mode 100644 recipes/wyborcza_duzy_format.recipe

diff --git a/recipes/wyborcza_duzy_format.recipe b/recipes/wyborcza_duzy_format.recipe
deleted file mode 100644
index 30b0cfe418..0000000000
--- a/recipes/wyborcza_duzy_format.recipe
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python
-
-from calibre.web.feeds.recipes import BasicNewsRecipe
-
-class GazetaWyborczaDuzyForma(BasicNewsRecipe):
-    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
-    title = u"Gazeta Wyborcza Duzy Format"
-    __author__ = 'ravcio - rlelusz[at]gmail.com'
-    description = u"Articles from Gazeta's website"
-    language = 'pl'
-    max_articles_per_feed = 50  # you can increase it even up to maybe 600, should still work
-    recursions = 0
-    encoding = 'iso-8859-2'
-    no_stylesheets = True
-    remove_javascript = True
-    use_embedded_content = False
-
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id':['k1']})
-    ]
-
-    remove_tags = [
-        dict(name='div', attrs={'class':['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
-        ,dict(name='div', attrs={'id':['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
-        ,dict(name='ul', attrs={'id':['articleToolbar']})
-        ,dict(name='img', attrs={'class':['brand']})
-        ,dict(name='h5', attrs={'class':['author']})
-        ,dict(name='h6', attrs={'class':['date']})
-        ,dict(name='p', attrs={'class':['txt_upl']})
-    ]
-
-    remove_tags_after = [
-        dict(name='div', attrs={'id':['Str']})  # page-number navigator
-    ]
-
-    def load_article_links(self, url, count):
-        print '--- load_article_links', url, count
-
-        # page with links to articles
-        soup = self.index_to_soup(url)
-
-        # table with articles
-        list = soup.find('div', attrs={'class':'GWdalt'})
-
-        # single articles (link, title, ...)
-        links = list.findAll('div', attrs={'class':['GWdaltE']})
-
-        if len(links) < count:
-            # load links to more articles...
-
-            # find the link to the next page
-            pages_nav = list.find('div', attrs={'class':'pages'})
-            next = pages_nav.find('a', attrs={'class':'next'})
-            if next:
-                print 'next=', next['href']
-                url = 'http://wyborcza.pl' + next['href']
-                # e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
-
-                older_links = self.load_article_links(url, count - len(links))
-                links.extend(older_links)
-
-        return links
-
-
-    # produce list of articles to download
-    def parse_index(self):
-        print '--- parse_index'
-
-        max_articles = 8000
-        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
-
-        ans = []
-        key = None
-        articles = {}
-
-        key = 'Uncategorized'
-        articles[key] = []
-
-        for div_art in links:
-            div_date = div_art.find('div', attrs={'class':'kL'})
-            div = div_art.find('div', attrs={'class':'kR'})
-
-            a = div.find('a', href=True)
-
-            url = a['href']
-            title = a.string
-            description = ''
-            pubdate = div_date.string.rstrip().lstrip()
-            summary = div.find('span', attrs={'class':'lead'})
-
-            desc = summary.find('a', href=True)
-            if desc:
-                desc.extract()
-
-            description = self.tag_to_string(summary, use_alt=False)
-            description = description.rstrip().lstrip()
-
-            feed = key if key is not None else 'Duzy Format'
-
-            if not articles.has_key(feed):
-                articles[feed] = []
-
-            if description != '':  # skip picture-only articles
-                articles[feed].append(
-                    dict(title=title, url=url, date=pubdate,
-                         description=description,
-                         content=''))
-
-        ans = [(key, articles[key])]
-        return ans
-
-    def append_page(self, soup, appendtag, position):
-        pager = soup.find('div',attrs={'id':'Str'})
-        if pager:
-            # look for an 'a' element containing 'nast' (next); exit if not found
-            list = pager.findAll('a')
-
-            for elem in list:
-                if 'nast' in elem.string:
-                    nexturl = elem['href']
-
-                    soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
-
-                    texttag = soup2.find('div', attrs={'id':'artykul'})
-
-                    newpos = len(texttag.contents)
-                    self.append_page(soup2,texttag,newpos)
-                    texttag.extract()
-                    appendtag.insert(position,texttag)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body, 3)
-
-        # finally remove some tags
-        pager = soup.find('div',attrs={'id':'Str'})
-        if pager:
-            pager.extract()
-
-        pager = soup.find('div',attrs={'class':'tylko_int'})
-        if pager:
-            pager.extract()
-
-        return soup