mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Gazeta Wyborcza by ravcio. Fixes #919546 (new recipe (GW Duzy Format - lang: polish ))
This commit is contained in:
parent 0d807994b7
commit a487b6ca00
144
recipes/wyborcza_duzy_format.recipe
Normal file
@@ -0,0 +1,144 @@
#!/usr/bin/env python

from calibre.web.feeds.recipes import BasicNewsRecipe


class GazetaWyborczaDuzyForma(BasicNewsRecipe):

    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
    title = u"Gazeta Wyborcza Duzy Format"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from Gazeta's website"
    language = 'pl'
    max_articles_per_feed = 50  # you can increase this even up to about 600; it should still work
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'id': ['k1']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
        ,dict(name='div', attrs={'id': ['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
        ,dict(name='ul', attrs={'id': ['articleToolbar']})
        ,dict(name='img', attrs={'class': ['brand']})
        ,dict(name='h5', attrs={'class': ['author']})
        ,dict(name='h6', attrs={'class': ['date']})
        ,dict(name='p', attrs={'class': ['txt_upl']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id': ['Str']})  # page-number navigator
    ]

    def load_article_links(self, url, count):
        print '--- load_article_links', url, count

        # index page with links to articles
        soup = self.index_to_soup(url)

        # container holding the article list
        main_div = soup.find('div', attrs={'class': 'GWdalt'})

        # individual article entries (link, title, ...)
        links = main_div.findAll('div', attrs={'class': ['GWdaltE']})

        if len(links) < count:
            # not enough articles yet: follow the 'next' page link and load more
            pages_nav = main_div.find('div', attrs={'class': 'pages'})
            next_link = pages_nav.find('a', attrs={'class': 'next'})
            if next_link:
                print 'next=', next_link['href']
                url = 'http://wyborcza.pl' + next_link['href']
                # e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'

                older_links = self.load_article_links(url, count - len(links))
                links.extend(older_links)

        return links

    # produce the list of articles to download;
    # parse_index() must return a list of (feed title, list of article dicts) tuples
    def parse_index(self):
        print '--- parse_index'

        max_articles = 8000
        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)

        ans = []
        key = 'Uncategorized'
        articles = {}
        articles[key] = []

        for div_art in links:
            div_date = div_art.find('div', attrs={'class': 'kL'})
            div = div_art.find('div', attrs={'class': 'kR'})

            a = div.find('a', href=True)

            url = a['href']
            title = a.string
            description = ''
            pubdate = div_date.string.strip()
            summary = div.find('span', attrs={'class': 'lead'})

            desc = summary.find('a', href=True)
            if desc:
                desc.extract()

            description = self.tag_to_string(summary, use_alt=False)
            description = description.strip()

            feed = key if key is not None else 'Duzy Format'

            if feed not in articles:
                articles[feed] = []

            if description != '':  # skip picture-only articles
                articles[feed].append(
                    dict(title=title, url=url, date=pubdate,
                         description=description,
                         content=''))

        ans = [(key, articles[key])]
        return ans

    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            # look for an 'a' element whose text contains 'nast' (next page); stop if none is found
            nav_links = pager.findAll('a')

            for elem in nav_links:
                if elem.string and 'nast' in elem.string:
                    nexturl = elem['href']

                    soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)

                    texttag = soup2.find('div', attrs={'id': 'artykul'})

                    # recurse into the following page, then graft its text into the current article
                    newpos = len(texttag.contents)
                    self.append_page(soup2, texttag, newpos)
                    texttag.extract()
                    appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)

        # finally remove leftover navigation tags
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            pager.extract()

        pager = soup.find('div', attrs={'class': 'tylko_int'})
        if pager:
            pager.extract()

        return soup
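A quick way to try a recipe like this locally is calibre's ebook-convert tool, which can build an e-book straight from a .recipe file; the --test switch limits the download to a couple of articles per feed and -vv raises verbosity. This is a sketch only, assuming the file above is saved as wyborcza_duzy_format.recipe in the current directory:

    ebook-convert wyborcza_duzy_format.recipe output.epub --test -vv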