Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-07 10:14:46 -04:00)
remove DF as it is paywalled
This commit is contained in:
parent 673f8aa9ad
commit 59e81ea077
@@ -1,144 +0,0 @@
#!/usr/bin/env python

from calibre.web.feeds.recipes import BasicNewsRecipe

class GazetaWyborczaDuzyForma(BasicNewsRecipe):
    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
    title = u"Gazeta Wyborcza Duzy Format"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from Gazeta's website"
    language = 'pl'
    max_articles_per_feed = 50  # you can increase it even up to maybe 600, should still work
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

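    # keep only the main article container (div id 'k1'); calibre drops
    # everything outside keep_only_tags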
    keep_only_tags = [
        dict(name='div', attrs={'id': ['k1']})
    ]

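    # page furniture stripped from the kept container: photo and related-content
    # boxes, toolbars, branding, byline and date headers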
    remove_tags = [
        dict(name='div', attrs={'class': ['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']}),
        dict(name='div', attrs={'id': ['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']}),
        dict(name='ul', attrs={'id': ['articleToolbar']}),
        dict(name='img', attrs={'class': ['brand']}),
        dict(name='h5', attrs={'class': ['author']}),
        dict(name='h6', attrs={'class': ['date']}),
        dict(name='p', attrs={'class': ['txt_upl']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id': ['Str']})  # page-number navigator
    ]

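    # Collect article-entry divs (class 'GWdaltE') from the section index at
    # `url`, following the pager's 'next' link recursively until roughly
    # `count` entries have been gathered.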
    def load_article_links(self, url, count):
        print '--- load_article_links', url, count

        # page with links to articles
        soup = self.index_to_soup(url)

        # table with articles
        list = soup.find('div', attrs={'class': 'GWdalt'})

        # single articles (link, title, ...)
        links = list.findAll('div', attrs={'class': ['GWdaltE']})

        if len(links) < count:
            # load links to more articles...

            # find the link to the next page of the index
            pages_nav = list.find('div', attrs={'class': 'pages'})
            next = pages_nav.find('a', attrs={'class': 'next'})
            if next:
                print 'next=', next['href']
                url = 'http://wyborcza.pl' + next['href']
                # e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'

                older_links = self.load_article_links(url, count - len(links))
                links.extend(older_links)

        return links

    # produce list of articles to download
    def parse_index(self):
        print '--- parse_index'

        max_articles = 8000
        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)

        ans = []
        key = None
        articles = {}

        key = 'Uncategorized'
        articles[key] = []

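        # each entry holds its date in div.kL and its link, title and lead text in div.kR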
        for div_art in links:
            div_date = div_art.find('div', attrs={'class': 'kL'})
            div = div_art.find('div', attrs={'class': 'kR'})

            a = div.find('a', href=True)

            url = a['href']
            title = a.string
            description = ''
            pubdate = div_date.string.rstrip().lstrip()
            summary = div.find('span', attrs={'class': 'lead'})

            desc = summary.find('a', href=True)
            if desc:
                desc.extract()

            description = self.tag_to_string(summary, use_alt=False)
            description = description.rstrip().lstrip()

            feed = key if key is not None else 'Duzy Format'

            if not articles.has_key(feed):
                articles[feed] = []

            if description != '':  # skip image-only articles
                articles[feed].append(
                    dict(title=title, url=url, date=pubdate,
                         description=description,
                         content=''))

        ans = [(key, articles[key])]
        return ans

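    # Multi-page articles: follow the 'nast' (next) links in the div#Str pager
    # and splice each following page's div#artykul content into the first page.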
    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            # look for an 'a' element whose text contains 'nast' (next); if none is found, do nothing
            list = pager.findAll('a')

            for elem in list:
                if 'nast' in elem.string:
                    nexturl = elem['href']

                    soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)

                    texttag = soup2.find('div', attrs={'id': 'artykul'})

                    newpos = len(texttag.contents)
                    self.append_page(soup2, texttag, newpos)
                    texttag.extract()
                    appendtag.insert(position, texttag)

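    # Assemble multi-page articles, then drop the pager and the 'tylko_int'
    # block from the final markup.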
    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)

        # finally remove some tags
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            pager.extract()

        pager = soup.find('div', attrs={'class': 'tylko_int'})
        if pager:
            pager.extract()

        return soup