calibre/recipes/benchmark_pl.recipe
2011-09-21 10:50:11 -06:00

71 lines
3.5 KiB
Plaintext

from calibre.web.feeds.news import BasicNewsRecipe
import re
class Benchmark_pl(BasicNewsRecipe):
title = u'Benchmark.pl'
__author__ = 'fenuks'
description = u'benchmark.pl -IT site'
cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
category = 'IT'
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets=True
preprocess_regexps = [(re.compile(ur'\bWięcej o .*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
remove_tags_after=dict(name='div', attrs={'class':'body'})
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})]
INDEX= 'http://www.benchmark.pl'
feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
(u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]
def append_page(self, soup, appendtag):
nexturl = soup.find('span', attrs={'class':'next'})
while nexturl is not None:
nexturl= self.INDEX + nexturl.parent['href']
soup2 = self.index_to_soup(nexturl)
nexturl=soup2.find('span', attrs={'class':'next'})
pagetext = soup2.find(name='div', attrs={'class':'body'})
appendtag.find('div', attrs={'class':'k_ster'}).extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if appendtag.find('div', attrs={'class':'k_ster'}) is not None:
appendtag.find('div', attrs={'class':'k_ster'}).extract()
def image_article(self, soup, appendtag):
nexturl=soup.find('div', attrs={'class':'preview'})
if nexturl is not None:
nexturl=nexturl.find('a', attrs={'class':'move_next'})
image=appendtag.find('div', attrs={'class':'preview'}).div['style'][16:]
image=self.INDEX + image[:image.find("')")]
appendtag.find(attrs={'class':'preview'}).name='img'
appendtag.find(attrs={'class':'preview'})['src']=image
appendtag.find('a', attrs={'class':'move_next'}).extract()
while nexturl is not None:
nexturl= self.INDEX + nexturl['href']
soup2 = self.index_to_soup(nexturl)
nexturl=soup2.find('a', attrs={'class':'move_next'})
image=soup2.find('div', attrs={'class':'preview'}).div['style'][16:]
image=self.INDEX + image[:image.find("')")]
soup2.find(attrs={'class':'preview'}).name='img'
soup2.find(attrs={'class':'preview'})['src']=image
pagetext=soup2.find('div', attrs={'class':'gallery'})
pagetext.find('div', attrs={'class':'title'}).extract()
pagetext.find('div', attrs={'class':'thumb'}).extract()
pagetext.find('div', attrs={'class':'panelOcenaObserwowane'}).extract()
if nexturl is not None:
pagetext.find('a', attrs={'class':'move_next'}).extract()
pagetext.find('a', attrs={'class':'move_back'}).extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def preprocess_html(self, soup):
if soup.find('div', attrs={'class':'preview'}) is not None:
self.image_article(soup, soup.body)
else:
self.append_page(soup, soup.body)
return soup