diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index 00eea1be68..9544abdfcf 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -1,6 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe import re -class Benchmark_pl(BasicNewsRecipe): +class BenchmarkPl(BasicNewsRecipe): title = u'Benchmark.pl' __author__ = 'fenuks' description = u'benchmark.pl -IT site' @@ -14,7 +14,7 @@ class Benchmark_pl(BasicNewsRecipe): preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] + remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] INDEX= 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 2a6e00d501..ba34c9ff63 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -1,6 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe - -class Filmweb_pl(BasicNewsRecipe): +import re +from calibre.ebooks.BeautifulSoup import BeautifulSoup +class FilmWebPl(BasicNewsRecipe): title = u'FilmWeb' __author__ = 'fenuks' description = 'FilmWeb - biggest polish movie site' @@ -12,8 +13,9 @@ class Filmweb_pl(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets= True remove_empty_feeds=True + preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' - remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] + remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), (u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), @@ -31,18 +33,22 @@ class Filmweb_pl(BasicNewsRecipe): (u'News / Kino polskie', u'http://www.filmweb.pl/feed/news/category/polish.cinema'), (u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'), (u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), - (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')] + (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest') + ] - def skip_ad_pages(self, soup): + def skip_ad_pages(self, soup): skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'}) if skip_tag is not None: - self.log.warn('skip_tag') - self.log.warn(skip_tag) return self.index_to_soup(skip_tag['href'], raw=True) - + def preprocess_html(self, soup): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + for i in soup.findAll('a', attrs={'class':'fn'}): + i.insert(len(i), BeautifulSoup('
')) + for i in soup.findAll('sup'): + if not i.string or i.string.startswith('(kliknij'): + i.extract() + return soup diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index e188e4988c..fce9674081 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -1,6 +1,6 @@ from calibre.web.feeds.recipes import BasicNewsRecipe -class Gry_online_pl(BasicNewsRecipe): +class GryOnlinePl(BasicNewsRecipe): title = u'Gry-Online.pl' __author__ = 'fenuks' description = 'Gry-Online.pl - computer games' @@ -21,17 +21,18 @@ class Gry_online_pl(BasicNewsRecipe): tag = appendtag.find('div', attrs={'class':'n5p'}) if tag: nexturls=tag.findAll('a') - for nexturl in nexturls[1:]: - try: - soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href']) - except: - soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href']) + url_part = soup.find('link', attrs={'rel':'canonical'})['href'] + url_part = url_part[25:].rpartition('?')[0] + for nexturl in nexturls[1:-1]: + soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href']) pagetext = soup2.find(attrs={'class':'gc660'}) for r in pagetext.findAll(name='header'): r.extract() + for r in pagetext.findAll(attrs={'itemprop':'description'}): + r.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}): + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}): r.extract() diff --git a/recipes/natemat_pl.recipe b/recipes/natemat_pl.recipe index faa1b341a0..d6db93dad7 100644 --- a/recipes/natemat_pl.recipe +++ b/recipes/natemat_pl.recipe @@ -1,3 +1,4 @@ +import re from calibre.web.feeds.news import BasicNewsRecipe class NaTemat(BasicNewsRecipe): @@ -8,8 +9,9 @@ class NaTemat(BasicNewsRecipe): description = u'informacje, komentarze, opinie' category = 'news' language = 'pl' + preprocess_regexps = [(re.compile(ur'Czytaj też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Zobacz też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj więcej\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj również\:.*?', re.IGNORECASE), lambda m: '')] cover_url= 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png' no_stylesheets = True keep_only_tags= [dict(id='main')] - remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related']})] + remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related', 'user-header', 'links']}), dict(name='img', attrs={'class':'indent'})] feeds = [(u'Artyku\u0142y', u'http://natemat.pl/rss/wszystkie')] diff --git a/recipes/wnp.recipe b/recipes/wnp.recipe index ee87112437..ec0d012733 100644 --- a/recipes/wnp.recipe +++ b/recipes/wnp.recipe @@ -1,7 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe import re -class AdvancedUserRecipe1312886443(BasicNewsRecipe): +class WNP(BasicNewsRecipe): title = u'WNP' cover_url= 'http://k.wnp.pl/images/wnpLogo.gif' __author__ = 'fenuks' @@ -12,7 +12,7 @@ class AdvancedUserRecipe1312886443(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - remove_tags=[dict(attrs={'class':'printF'})] + remove_tags=[dict(attrs={'class':['printF', 'border3B2 clearfix', 'articleMenu clearfix']})] feeds = [(u'Wiadomości gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'), (u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'), (u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'),