From 94f2927f66e417b4a8a9c0fe4d0cecbfe62df6d4 Mon Sep 17 00:00:00 2001 From: fenuks Date: Mon, 14 Aug 2017 18:42:56 +0200 Subject: [PATCH] Some work on recipes (mostly Polish) --- recipes/adventure_zone_pl.recipe | 25 ++++--- recipes/eclicto.recipe | 48 -------------- recipes/film_org_pl.recipe | 72 +++++--------------- recipes/film_web.recipe | 95 +++++++++++---------------- recipes/gildia_pl.recipe | 16 +++-- recipes/parisreview.recipe | 27 ++++++++ recipes/publicdomainreview_org.recipe | 27 ++++++++ 7 files changed, 135 insertions(+), 175 deletions(-) delete mode 100644 recipes/eclicto.recipe create mode 100644 recipes/parisreview.recipe create mode 100644 recipes/publicdomainreview_org.recipe diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 3f0c71fdcb..40be1482fc 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -1,4 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs class Adventure_zone(BasicNewsRecipe): @@ -19,16 +20,24 @@ class Adventure_zone(BasicNewsRecipe): remove_tags = [dict(attrs={'class': 'footer'})] feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')] + _trigger_words = ('zapowied', 'recenzj', 'solucj', 'poradnik') + + @staticmethod + def _is_linked_text(title): + return 'zapowied' in title or 'recenz' in title or 'solucj' in title or 'poradnik' in title + def skip_ad_pages(self, soup): - skip_tag = soup.body.find(attrs={'class': 'content'}) - skip_tag = skip_tag.findAll(name='a') - title = soup.title.string.lower() - if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): + skip_tag = soup.body.find(attrs={'class':'subject'}) + skip_tag = skip_tag.findAll(name='a', href=True) + title = soup.title.renderContents().lower() + if self._is_linked_text(title): for r in skip_tag: - if r.strong and r.strong.string: - word = r.strong.string.lower() - if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): - return self.index_to_soup(self.BASEURL + r['href'], raw=True) + word = r.renderContents() + if not word: + continue + word = word.lower() + if self._is_linked_text(word): + return self.index_to_soup(self.BASEURL+r['href'], raw=True) def preprocess_html(self, soup): for link in soup.findAll('a', href=True): diff --git a/recipes/eclicto.recipe b/recipes/eclicto.recipe deleted file mode 100644 index 230b1d77fb..0000000000 --- a/recipes/eclicto.recipe +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -''' -blog.eclicto.pl -''' - -from calibre.web.feeds.news import BasicNewsRecipe -import re - - -class BlogeClictoRecipe(BasicNewsRecipe): - __author__ = 'Mori, Tomasz Długosz' - language = 'pl' - - title = u'Blog eClicto' - publisher = u'Blog eClicto' - description = u'Blog o e-papierze i e-bookach' - - max_articles_per_feed = 100 - cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif' - - no_stylesheets = True - remove_javascript = True - encoding = 'utf-8' - - extra_css = ''' - img{float: left; padding-right: 10px; padding-bottom: 5px;} - ''' - - feeds = [ - (u'Blog eClicto', u'http://blog.eclicto.pl/feed/') - ] - - remove_tags = [ - dict(name='div', attrs={'class': 'social_bookmark'}), - ] - - keep_only_tags = [ - dict(name='div', attrs={'class': 'post'}) - ] - - preprocess_regexps = [ - (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in - [ - (r'\s*Przeczytaj także:.*', re.IGNORECASE | re.DOTALL), lambda m: ''), (re.compile(ur'', re.IGNORECASE | re.DOTALL), lambda m: ''), # noqa - (re.compile(ur'
Artykuł
', re.IGNORECASE), lambda m: ''), - (re.compile(ur'
Ludzie filmu
', re.IGNORECASE), lambda m: ''), - (re.compile(ur'(
\s*?){2,}', re.IGNORECASE | re.DOTALL), lambda m: '')] - keep_only_tags = [dict(name=['h11', 'h16', 'h17']), - dict(attrs={'class': 'editor'})] - remove_tags_after = dict(id='comments') - remove_tags = [dict(name=['link', 'meta', 'style']), dict(name='img', attrs={'alt': ['Ludzie filmu', u'Artykuł']}), dict(id='comments'), dict( - attrs={'style': 'border: 0pt none ; margin: 0pt; padding: 0pt;'}), dict(name='p', attrs={'class': 'rating'}), dict(attrs={'layout': 'button_count'})] + use_embedded_content = True + + remove_attributes = ['style', 'width', 'height'] + remove_tags = [dict(attrs={'class': 'shortcode-box right'})] + feeds = [ - (u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), - (u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), - (u'Analiza', u'http://film.org.pl/a/analiza/feed/'), - (u'Ranking', u'http://film.org.pl/a/ranking/feed/'), - (u'Blog', u'http://film.org.pl/kmf/blog/feed/'), - (u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), - (u'Seriale', u'http://film.org.pl/a/seriale/feed/'), - (u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), - (u'VHS', u'http://film.org.pl/a/vhs-a/feed/')] - - def append_page(self, soup, appendtag): - tag = soup.find('div', attrs={'class': 'pagelink'}) - if tag: - for nexturl in tag.findAll('a'): - url = nexturl['href'] - soup2 = self.index_to_soup(url) - pagetext = soup2.find(attrs={'class': 'editor'}) - comments = pagetext.findAll( - text=lambda text: isinstance(text, Comment)) - for comment in comments: - comment.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - for r in appendtag.findAll(attrs={'class': 'pagelink'}): - r.extract() - for r in appendtag.findAll(attrs={'id': 'comments'}): - r.extract() - for r in appendtag.findAll(attrs={'style': 'border: 0pt none ; margin: 0pt; padding: 0pt;'}): - r.extract() - for r in appendtag.findAll(attrs={'layout': 'button_count'}): - r.extract() - - def preprocess_html(self, soup): - for c in soup.findAll('h11'): - c.name = 'h1' - self.append_page(soup, soup.body) - for r in soup.findAll('br'): - r.extract() - return soup + (u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), + (u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), + (u'Analiza', u'http://film.org.pl/a/analiza/feed/'), + (u'Ranking', u'http://film.org.pl/a/ranking/feed/'), + (u'Blog', u'http://film.org.pl/kmf/blog/feed/'), + (u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), + (u'Seriale', u'http://film.org.pl/a/seriale/feed/'), + (u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), + (u'VHS', u'http://film.org.pl/a/vhs-a/feed/'), ] diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index cad9d077a0..b7dee1c688 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -1,15 +1,14 @@ -import re from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup +import re class FilmWebPl(BasicNewsRecipe): - title = u'FilmWeb' - __author__ = 'fenuks' - description = 'Filmweb.pl - Filmy takie jak Ty Filmweb to największy i najczęściej odwiedzany polski serwis filmowy. Największa baza filmów, seriali i aktorów, repertuar kin i tv, ...' # noqa - cover_url = 'http://gfx.filmweb.pl/n/logo-filmweb-bevel.jpg' - category = 'movies' - language = 'pl' + title = 'FilmWeb' + __author__ = 'fenuks' + description = u'Filmweb.pl - Filmy takie jak Ty Filmweb to największy i najczęściej odwiedzany polski serwis filmowy.' + cover_url = 'http://1.fwcdn.pl/an/867323/63321_1.11.jpg' + category = 'movies' + language = 'pl' index = 'http://www.filmweb.pl' oldest_article = 8 max_articles_per_feed = 100 @@ -17,57 +16,37 @@ class FilmWebPl(BasicNewsRecipe): remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} remove_javascript = True - preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), (re.compile( - ur'(
\s*?
\s*?)+', re.IGNORECASE), lambda m: '
')] # (re.compile(ur' | ', re.IGNORECASE), lambda m: '')] - extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' - remove_attributes = ['style', ] - keep_only_tags = [dict(attrs={'class': ['hdr hdr-super', 'newsContent']})] - feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), - (u'News / Festiwale, nagrody i przeglądy', - u'http://www.filmweb.pl/feed/news/category/festival'), - (u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'), - (u'News / Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'), - (u'News / Multimedia', - u'http://www.filmweb.pl/feed/news/category/multimedia'), - (u'News / Dystrybucja dvd / blu-ray', - u'http://www.filmweb.pl/feed/news/category/video'), - (u'News / Dystrybucja kinowa', - u'http://www.filmweb.pl/feed/news/category/cinema'), - (u'News / off', u'http://www.filmweb.pl/feed/news/category/off'), - (u'News / Gry wideo', u'http://www.filmweb.pl/feed/news/category/game'), - (u'News / Organizacje branżowe', - u'http://www.filmweb.pl/feed/news/category/organizations'), - (u'News / Internet', u'http://www.filmweb.pl/feed/news/category/internet'), - (u'News / Różne', u'http://www.filmweb.pl/feed/news/category/other'), - (u'News / Kino polskie', - u'http://www.filmweb.pl/feed/news/category/polish.cinema'), - (u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'), - (u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), - (u'Recenzje użytkowników', - u'http://www.filmweb.pl/feed/user-reviews/latest') - ] - - def skip_ad_pages(self, soup): - skip_tag = soup.find('a', attrs={'class': 'welcomeScreenButton'}) - if skip_tag is not None: - return self.index_to_soup(skip_tag['href'], raw=True) - - def postprocess_html(self, soup, first_fetch): - for r in soup.findAll(attrs={'class': 'singlephoto'}): - r['style'] = 'float:left; margin-right: 10px;' - return soup + use_embedded_content = False + extra_css = ('.hdrBig {font-size:22px;} ul {list-style-type:none;} ' + 'ul.inline > li {display: inline;} ' + 'ul.sep-line > li + li::before {content: " | "} ' + 'ul.inline {padding:0px;} .vertical-align {display: inline-block;}') + preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags... + (re.compile(u'(?:)?\(kliknij\,\ aby powiększyć\)(?:)?', re.IGNORECASE), lambda m: ''), + (re.compile(ur'(
\s*?
\s*?)+', re.IGNORECASE), lambda m: '
') + ] + remove_tags = [dict(attrs={'class':['infoParent', 'likeBar', + 'droptions-box pull-right', 'photoDesc', 'imageLicense', 'play big', 'shadow embed__icon--svg']})] + remove_attributes = ['style',] + keep_only_tags = [dict(attrs={'class': ['newsHdr hdrWithAuthor ', 'reviewHdr', 'newsContent newsPage', 'newsContent']})] + # remove_tags_before = dict(attrs={'class': 'hdr hdr-mega'}) + # remove_tags_after = dict(attrs={'class': 'newsContent'}) + feeds = [(u'Filmy', u'http://www.filmweb.pl/feed/news/category/film'), + (u'Seriale', u'http://www.filmweb.pl/feed/news/category/serial'), + (u'Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'), + (u'Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'), + (u'Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), + (u'Multimedia', u'http://www.filmweb.pl/feed/news/category/multimedia'), + (u'Dystrybucja dvd/blu-ray', u'http://www.filmweb.pl/feed/news/category/dvd'), + (u'Gry wideo', u'http://www.filmweb.pl/feed/news/category/game'), + (u'Różne', u'http://www.filmweb.pl/feed/news/category/other'), + (u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), + (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest') + ] def preprocess_html(self, soup): for a in soup('a'): - if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: # noqa - a['href'] = self.index + a['href'] # noqa - for i in soup.findAll('a', attrs={'class': 'fn'}): - i.insert(len(i), BeautifulSoup('
')) - for i in soup.findAll('sup'): - if not i.string or i.string.startswith('(kliknij'): - i.extract() - for r in soup.findAll(id=re.compile('photo-\d+')): - r.extract() - for r in soup.findAll(style=re.compile('float: ?left')): - r['class'] = 'singlephoto' + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href'] = self.index + a['href'] + return soup diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe index f70767ad9e..2a9c446645 100644 --- a/recipes/gildia_pl.recipe +++ b/recipes/gildia_pl.recipe @@ -8,18 +8,19 @@ class Gildia(BasicNewsRecipe): description = u'Fantastyczny Portal Kulturalny - newsy, recenzje, galerie, wywiady. Literatura, film, gry komputerowe i planszowe, komiks, RPG, sklep. Nie lekceważ potęgi wyobraźni!' # noqa cover_url = 'http://www.film.gildia.pl/_n_/portal/redakcja/logo/logo-gildia.pl-500.jpg' category = 'culture' - cover_url = 'http://gildia.pl/images/logo-main.png' + cover_url = 'http://portal.gildia.pl/images/logo-main.png' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 remove_empty_feeds = True no_stylesheets = True + use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} preprocess_regexps = [(re.compile(ur''), lambda match: '')] ignore_duplicate_articles = {'title', 'url'} remove_tags = [dict(name='div', attrs={'class': [ 'backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})] - keep_only_tags = [dict(name='div', attrs={'class': 'widetext'})] + keep_only_tags = [dict(name='div', attrs={'class': 'widetext'}), dict(name='article', attrs={'id': re.compile(r'post-\d+')})] feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'), (u'Literatura', u'http://www.literatura.gildia.pl/rss'), (u'Film', u'http://www.film.gildia.pl/rss'), @@ -40,10 +41,14 @@ class Gildia(BasicNewsRecipe): def skip_ad_pages(self, soup): content = soup.find('div', attrs={'class': 'news'}) + if content is None: + return + words = ('recenzj', 'zapowied', 'fragmen', 'relacj', 'wywiad', 'nominacj') + document_title = soup.title.renderContents().lower() for word in words: - if word in soup.title.string.lower(): + if word in document_title: for link in content.findAll(name='a'): if word in link['href'] or (link.string and word in link.string): return self.index_to_soup(link['href'], raw=True) @@ -52,13 +57,14 @@ class Gildia(BasicNewsRecipe): return self.index_to_soup(tag['href'], raw=True) def preprocess_html(self, soup): + title = soup.title.renderContents().lower() for a in soup('a'): if a.has_key('href') and not a['href'].startswith('http'): # noqa if '/gry/' in a['href']: a['href'] = 'http://www.gry.gildia.pl' + a['href'] - elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower(): + elif u'książk' in title or u'komiks' in title: a['href'] = 'http://www.literatura.gildia.pl' + a['href'] - elif u'komiks' in soup.title.string.lower(): + elif u'komiks' in title: a['href'] = 'http://www.literatura.gildia.pl' + a['href'] else: a['href'] = 'http://www.gildia.pl' + a['href'] diff --git a/recipes/parisreview.recipe b/recipes/parisreview.recipe new file mode 100644 index 0000000000..8a370c4bbb --- /dev/null +++ b/recipes/parisreview.recipe @@ -0,0 +1,27 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + + +class ParisReview(BasicNewsRecipe): + title = 'The Paris Review Blog' + __author__ = 'fenuks' + description = u'The Paris Review is a literary magazine featuring original writing, art, and in-depth interviews with famous writers.' + # cover_url = '' + category = 'culture' + language = 'en' + encoding = 'utf-8' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + ignore_duplicate_articles = {'title', 'url'} + remove_javascript = True + use_embedded_content = True + # extra_css = '' + # preprocess_regexps = [] + # remove_attributes = ['style',] + # keep_only_tags = [] + remove_tags = [] + # remove_tags_before = dict() + remove_tags_after = dict() + feeds = [('Posts', 'http://feeds.feedburner.com/TheParisReviewBlog')] diff --git a/recipes/publicdomainreview_org.recipe b/recipes/publicdomainreview_org.recipe new file mode 100644 index 0000000000..316e69ab66 --- /dev/null +++ b/recipes/publicdomainreview_org.recipe @@ -0,0 +1,27 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + + +class PublicDomainReview(BasicNewsRecipe): + title = 'The Public Domain Review' + __author__ = 'fenuks' + description = u'Online journal dedicated to showcasing the most interesting and unusual out-of-copyright works available on the web' + cover_url = 'http://publicdomainreview.org/wp-content/themes/pdr/assets/img/pdr-logo.gif' + category = 'culture' + language = 'en' + encoding = 'utf-8' + oldest_article = 14 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + ignore_duplicate_articles = {'title', 'url'} + remove_javascript = True + use_embedded_content = False + # extra_css = '' + # preprocess_regexps = [] + # remove_attributes = ['style',] + keep_only_tags = [dict(name='article', attrs={'class': re.compile(r'post-\d+')})] + remove_tags = [dict(attrs={'class': 'synved-social-container synved-social-container-share'})] + # remove_tags_before = dict() + remove_tags_after = dict(name='div', attrs={'class': 'entry-content'}) + feeds = [('Posts', 'http://publicdomainreview.org/feed/')]