mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Some work on recipes (mostly Polish)
This commit is contained in:
parent
cf824e5670
commit
94f2927f66
@ -1,4 +1,5 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs
|
||||||
|
|
||||||
|
|
||||||
class Adventure_zone(BasicNewsRecipe):
|
class Adventure_zone(BasicNewsRecipe):
|
||||||
@ -19,16 +20,24 @@ class Adventure_zone(BasicNewsRecipe):
|
|||||||
remove_tags = [dict(attrs={'class': 'footer'})]
|
remove_tags = [dict(attrs={'class': 'footer'})]
|
||||||
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')]
|
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')]
|
||||||
|
|
||||||
|
_trigger_words = ('zapowied', 'recenzj', 'solucj', 'poradnik')
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _is_linked_text(title):
|
||||||
|
return 'zapowied' in title or 'recenz' in title or 'solucj' in title or 'poradnik' in title
|
||||||
|
|
||||||
def skip_ad_pages(self, soup):
|
def skip_ad_pages(self, soup):
|
||||||
skip_tag = soup.body.find(attrs={'class': 'content'})
|
skip_tag = soup.body.find(attrs={'class':'subject'})
|
||||||
skip_tag = skip_tag.findAll(name='a')
|
skip_tag = skip_tag.findAll(name='a', href=True)
|
||||||
title = soup.title.string.lower()
|
title = soup.title.renderContents().lower()
|
||||||
if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)):
|
if self._is_linked_text(title):
|
||||||
for r in skip_tag:
|
for r in skip_tag:
|
||||||
if r.strong and r.strong.string:
|
word = r.renderContents()
|
||||||
word = r.strong.string.lower()
|
if not word:
|
||||||
if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
|
continue
|
||||||
return self.index_to_soup(self.BASEURL + r['href'], raw=True)
|
word = word.lower()
|
||||||
|
if self._is_linked_text(word):
|
||||||
|
return self.index_to_soup(self.BASEURL+r['href'], raw=True)
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for link in soup.findAll('a', href=True):
|
for link in soup.findAll('a', href=True):
|
||||||
|
@ -1,48 +0,0 @@
|
|||||||
#!/usr/bin/env python2
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
'''
|
|
||||||
blog.eclicto.pl
|
|
||||||
'''
|
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
class BlogeClictoRecipe(BasicNewsRecipe):
|
|
||||||
__author__ = 'Mori, Tomasz Długosz'
|
|
||||||
language = 'pl'
|
|
||||||
|
|
||||||
title = u'Blog eClicto'
|
|
||||||
publisher = u'Blog eClicto'
|
|
||||||
description = u'Blog o e-papierze i e-bookach'
|
|
||||||
|
|
||||||
max_articles_per_feed = 100
|
|
||||||
cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif'
|
|
||||||
|
|
||||||
no_stylesheets = True
|
|
||||||
remove_javascript = True
|
|
||||||
encoding = 'utf-8'
|
|
||||||
|
|
||||||
extra_css = '''
|
|
||||||
img{float: left; padding-right: 10px; padding-bottom: 5px;}
|
|
||||||
'''
|
|
||||||
|
|
||||||
feeds = [
|
|
||||||
(u'Blog eClicto', u'http://blog.eclicto.pl/feed/')
|
|
||||||
]
|
|
||||||
|
|
||||||
remove_tags = [
|
|
||||||
dict(name='div', attrs={'class': 'social_bookmark'}),
|
|
||||||
]
|
|
||||||
|
|
||||||
keep_only_tags = [
|
|
||||||
dict(name='div', attrs={'class': 'post'})
|
|
||||||
]
|
|
||||||
|
|
||||||
preprocess_regexps = [
|
|
||||||
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
|
||||||
[
|
|
||||||
(r'\s*</', lambda match: '</'),
|
|
||||||
]
|
|
||||||
]
|
|
@ -1,72 +1,32 @@
|
|||||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import Comment
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
class FilmOrgPl(BasicNewsRecipe):
|
class FilmOrgPl(BasicNewsRecipe):
|
||||||
title = u'Film.org.pl'
|
title = u'Film.org.pl'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce." # noqa
|
description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce."
|
||||||
category = 'film'
|
category = 'film'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
extra_css = '.alignright {float:right; margin-left:5px;} .alignleft {float:left; margin-right:5px;} .recenzja-title {font-size: 150%; margin-top: 5px; margin-bottom: 5px;}' # noqa
|
cover_url = 'http://film.org.pl/wp-content/uploads/2015/02/film.org.pl_film.org_.pl_kmfviolet4.png'
|
||||||
cover_url = 'http://film.org.pl/wp-content/themes/KMF/images/logo_kmf10.png'
|
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
use_embedded_content = False
|
use_embedded_content = True
|
||||||
remove_attributes = ['style']
|
|
||||||
preprocess_regexps = [
|
remove_attributes = ['style', 'width', 'height']
|
||||||
(re.compile(ur'<h3>Przeczytaj także:</h3>.*', re.IGNORECASE | re.DOTALL), lambda m: '</body>'), (re.compile(ur'</?center>', re.IGNORECASE | re.DOTALL), lambda m: ''), # noqa
|
remove_tags = [dict(attrs={'class': 'shortcode-box right'})]
|
||||||
(re.compile(ur'<div>Artykuł</div>', re.IGNORECASE), lambda m: ''),
|
|
||||||
(re.compile(ur'<div>Ludzie filmu</div>', re.IGNORECASE), lambda m: ''),
|
|
||||||
(re.compile(ur'(<br ?/?>\s*?){2,}', re.IGNORECASE | re.DOTALL), lambda m: '')]
|
|
||||||
keep_only_tags = [dict(name=['h11', 'h16', 'h17']),
|
|
||||||
dict(attrs={'class': 'editor'})]
|
|
||||||
remove_tags_after = dict(id='comments')
|
|
||||||
remove_tags = [dict(name=['link', 'meta', 'style']), dict(name='img', attrs={'alt': ['Ludzie filmu', u'Artykuł']}), dict(id='comments'), dict(
|
|
||||||
attrs={'style': 'border: 0pt none ; margin: 0pt; padding: 0pt;'}), dict(name='p', attrs={'class': 'rating'}), dict(attrs={'layout': 'button_count'})]
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'),
|
(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'),
|
||||||
(u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'),
|
(u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'),
|
||||||
(u'Analiza', u'http://film.org.pl/a/analiza/feed/'),
|
(u'Analiza', u'http://film.org.pl/a/analiza/feed/'),
|
||||||
(u'Ranking', u'http://film.org.pl/a/ranking/feed/'),
|
(u'Ranking', u'http://film.org.pl/a/ranking/feed/'),
|
||||||
(u'Blog', u'http://film.org.pl/kmf/blog/feed/'),
|
(u'Blog', u'http://film.org.pl/kmf/blog/feed/'),
|
||||||
(u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'),
|
(u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'),
|
||||||
(u'Seriale', u'http://film.org.pl/a/seriale/feed/'),
|
(u'Seriale', u'http://film.org.pl/a/seriale/feed/'),
|
||||||
(u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'),
|
(u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'),
|
||||||
(u'VHS', u'http://film.org.pl/a/vhs-a/feed/')]
|
(u'VHS', u'http://film.org.pl/a/vhs-a/feed/'), ]
|
||||||
|
|
||||||
def append_page(self, soup, appendtag):
|
|
||||||
tag = soup.find('div', attrs={'class': 'pagelink'})
|
|
||||||
if tag:
|
|
||||||
for nexturl in tag.findAll('a'):
|
|
||||||
url = nexturl['href']
|
|
||||||
soup2 = self.index_to_soup(url)
|
|
||||||
pagetext = soup2.find(attrs={'class': 'editor'})
|
|
||||||
comments = pagetext.findAll(
|
|
||||||
text=lambda text: isinstance(text, Comment))
|
|
||||||
for comment in comments:
|
|
||||||
comment.extract()
|
|
||||||
pos = len(appendtag.contents)
|
|
||||||
appendtag.insert(pos, pagetext)
|
|
||||||
for r in appendtag.findAll(attrs={'class': 'pagelink'}):
|
|
||||||
r.extract()
|
|
||||||
for r in appendtag.findAll(attrs={'id': 'comments'}):
|
|
||||||
r.extract()
|
|
||||||
for r in appendtag.findAll(attrs={'style': 'border: 0pt none ; margin: 0pt; padding: 0pt;'}):
|
|
||||||
r.extract()
|
|
||||||
for r in appendtag.findAll(attrs={'layout': 'button_count'}):
|
|
||||||
r.extract()
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
for c in soup.findAll('h11'):
|
|
||||||
c.name = 'h1'
|
|
||||||
self.append_page(soup, soup.body)
|
|
||||||
for r in soup.findAll('br'):
|
|
||||||
r.extract()
|
|
||||||
return soup
|
|
||||||
|
@ -1,15 +1,14 @@
|
|||||||
import re
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
import re
|
||||||
|
|
||||||
|
|
||||||
class FilmWebPl(BasicNewsRecipe):
|
class FilmWebPl(BasicNewsRecipe):
|
||||||
title = u'FilmWeb'
|
title = 'FilmWeb'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = 'Filmweb.pl - Filmy takie jak Ty Filmweb to największy i najczęściej odwiedzany polski serwis filmowy. Największa baza filmów, seriali i aktorów, repertuar kin i tv, ...' # noqa
|
description = u'Filmweb.pl - Filmy takie jak Ty Filmweb to największy i najczęściej odwiedzany polski serwis filmowy.'
|
||||||
cover_url = 'http://gfx.filmweb.pl/n/logo-filmweb-bevel.jpg'
|
cover_url = 'http://1.fwcdn.pl/an/867323/63321_1.11.jpg'
|
||||||
category = 'movies'
|
category = 'movies'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
index = 'http://www.filmweb.pl'
|
index = 'http://www.filmweb.pl'
|
||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
@ -17,57 +16,37 @@ class FilmWebPl(BasicNewsRecipe):
|
|||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), (re.compile(
|
use_embedded_content = False
|
||||||
ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />')] # (re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
|
extra_css = ('.hdrBig {font-size:22px;} ul {list-style-type:none;} '
|
||||||
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
|
'ul.inline > li {display: inline;} '
|
||||||
remove_attributes = ['style', ]
|
'ul.sep-line > li + li::before {content: " | "} '
|
||||||
keep_only_tags = [dict(attrs={'class': ['hdr hdr-super', 'newsContent']})]
|
'ul.inline {padding:0px;} .vertical-align {display: inline-block;}')
|
||||||
feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
|
preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags...
|
||||||
(u'News / Festiwale, nagrody i przeglądy',
|
(re.compile(u'(?:<sup>)?\(kliknij\,\ aby powiększyć\)(?:</sup>)?', re.IGNORECASE), lambda m: ''),
|
||||||
u'http://www.filmweb.pl/feed/news/category/festival'),
|
(re.compile(ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />')
|
||||||
(u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'),
|
]
|
||||||
(u'News / Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'),
|
remove_tags = [dict(attrs={'class':['infoParent', 'likeBar',
|
||||||
(u'News / Multimedia',
|
'droptions-box pull-right', 'photoDesc', 'imageLicense', 'play big', 'shadow embed__icon--svg']})]
|
||||||
u'http://www.filmweb.pl/feed/news/category/multimedia'),
|
remove_attributes = ['style',]
|
||||||
(u'News / Dystrybucja dvd / blu-ray',
|
keep_only_tags = [dict(attrs={'class': ['newsHdr hdrWithAuthor ', 'reviewHdr', 'newsContent newsPage', 'newsContent']})]
|
||||||
u'http://www.filmweb.pl/feed/news/category/video'),
|
# remove_tags_before = dict(attrs={'class': 'hdr hdr-mega'})
|
||||||
(u'News / Dystrybucja kinowa',
|
# remove_tags_after = dict(attrs={'class': 'newsContent'})
|
||||||
u'http://www.filmweb.pl/feed/news/category/cinema'),
|
feeds = [(u'Filmy', u'http://www.filmweb.pl/feed/news/category/film'),
|
||||||
(u'News / off', u'http://www.filmweb.pl/feed/news/category/off'),
|
(u'Seriale', u'http://www.filmweb.pl/feed/news/category/serial'),
|
||||||
(u'News / Gry wideo', u'http://www.filmweb.pl/feed/news/category/game'),
|
(u'Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'),
|
||||||
(u'News / Organizacje branżowe',
|
(u'Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'),
|
||||||
u'http://www.filmweb.pl/feed/news/category/organizations'),
|
(u'Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
|
||||||
(u'News / Internet', u'http://www.filmweb.pl/feed/news/category/internet'),
|
(u'Multimedia', u'http://www.filmweb.pl/feed/news/category/multimedia'),
|
||||||
(u'News / Różne', u'http://www.filmweb.pl/feed/news/category/other'),
|
(u'Dystrybucja dvd/blu-ray', u'http://www.filmweb.pl/feed/news/category/dvd'),
|
||||||
(u'News / Kino polskie',
|
(u'Gry wideo', u'http://www.filmweb.pl/feed/news/category/game'),
|
||||||
u'http://www.filmweb.pl/feed/news/category/polish.cinema'),
|
(u'Różne', u'http://www.filmweb.pl/feed/news/category/other'),
|
||||||
(u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'),
|
(u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'),
|
||||||
(u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'),
|
(u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')
|
||||||
(u'Recenzje użytkowników',
|
]
|
||||||
u'http://www.filmweb.pl/feed/user-reviews/latest')
|
|
||||||
]
|
|
||||||
|
|
||||||
def skip_ad_pages(self, soup):
|
|
||||||
skip_tag = soup.find('a', attrs={'class': 'welcomeScreenButton'})
|
|
||||||
if skip_tag is not None:
|
|
||||||
return self.index_to_soup(skip_tag['href'], raw=True)
|
|
||||||
|
|
||||||
def postprocess_html(self, soup, first_fetch):
|
|
||||||
for r in soup.findAll(attrs={'class': 'singlephoto'}):
|
|
||||||
r['style'] = 'float:left; margin-right: 10px;'
|
|
||||||
return soup
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for a in soup('a'):
|
for a in soup('a'):
|
||||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: # noqa
|
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||||
a['href'] = self.index + a['href'] # noqa
|
a['href'] = self.index + a['href']
|
||||||
for i in soup.findAll('a', attrs={'class': 'fn'}):
|
|
||||||
i.insert(len(i), BeautifulSoup('<br />'))
|
|
||||||
for i in soup.findAll('sup'):
|
|
||||||
if not i.string or i.string.startswith('(kliknij'):
|
|
||||||
i.extract()
|
|
||||||
for r in soup.findAll(id=re.compile('photo-\d+')):
|
|
||||||
r.extract()
|
|
||||||
for r in soup.findAll(style=re.compile('float: ?left')):
|
|
||||||
r['class'] = 'singlephoto'
|
|
||||||
return soup
|
return soup
|
||||||
|
@ -8,18 +8,19 @@ class Gildia(BasicNewsRecipe):
|
|||||||
description = u'Fantastyczny Portal Kulturalny - newsy, recenzje, galerie, wywiady. Literatura, film, gry komputerowe i planszowe, komiks, RPG, sklep. Nie lekceważ potęgi wyobraźni!' # noqa
|
description = u'Fantastyczny Portal Kulturalny - newsy, recenzje, galerie, wywiady. Literatura, film, gry komputerowe i planszowe, komiks, RPG, sklep. Nie lekceważ potęgi wyobraźni!' # noqa
|
||||||
cover_url = 'http://www.film.gildia.pl/_n_/portal/redakcja/logo/logo-gildia.pl-500.jpg'
|
cover_url = 'http://www.film.gildia.pl/_n_/portal/redakcja/logo/logo-gildia.pl-500.jpg'
|
||||||
category = 'culture'
|
category = 'culture'
|
||||||
cover_url = 'http://gildia.pl/images/logo-main.png'
|
cover_url = 'http://portal.gildia.pl/images/logo-main.png'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
use_embedded_content = False
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '')]
|
preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '')]
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
remove_tags = [dict(name='div', attrs={'class': [
|
remove_tags = [dict(name='div', attrs={'class': [
|
||||||
'backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})]
|
'backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})]
|
||||||
keep_only_tags = [dict(name='div', attrs={'class': 'widetext'})]
|
keep_only_tags = [dict(name='div', attrs={'class': 'widetext'}), dict(name='article', attrs={'id': re.compile(r'post-\d+')})]
|
||||||
feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'),
|
feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'),
|
||||||
(u'Literatura', u'http://www.literatura.gildia.pl/rss'),
|
(u'Literatura', u'http://www.literatura.gildia.pl/rss'),
|
||||||
(u'Film', u'http://www.film.gildia.pl/rss'),
|
(u'Film', u'http://www.film.gildia.pl/rss'),
|
||||||
@ -40,10 +41,14 @@ class Gildia(BasicNewsRecipe):
|
|||||||
|
|
||||||
def skip_ad_pages(self, soup):
|
def skip_ad_pages(self, soup):
|
||||||
content = soup.find('div', attrs={'class': 'news'})
|
content = soup.find('div', attrs={'class': 'news'})
|
||||||
|
if content is None:
|
||||||
|
return
|
||||||
|
|
||||||
words = ('recenzj', 'zapowied', 'fragmen',
|
words = ('recenzj', 'zapowied', 'fragmen',
|
||||||
'relacj', 'wywiad', 'nominacj')
|
'relacj', 'wywiad', 'nominacj')
|
||||||
|
document_title = soup.title.renderContents().lower()
|
||||||
for word in words:
|
for word in words:
|
||||||
if word in soup.title.string.lower():
|
if word in document_title:
|
||||||
for link in content.findAll(name='a'):
|
for link in content.findAll(name='a'):
|
||||||
if word in link['href'] or (link.string and word in link.string):
|
if word in link['href'] or (link.string and word in link.string):
|
||||||
return self.index_to_soup(link['href'], raw=True)
|
return self.index_to_soup(link['href'], raw=True)
|
||||||
@ -52,13 +57,14 @@ class Gildia(BasicNewsRecipe):
|
|||||||
return self.index_to_soup(tag['href'], raw=True)
|
return self.index_to_soup(tag['href'], raw=True)
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
title = soup.title.renderContents().lower()
|
||||||
for a in soup('a'):
|
for a in soup('a'):
|
||||||
if a.has_key('href') and not a['href'].startswith('http'): # noqa
|
if a.has_key('href') and not a['href'].startswith('http'): # noqa
|
||||||
if '/gry/' in a['href']:
|
if '/gry/' in a['href']:
|
||||||
a['href'] = 'http://www.gry.gildia.pl' + a['href']
|
a['href'] = 'http://www.gry.gildia.pl' + a['href']
|
||||||
elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
|
elif u'książk' in title or u'komiks' in title:
|
||||||
a['href'] = 'http://www.literatura.gildia.pl' + a['href']
|
a['href'] = 'http://www.literatura.gildia.pl' + a['href']
|
||||||
elif u'komiks' in soup.title.string.lower():
|
elif u'komiks' in title:
|
||||||
a['href'] = 'http://www.literatura.gildia.pl' + a['href']
|
a['href'] = 'http://www.literatura.gildia.pl' + a['href']
|
||||||
else:
|
else:
|
||||||
a['href'] = 'http://www.gildia.pl' + a['href']
|
a['href'] = 'http://www.gildia.pl' + a['href']
|
||||||
|
27
recipes/parisreview.recipe
Normal file
27
recipes/parisreview.recipe
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class ParisReview(BasicNewsRecipe):
|
||||||
|
title = 'The Paris Review Blog'
|
||||||
|
__author__ = 'fenuks'
|
||||||
|
description = u'The Paris Review is a literary magazine featuring original writing, art, and in-depth interviews with famous writers.'
|
||||||
|
# cover_url = ''
|
||||||
|
category = 'culture'
|
||||||
|
language = 'en'
|
||||||
|
encoding = 'utf-8'
|
||||||
|
oldest_article = 8
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
no_stylesheets = True
|
||||||
|
remove_empty_feeds = True
|
||||||
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
|
remove_javascript = True
|
||||||
|
use_embedded_content = True
|
||||||
|
# extra_css = ''
|
||||||
|
# preprocess_regexps = []
|
||||||
|
# remove_attributes = ['style',]
|
||||||
|
# keep_only_tags = []
|
||||||
|
remove_tags = []
|
||||||
|
# remove_tags_before = dict()
|
||||||
|
remove_tags_after = dict()
|
||||||
|
feeds = [('Posts', 'http://feeds.feedburner.com/TheParisReviewBlog')]
|
27
recipes/publicdomainreview_org.recipe
Normal file
27
recipes/publicdomainreview_org.recipe
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class PublicDomainReview(BasicNewsRecipe):
|
||||||
|
title = 'The Public Domain Review'
|
||||||
|
__author__ = 'fenuks'
|
||||||
|
description = u'Online journal dedicated to showcasing the most interesting and unusual out-of-copyright works available on the web'
|
||||||
|
cover_url = 'http://publicdomainreview.org/wp-content/themes/pdr/assets/img/pdr-logo.gif'
|
||||||
|
category = 'culture'
|
||||||
|
language = 'en'
|
||||||
|
encoding = 'utf-8'
|
||||||
|
oldest_article = 14
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
no_stylesheets = True
|
||||||
|
remove_empty_feeds = True
|
||||||
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
|
remove_javascript = True
|
||||||
|
use_embedded_content = False
|
||||||
|
# extra_css = ''
|
||||||
|
# preprocess_regexps = []
|
||||||
|
# remove_attributes = ['style',]
|
||||||
|
keep_only_tags = [dict(name='article', attrs={'class': re.compile(r'post-\d+')})]
|
||||||
|
remove_tags = [dict(attrs={'class': 'synved-social-container synved-social-container-share'})]
|
||||||
|
# remove_tags_before = dict()
|
||||||
|
remove_tags_after = dict(name='div', attrs={'class': 'entry-content'})
|
||||||
|
feeds = [('Posts', 'http://publicdomainreview.org/feed/')]
|
Loading…
x
Reference in New Issue
Block a user