mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
updates from kalibrator project
This commit is contained in:
parent
5df48e1b4d
commit
3e84887382
@ -6,42 +6,20 @@ class Adventure_zone(BasicNewsRecipe):
|
|||||||
description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.'
|
description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.'
|
||||||
category = 'games'
|
category = 'games'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
|
BASEURL = 'http://www.adventure-zone.info/fusion/'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
extra_css = '.image {float: left; margin-right: 5px;}'
|
||||||
oldest_article = 20
|
oldest_article = 20
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png'
|
cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png'
|
||||||
index = 'http://www.adventure-zone.info/fusion/'
|
remove_attributes = ['style']
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: ''),
|
keep_only_tags = [dict(attrs={'class':'content'})]
|
||||||
(re.compile(r'</?table.*?>'), lambda match: ''),
|
remove_tags = [dict(attrs={'class':'footer'})]
|
||||||
(re.compile(r'</?tbody.*?>'), lambda match: '')]
|
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')]
|
||||||
remove_tags_before = dict(name='td', attrs={'class':'main-bg'})
|
|
||||||
remove_tags = [dict(name='img', attrs={'alt':'Drukuj'})]
|
|
||||||
remove_tags_after = dict(id='comments')
|
|
||||||
extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; } img.news-category {float: left; margin-right: 5px;}'
|
|
||||||
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
|
|
||||||
|
|
||||||
'''def get_cover_url(self):
|
|
||||||
soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
|
|
||||||
cover=soup.find(id='box_OstatninumerAZ')
|
|
||||||
self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src']
|
|
||||||
return getattr(self, 'cover_url', self.cover_url)'''
|
|
||||||
|
|
||||||
def populate_article_metadata(self, article, soup, first):
|
|
||||||
result = re.search('(.+) - Adventure Zone', soup.title.string)
|
|
||||||
if result:
|
|
||||||
result = result.group(1)
|
|
||||||
else:
|
|
||||||
result = soup.body.find('strong')
|
|
||||||
if result:
|
|
||||||
result = result.string
|
|
||||||
if result:
|
|
||||||
result = result.replace('&amp;', '&')
|
|
||||||
result = result.replace('&#39;', '’')
|
|
||||||
article.title = result
|
|
||||||
|
|
||||||
def skip_ad_pages(self, soup):
|
def skip_ad_pages(self, soup):
|
||||||
skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
|
skip_tag = soup.body.find(attrs={'class':'content'})
|
||||||
skip_tag = skip_tag.findAll(name='a')
|
skip_tag = skip_tag.findAll(name='a')
|
||||||
title = soup.title.string.lower()
|
title = soup.title.string.lower()
|
||||||
if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)):
|
if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)):
|
||||||
@ -49,20 +27,10 @@ class Adventure_zone(BasicNewsRecipe):
|
|||||||
if r.strong and r.strong.string:
|
if r.strong and r.strong.string:
|
||||||
word=r.strong.string.lower()
|
word=r.strong.string.lower()
|
||||||
if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
|
if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
|
||||||
return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
|
return self.index_to_soup(self.BASEURL+r['href'], raw=True)
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
footer=soup.find(attrs={'class':'news-footer middle-border'})
|
for link in soup.findAll('a', href=True):
|
||||||
r = soup.find(name='td', attrs={'class':'capmain'})
|
if not link['href'].startswith('http'):
|
||||||
if r:
|
link['href'] = self.BASEURL + link['href']
|
||||||
r.name='h1'
|
return soup
|
||||||
for item in soup.findAll(name=['tr', 'td']):
|
|
||||||
item.name='div'
|
|
||||||
if footer and len(footer('a'))>=2:
|
|
||||||
footer('a')[1].extract()
|
|
||||||
for item in soup.findAll(style=True):
|
|
||||||
del item['style']
|
|
||||||
for a in soup('a'):
|
|
||||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
|
||||||
a['href']=self.index + a['href']
|
|
||||||
return soup
|
|
@ -13,6 +13,7 @@ class Astroflesz(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
|
remove_empty_feeds = True
|
||||||
remove_attributes = ['style']
|
remove_attributes = ['style']
|
||||||
keep_only_tags = [dict(id="k2Container")]
|
keep_only_tags = [dict(id="k2Container")]
|
||||||
remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
|
remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
|
||||||
|
@ -6,12 +6,10 @@ __copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \
|
|||||||
2013, Tomasz Długosz, tomek3d@gmail.com'
|
2013, Tomasz Długosz, tomek3d@gmail.com'
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
|
||||||
from datetime import date
|
|
||||||
import re
|
import re
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
class GN(BasicNewsRecipe):
|
class GN(BasicNewsRecipe):
|
||||||
EDITION = 0
|
|
||||||
|
|
||||||
__author__ = 'Piotr Kontek, Tomasz Długosz'
|
__author__ = 'Piotr Kontek, Tomasz Długosz'
|
||||||
title = u'Gość Niedzielny'
|
title = u'Gość Niedzielny'
|
||||||
@ -20,83 +18,23 @@ class GN(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
temp_files = []
|
|
||||||
|
|
||||||
articles_are_obfuscated = True
|
def find_last_issue(self):
|
||||||
|
raw = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True)
|
||||||
|
doc = html.fromstring(raw)
|
||||||
|
page = doc.xpath('//div[@class="c"]//div[@class="search-result"]/div[1]/div[2]/h1//a/@href')
|
||||||
|
|
||||||
def get_obfuscated_article(self, url):
|
return page[1]
|
||||||
br = self.get_browser()
|
|
||||||
br.open(url)
|
|
||||||
source = br.response().read()
|
|
||||||
page = self.index_to_soup(source)
|
|
||||||
|
|
||||||
main_section = page.find('div',attrs={'class':'txt doc_prnt_prv'})
|
|
||||||
|
|
||||||
title = main_section.find('h2')
|
|
||||||
info = main_section.find('div', attrs={'class' : 'cf doc_info'})
|
|
||||||
authors = info.find(attrs={'class':'l'})
|
|
||||||
article = str(main_section.find('p', attrs={'class' : 'doc_lead'}))
|
|
||||||
first = True
|
|
||||||
for p in main_section.findAll('p', attrs={'class':None}, recursive=False):
|
|
||||||
if first and p.find('img') != None:
|
|
||||||
article += '<p>'
|
|
||||||
article += str(p.find('img')).replace('src="/files/','src="http://www.gosc.pl/files/')
|
|
||||||
article += '<font size="-2">'
|
|
||||||
for s in p.findAll('span'):
|
|
||||||
article += self.tag_to_string(s)
|
|
||||||
article += '</font></p>'
|
|
||||||
else:
|
|
||||||
article += str(p).replace('src="/files/','src="http://www.gosc.pl/files/')
|
|
||||||
first = False
|
|
||||||
limiter = main_section.find('p', attrs={'class' : 'limiter'})
|
|
||||||
if limiter:
|
|
||||||
article += str(limiter)
|
|
||||||
|
|
||||||
html = unicode(title)
|
|
||||||
#sometimes authors are not filled in:
|
|
||||||
if authors:
|
|
||||||
html += unicode(authors) + unicode(article)
|
|
||||||
else:
|
|
||||||
html += unicode(article)
|
|
||||||
|
|
||||||
self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
|
|
||||||
self.temp_files[-1].write(html)
|
|
||||||
self.temp_files[-1].close()
|
|
||||||
return self.temp_files[-1].name
|
|
||||||
|
|
||||||
def find_last_issue(self, year):
|
|
||||||
soup = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/rok/' + str(year))
|
|
||||||
|
|
||||||
#szukam zdjęcia i linka do poprzedniego pełnego numeru
|
|
||||||
first = True
|
|
||||||
for d in soup.findAll('div', attrs={'class':'l release_preview_l'}):
|
|
||||||
img = d.find('img')
|
|
||||||
if img != None:
|
|
||||||
a = img.parent
|
|
||||||
self.EDITION = a['href']
|
|
||||||
#this was preventing kindles from moving old issues to 'Back Issues' category:
|
|
||||||
#self.title = img['alt']
|
|
||||||
self.cover_url = 'http://www.gosc.pl' + img['src']
|
|
||||||
if year != date.today().year or not first:
|
|
||||||
break
|
|
||||||
first = False
|
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
year = date.today().year
|
soup = self.index_to_soup('http://gosc.pl' + self.find_last_issue())
|
||||||
self.find_last_issue(year)
|
|
||||||
##jeśli to pierwszy numer w roku trzeba pobrać poprzedni rok
|
|
||||||
if self.EDITION == 0:
|
|
||||||
self.find_last_issue(year-1)
|
|
||||||
soup = self.index_to_soup('http://www.gosc.pl' + self.EDITION)
|
|
||||||
feeds = []
|
feeds = []
|
||||||
#wstepniak
|
#wstepniak
|
||||||
a = soup.find('div',attrs={'class':'release-wp-b'}).find('a')
|
a = soup.find('div',attrs={'class':'release-wp-b'}).find('a')
|
||||||
articles = [
|
articles = [
|
||||||
{'title' : self.tag_to_string(a),
|
{'title' : self.tag_to_string(a),
|
||||||
'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/'),
|
'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/')
|
||||||
'date' : '',
|
}]
|
||||||
'description' : ''}
|
|
||||||
]
|
|
||||||
feeds.append((u'Wstępniak',articles))
|
feeds.append((u'Wstępniak',articles))
|
||||||
#kategorie
|
#kategorie
|
||||||
for addr in soup.findAll('a',attrs={'href':re.compile('kategoria')}):
|
for addr in soup.findAll('a',attrs={'href':re.compile('kategoria')}):
|
||||||
@ -113,16 +51,46 @@ class GN(BasicNewsRecipe):
|
|||||||
art = a.find('a')
|
art = a.find('a')
|
||||||
yield {
|
yield {
|
||||||
'title' : self.tag_to_string(art),
|
'title' : self.tag_to_string(art),
|
||||||
'url' : 'http://www.gosc.pl' + art['href'].replace('/doc/','/doc_pr/'),
|
'url' : 'http://www.gosc.pl' + art['href']
|
||||||
'date' : '',
|
|
||||||
'description' : ''
|
|
||||||
}
|
}
|
||||||
for a in main_block.findAll('div', attrs={'class':'sr-document'}):
|
for a in main_block.findAll('div', attrs={'class':'sr-document'}):
|
||||||
art = a.find('a')
|
art = a.find('a')
|
||||||
yield {
|
yield {
|
||||||
'title' : self.tag_to_string(art),
|
'title' : self.tag_to_string(art),
|
||||||
'url' : 'http://www.gosc.pl' + art['href'].replace('/doc/','/doc_pr/'),
|
'url' : 'http://www.gosc.pl' + art['href']
|
||||||
'date' : '',
|
|
||||||
'description' : ''
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def append_page(self, soup, appendtag):
|
||||||
|
chpage= appendtag.find(attrs={'class':'pgr_nrs'})
|
||||||
|
if chpage:
|
||||||
|
for page in chpage.findAll('a'):
|
||||||
|
soup2 = self.index_to_soup('http://gosc.pl' + page['href'])
|
||||||
|
pagetext = soup2.find(attrs={'class':'intextAd'})
|
||||||
|
pos = len(appendtag.contents)
|
||||||
|
appendtag.insert(pos, pagetext)
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
self.append_page(soup, soup.body)
|
||||||
|
'''
|
||||||
|
for image_div in soup.findAll(attrs={'class':'doc_image'}):
|
||||||
|
link =
|
||||||
|
if 'm.jpg' in image['src']:
|
||||||
|
image['src'] = image['src'].replace('m.jpg', '.jpg')
|
||||||
|
'''
|
||||||
|
return soup
|
||||||
|
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(name='div', attrs={'class':'cf txt'})
|
||||||
|
]
|
||||||
|
|
||||||
|
remove_tags = [
|
||||||
|
dict(name='p', attrs={'class':['r tr', 'l l-2', 'wykop']}),
|
||||||
|
dict(name='div', attrs={'class':['doc_actions', 'pgr', 'fr1_cl']}),
|
||||||
|
dict(name='div', attrs={'id':'vote'})
|
||||||
|
]
|
||||||
|
|
||||||
|
extra_css = '''
|
||||||
|
h1 {font-size:150%}
|
||||||
|
div#doc_image {font-style:italic; font-size:70%}
|
||||||
|
p.limiter {font-size:150%; font-weight: bold}
|
||||||
|
'''
|
||||||
|
@ -13,11 +13,12 @@ class Histmag(BasicNewsRecipe):
|
|||||||
__author__ = 'matek09'
|
__author__ = 'matek09'
|
||||||
description = u"Artykuly historyczne i publicystyczne"
|
description = u"Artykuly historyczne i publicystyczne"
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
|
extra_css = '''.center img {display: block;}'''
|
||||||
#preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),(re.compile(r'<span>'), lambda match: '<br><br><span>')]
|
#preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),(re.compile(r'<span>'), lambda match: '<br><br><span>')]
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
keep_only_tags=[dict(id='article')]
|
keep_only_tags=[dict(id='article')]
|
||||||
remove_tags=[dict(name = 'p', attrs = {'class' : 'article-tags'})]
|
remove_tags=[dict(name = 'p', attrs = {'class' : 'article-tags'}), dict(attrs={'class':'twitter-share-button'})]
|
||||||
|
|
||||||
feeds = [(u'Wszystkie', u'http://histmag.org/rss/wszystkie.xml'), (u'Wydarzenia', u'http://histmag.org/rss/wydarzenia.xml'), (u'Recenzje', u'http://histmag.org/rss/recenzje.xml'), (u'Artykuły historyczne', u'http://histmag.org/rss/historia.xml'), (u'Publicystyka', u'http://histmag.org/rss/publicystyka.xml')]
|
feeds = [(u'Wszystkie', u'http://histmag.org/rss/wszystkie.xml'), (u'Wydarzenia', u'http://histmag.org/rss/wydarzenia.xml'), (u'Recenzje', u'http://histmag.org/rss/recenzje.xml'), (u'Artykuły historyczne', u'http://histmag.org/rss/historia.xml'), (u'Publicystyka', u'http://histmag.org/rss/publicystyka.xml')]
|
||||||
|
BIN
recipes/icons/geopolityka.png
Normal file
BIN
recipes/icons/geopolityka.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.5 KiB |
BIN
recipes/icons/gs24_pl.png
Normal file
BIN
recipes/icons/gs24_pl.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 428 B |
BIN
recipes/icons/homopedia_pl.png
Normal file
BIN
recipes/icons/homopedia_pl.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 541 B |
BIN
recipes/icons/pc_lab.png
Normal file
BIN
recipes/icons/pc_lab.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 697 B |
BIN
recipes/icons/polityka.png
Normal file
BIN
recipes/icons/polityka.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 346 B |
BIN
recipes/icons/rynek_zdrowia.png
Normal file
BIN
recipes/icons/rynek_zdrowia.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 418 B |
@ -20,7 +20,7 @@ class OSNewsRecipe(BasicNewsRecipe):
|
|||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
use_embedded_content = False;
|
use_embedded_content = False;
|
||||||
|
remove_empty_feeds = True
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
cover_url='http://osnews.pl/wp-content/themes/osnews/img/logo.png'
|
cover_url='http://osnews.pl/wp-content/themes/osnews/img/logo.png'
|
||||||
@ -31,22 +31,18 @@ class OSNewsRecipe(BasicNewsRecipe):
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'OSNews.pl', u'http://feeds.feedburner.com/OSnewspl')
|
(u'Niusy', u'http://feeds.feedburner.com/OSnewspl'),
|
||||||
|
(u'Wylęgarnia', u'http://feeds.feedburner.com/osnewspl_nowe')
|
||||||
]
|
]
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name = 'a', attrs = {'class' : 'news-heading'}),
|
dict(name = 'div', attrs = {'id' : 'content'})
|
||||||
dict(name = 'div', attrs = {'class' : 'newsinformations'}),
|
|
||||||
dict(name = 'div', attrs = {'id' : 'news-content'})
|
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name = 'div', attrs = {'class' : 'sociable'}),
|
dict(name = 'div', attrs = {'class' : ['newstags', 'tw_button', 'post_prev']}),
|
||||||
dict(name = 'div', attrs = {'class' : 'post_prev'}),
|
dict(name = 'div', attrs = {'id' : 'newspage_upinfo'}),
|
||||||
dict(name = 'div', attrs = {'class' : 'post_next'}),
|
|
||||||
dict(name = 'div', attrs = {'class' : 'clr'}),
|
|
||||||
dict(name = 'div', attrs = {'class' : 'tw_button'}),
|
|
||||||
dict(name = 'div', attrs = {'style' : 'width:56px;height:60px;float:left;margin-right:10px'})
|
|
||||||
]
|
]
|
||||||
|
|
||||||
preprocess_regexps = [(re.compile(u'</span>Komentarze: \(?[0-9]+\)? ?<span'), lambda match: '</span><span')]
|
remove_tags_after = dict(name = 'div', attrs = {'class' : 'post_prev'})
|
||||||
|
preprocess_regexps = [(re.compile(u'</span>Komentarze: \(?[0-9]+\)? ?<span'), lambda match: '</span><span'), (re.compile(u'<iframe.+?</iframe>'), lambda match: '')]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user