Merge from trunk

This commit is contained in:
Charles Haley 2012-02-22 08:11:07 +01:00
commit 86a0bae6cb
62 changed files with 891 additions and 221 deletions

View File

@ -7,6 +7,7 @@ class Archeowiesci(BasicNewsRecipe):
language = 'pl'
cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
oldest_article = 7
needs_subscription='optional'
max_articles_per_feed = 100
auto_cleanup = True
remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})]
@ -16,6 +17,16 @@ class Archeowiesci(BasicNewsRecipe):
feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds:
for article in feed.articles[:]:
if 'subskrypcja' in article.title:
if self.username is None and 'subskrypcja' in article.title:
feed.articles.remove(article)
return feeds
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://archeowiesci.pl/wp-login.php')
br.select_form(name='loginform')
br['log'] = self.username
br['pwd'] = self.password
br.submit()
return br

View File

@ -1,15 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Astronomia_pl(BasicNewsRecipe):
title = u'Astronomia.pl'
__author__ = 'fenuks'
description = 'Astronomia - polish astronomy site'
masthead_url = 'http://www.astronomia.pl/grafika/logo.gif'
cover_url = 'http://www.astronomia.pl/grafika/logo.gif'
category = 'astronomy, science'
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
#no_stylesheets=True
extra_css='#h2 {font-size: 18px;}'
no_stylesheets=True
preprocess_regexps = [(re.compile(ur'<b>Przeczytaj także:.*?</BODY>', re.DOTALL), lambda match: '</BODY>') ]
remove_tags_before=dict(name='div', attrs={'id':'a1'})
keep_only_tags=[dict(name='div', attrs={'id':['a1', 'h2']})]
feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')]

View File

@ -4,16 +4,17 @@ class Benchmark_pl(BasicNewsRecipe):
title = u'Benchmark.pl'
__author__ = 'fenuks'
description = u'benchmark.pl -IT site'
masthead_url = 'http://www.benchmark.pl/i/logo-footer.png'
cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
category = 'IT'
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets=True
preprocess_regexps = [(re.compile(ur'\bWięcej o .*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
remove_tags_after=dict(name='div', attrs={'class':'body'})
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})]
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
INDEX= 'http://www.benchmark.pl'
feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
(u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]

View File

@ -10,10 +10,11 @@ class Biolog_pl(BasicNewsRecipe):
description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
category = 'biology'
language = 'pl'
masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png'
cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
no_stylesheets = True
#keeps_only_tags=[dict(id='main')]
remove_tags_before=dict(id='main')
remove_tags_after=dict(name='a', attrs={'name':'komentarze'})
remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})]
remove_tags=[dict(name='img', attrs={'alt':'Komentarze'}), dict(name='span', attrs={'class':'menu_odsylacze'})]
feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]

View File

@ -1,16 +1,20 @@
from calibre.web.feeds.news import BasicNewsRecipe
class CD_Action(BasicNewsRecipe):
title = u'CD-Action'
__author__ = 'fenuks'
description = 'cdaction.pl - polish magazine about games site'
description = 'cdaction.pl - polish games magazine site'
category = 'games'
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG'
keep_only_tags= dict(id='news_content')
remove_tags_after= dict(name='div', attrs={'class':'tresc'})
feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')]
def get_cover_url(self):
soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href']
return getattr(self, 'cover_url', self.cover_url)

View File

@ -5,6 +5,7 @@ class CGM(BasicNewsRecipe):
oldest_article = 7
__author__ = 'fenuks'
description = u'Codzienna Gazeta Muzyczna'
masthead_url='http://www.cgm.pl/img/header/logo.gif'
cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg'
category = 'music'
language = 'pl'
@ -23,21 +24,19 @@ class CGM(BasicNewsRecipe):
def preprocess_html(self, soup):
gallery=soup.find('div', attrs={'class':'galleryFlash'})
if gallery:
img=gallery.div
gallery.img.extract()
if img:
img=img['style']
img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
gallery.contents[1].name='img'
gallery.contents[1]['src']=img
for item in soup.findAll(style=True):
del item['style']
ad=soup.findAll('a')
for r in ad:
if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
r.extract()
gallery=soup.find('div', attrs={'class':'galleryFlash'})
if gallery:
img=gallery.find('embed')
if img:
img=img['src'][35:]
img='http://www.cgm.pl/_vault/_gallery/_photo/'+img
param=gallery.findAll(name='param')
for i in param:
i.extract()
gallery.contents[1].name='img'
gallery.contents[1]['src']=img
return soup

View File

@ -33,6 +33,32 @@ class ChristianScienceMonitor(BasicNewsRecipe):
remove_javascript = True
no_stylesheets = True
requires_version = (0, 8, 39)
def preprocess_raw_html(self, raw, url):
try:
from html5lib import parse
root = parse(raw, namespaceHTMLElements=False,
treebuilder='lxml').getroot()
from lxml import etree
for tag in root.xpath(
'//script|//style|//noscript|//meta|//link|//object'):
tag.getparent().remove(tag)
for elem in list(root.iterdescendants(tag=etree.Comment)):
elem.getparent().remove(elem)
ans = etree.tostring(root, encoding=unicode)
ans = re.sub('.*<html', '<html', ans, flags=re.DOTALL)
return ans
except:
import traceback
traceback.print_exc()
raise
def index_to_soup(self, url):
raw = BasicNewsRecipe.index_to_soup(self, url,
raw=True).decode('utf-8')
raw = self.preprocess_raw_html(raw, url)
return BasicNewsRecipe.index_to_soup(self, raw)
def append_page(self, soup, appendtag, position):
nav = soup.find('div',attrs={'class':'navigation'})
@ -78,14 +104,6 @@ class ChristianScienceMonitor(BasicNewsRecipe):
print_soup = soup
return print_soup
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'<!--.*?-->', lambda match : ''),
(r'<body.*?<div id="story"', lambda match : '<body><div id="story"'),
(r'<div class="pubdate">.*?</div>', lambda m: ''),
(r'Full HTML version of this story which may include photos, graphics, and related links.*</body>',
lambda match : '</body>'),
]]
extra_css = '''
h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large}
.sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;}

View File

@ -0,0 +1,48 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Ciekawostki_Historyczne(BasicNewsRecipe):
title = u'Ciekawostki Historyczne'
oldest_article = 7
__author__ = 'fenuks'
description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.'
category = 'history'
language = 'pl'
masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), lambda match: ''), (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')]
no_stylesheets=True
remove_empty_feeds=True
keep_only_tags=[dict(name='div', attrs={'class':'post'})]
remove_tags=[dict(id='singlepostinfo')]
feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')]
def append_page(self, soup, appendtag):
tag=soup.find(name='h7')
if tag:
if tag.br:
pass
elif tag.nextSibling.name=='p':
tag=tag.nextSibling
nexturl = tag.findAll('a')
for nextpage in nexturl:
tag.extract()
nextpage= nextpage['href']
soup2 = self.index_to_soup(nextpage)
pagetext = soup2.find(name='div', attrs={'class':'post'})
for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}):
r.extract()
for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}):
r.extract()
for r in pagetext.findAll('h1'):
r.extract()
pagetext.find('h6').nextSibling.extract()
pagetext.find('h7').nextSibling.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -7,10 +7,11 @@ class Computerworld_pl(BasicNewsRecipe):
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
category = 'IT'
language = 'pl'
masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif'
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'s'})]
keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})]
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]

View File

@ -7,6 +7,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
__licence__ ='GPL v3'
category = 'IT'
language = 'pl'
masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png'
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
description = u'Aktualności i blogi z dobreprogramy.pl'
encoding = 'utf-8'
@ -16,7 +17,8 @@ class Dobreprogramy_pl(BasicNewsRecipe):
oldest_article = 8
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})]
keep_only_tags=[dict(attrs={'class':['news', 'entry single']})]
remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})]
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]

View File

@ -8,15 +8,17 @@ class Dziennik_pl(BasicNewsRecipe):
description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
category = 'newspaper'
language = 'pl'
cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg'
masthead_url= 'http://5.s.dziennik.pl/images/logos.png'
cover_url= 'http://5.s.dziennik.pl/images/logos.png'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
remove_javascript=True
remove_empty_feeds=True
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')]
extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">&gt;&gt;&gt; CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
keep_only_tags=[dict(id='article')]
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})]
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
@ -30,6 +32,12 @@ class Dziennik_pl(BasicNewsRecipe):
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
def skip_ad_pages(self, soup):
tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'})
if tag:
new_soup=self.index_to_soup(tag['href'], raw=True)
return new_soup
def append_page(self, soup, appendtag):
tag=soup.find('a', attrs={'class':'page_next'})
if tag:
@ -56,3 +64,4 @@ class Dziennik_pl(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -10,7 +10,8 @@ class Filmweb_pl(BasicNewsRecipe):
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
extra_css = '.hdrBig {font-size:22px;}'
remove_empty_feeds=True
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})]
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),

View File

@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Gameplay_pl(BasicNewsRecipe):
title = u'Gameplay.pl'
oldest_article = 7
__author__ = 'fenuks'
description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
category = 'games, movies, books, music'
language = 'pl'
masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png'
cover_url= 'http://gameplay.pl/img/gpy_top_logo.png'
max_articles_per_feed = 100
no_stylesheets= True
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})]
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
def image_url_processor(self, baseurl, url):
if 'http' not in url:
return 'http://gameplay.pl'+ url[2:]
else:
return url

View File

@ -4,10 +4,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Gazeta_Wyborcza(BasicNewsRecipe):
title = u'Gazeta Wyborcza'
__author__ = 'fenuks'
cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
language = 'pl'
description ='news from gazeta.pl'
category='newspaper'
publication_type = 'newspaper'
masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
INDEX='http://wyborcza.pl'
remove_empty_feeds= True
oldest_article = 3
@ -81,3 +82,10 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
return url
else:
return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
def get_cover_url(self):
soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
cover=soup.find(id='GWmini2')
soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href'])
self.cover_url='http://wyborcza.pl' + soup.img['src']
return getattr(self, 'cover_url', self.cover_url)

View File

@ -8,29 +8,31 @@ class Gry_online_pl(BasicNewsRecipe):
language = 'pl'
oldest_article = 13
INDEX= 'http://www.gry-online.pl/'
cover_url='http://www.gry-online.pl/img/1st_10/1st-gol-logo.png'
masthead_url='http://www.gry-online.pl/im/gry-online-logo.png'
cover_url='http://www.gry-online.pl/im/gry-online-logo.png'
max_articles_per_feed = 100
no_stylesheets= True
extra_css = 'p.wn1{font-size:22px;}'
remove_tags_after= [dict(name='div', attrs={'class':['tresc-newsa']})]
keep_only_tags = [dict(name='div', attrs={'class':['txthead']}), dict(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}), dict(name='a', attrs={'class':['num_str_nex']})]
#remove_tags= [dict(name='div', attrs={'class':['news_plat']})]
keep_only_tags=[dict(name='div', attrs={'class':'gc660'})]
remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})]
feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]
def append_page(self, soup, appendtag):
nexturl = soup.find('a', attrs={'class':'num_str_nex'})
if appendtag.find('a', attrs={'class':'num_str_nex'}) is not None:
appendtag.find('a', attrs={'class':'num_str_nex'}).replaceWith('\n')
if nexturl is not None:
if 'strona' in nexturl.div.string:
nexturl= self.INDEX + nexturl['href']
soup2 = self.index_to_soup(nexturl)
pagetext = soup2.findAll(name='p', attrs={'class':['wtx1', 'wn1', 'wob']})
for tag in pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, tag)
self.append_page(soup2, appendtag)
tag = appendtag.find('div', attrs={'class':'n5p'})
if tag:
nexturls=tag.findAll('a')
for nexturl in nexturls[1:]:
try:
soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href'])
except:
soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href'])
pagetext = soup2.find(attrs={'class':'gc660'})
for r in pagetext.findAll(name='header'):
r.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}):
r.extract()
def preprocess_html(self, soup):

Binary file not shown.

After

Width:  |  Height:  |  Size: 994 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 991 B

BIN
recipes/icons/in4_pl.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 357 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 808 B

BIN
recipes/icons/kresy_pl.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.0 KiB

BIN
recipes/icons/oclab_pl.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 881 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 817 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 366 B

BIN
recipes/icons/pc_arena.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 KiB

BIN
recipes/icons/pc_foster.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 694 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 322 B

BIN
recipes/icons/pure_pc.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 386 B

BIN
recipes/icons/tanuki.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1017 B

BIN
recipes/icons/tvn24.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

44
recipes/in4_pl.recipe Normal file
View File

@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class in4(BasicNewsRecipe):
title = u'IN4.pl'
oldest_article = 7
max_articles_per_feed = 100
__author__ = 'fenuks'
description = u'Serwis Informacyjny - Aktualnosci, recenzje'
category = 'IT'
language = 'pl'
#cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
no_stylesheets = True
remove_empty_feeds = True
preprocess_regexps = [(re.compile(ur'<a title="translate into.*?</a>', re.DOTALL), lambda match: '') ]
keep_only_tags=[dict(name='div', attrs={'class':'left_alone'})]
remove_tags_after=dict(name='img', attrs={'title':'komentarze'})
remove_tags=[dict(name='img', attrs={'title':'komentarze'})]
feeds = [(u'Wiadomo\u015bci', u'http://www.in4.pl/rss.php'), (u'Recenzje', u'http://www.in4.pl/rss_recenzje.php'), (u'Mini recenzje', u'http://www.in4.pl/rss_mini.php')]
def append_page(self, soup, appendtag):
a=soup.findAll('a')
nexturl=None
for i in a:
if i.string and 'następna str' in i.string:
nexturl='http://www.in4.pl/' + i['href']
i.extract()
while nexturl:
soup2 = self.index_to_soup(nexturl)
pagetext = soup2.find(id='news')
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
nexturl=None
tag=soup2.findAll('a')
for z in tag:
if z.string and u'następna str' in z.string:
nexturl='http://www.in4.pl/' + z['href']
break
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Informacje_USA(BasicNewsRecipe):
title = u'Informacje USA'
oldest_article = 7
max_articles_per_feed = 100
__author__ = 'fenuks'
description = u'portal wiadomości amerykańskich'
category = 'news'
language = 'pl'
masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
no_stylesheets = True
preprocess_regexps = [(re.compile(ur'<p>Zobacz:.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><a href=".*?Zobacz także:.*?</a></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><p>Zobacz też:.*?</a></p>', re.DOTALL), lambda match: '')]
keep_only_tags=[dict(name='div', attrs={'class':'box box-single'})]
remove_tags_after= dict(attrs={'class':'tags'})
remove_tags= [dict(attrs={'class':['postmetadata', 'tags', 'banner']}), dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})]
feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')]

14
recipes/kresy_pl.recipe Normal file
View File

@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Kresy(BasicNewsRecipe):
title = u'Kresy'
__author__ = 'fenuks'
description = u'portal społeczności kresowej'
language = 'pl'
masthead_url= 'http://www.kresy.pl/public/img/logo.png'
cover_url= 'http://www.kresy.pl/public/img/logo.png'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
keep_only_tags= [dict(id='artykul')]
remove_tags= [dict(attrs={'class':['twitter-share-button', 'likefbborder', 'tagi']})]
feeds = [(u'Wszystkie', u'http://www.kresy.pl/rss')]

View File

@ -0,0 +1,17 @@
__version__ = 'v1.0'
__date__ = '13, February 2011'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1329125921(BasicNewsRecipe):
title = u'La pausa caff\xe8'
__author__ = 'faber1971'
description = 'An Italian satirical blog'
language = 'it'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
no_stylesheets = True
feeds = [(u'La pausa caff\xe8', u'http://feeds.feedburner.com/LapausaCaffe')]

View File

@ -1,4 +1,5 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
@ -7,10 +8,13 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe):
max_articles_per_feed = 100
auto_cleanup = True
remove_javascript = True
no_stylesheets = True
remove_tags = [
dict(name='ul', attrs={'id':'ads0'})
]
masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
__author__ = 'faber1971'
description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)'
description = 'Collection of Italian marketing websites - v1.03 (20, February 2012)'
language = 'it'
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]

View File

@ -1,16 +1,17 @@
__license__ = 'GPL v3'
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010, Louis Gesbert <meta at antislash dot info>'
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010, 2011, Louis Gesbert <meta at antislash dot info>'
'''
Mediapart
'''
from calibre.ebooks.BeautifulSoup import Tag
import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
class Mediapart(BasicNewsRecipe):
title = 'Mediapart'
__author__ = 'Mathieu Godlewski'
description = 'Global news in french from online newspapers'
__author__ = 'Mathieu Godlewski, Louis Gesbert'
description = 'Global news in french from news site Mediapart'
oldest_article = 7
language = 'fr'
needs_subscription = True
@ -18,52 +19,30 @@ class Mediapart(BasicNewsRecipe):
max_articles_per_feed = 50
no_stylesheets = True
cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg'
cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'
feeds = [
('Les articles', 'http://www.mediapart.fr/articles/feed'),
]
# -- print-version has poor quality on this website, better do the conversion ourselves
#
# preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
# [
# (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
# (r'<span class=\'auteur_staff\'>[^>]+<a title=\'[^\']*\'[^>]*>([^<]*)</a>[^<]*</span>',
# lambda match : '<i>'+match.group(1)+'</i>'),
# (r'\'', lambda match: '&rsquo;'),
# ]
# ]
#
# remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}),
# dict(name='div', attrs={'class':'print-links'}),
# dict(name='img', attrs={'src':'entete_article.png'}),
# dict(name='br') ]
#
# def print_version(self, url):
# raw = self.browser.open(url).read()
# soup = BeautifulSoup(raw.decode('utf8', 'replace'))
# div = soup.find('div', {'id':re.compile('node-\d+')})
# if div is None:
# return None
# article_id = string.replace(div['id'], 'node-', '')
# if article_id is None:
# return None
# return 'http://www.mediapart.fr/print/'+article_id
# -- print-version
# -- Non-print version [dict(name='div', attrs={'class':'advert'})]
keep_only_tags = [
dict(name='h1', attrs={'class':'title'}),
dict(name='div', attrs={'class':'page_papier_detail'}),
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
[
(r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
(r'\'', lambda match: '&rsquo;')
]
]
def preprocess_html(self,soup):
for title in soup.findAll('div', {'class':'titre'}):
tag = Tag(soup, 'h3')
title.replaceWith(tag)
tag.insert(0,title)
return soup
remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ]
def print_version(self, url):
raw = self.browser.open(url).read()
soup = BeautifulSoup(raw.decode('utf8', 'replace'))
link = soup.find('a', {'title':'Imprimer'})
if link is None:
return None
return link['href']
# -- Handle login
@ -76,4 +55,3 @@ class Mediapart(BasicNewsRecipe):
br['pass'] = self.password
br.submit()
return br

View File

@ -1,8 +1,9 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class naczytniki(BasicNewsRecipe):
title = u'naczytniki.pl'
__author__ = 'fenuks'
masthead_url= 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
language = 'pl'
description ='everything about e-readers'
@ -10,6 +11,7 @@ class naczytniki(BasicNewsRecipe):
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<p><br><b>Zobacz także:</b></p>.*?</body>', re.DOTALL), lambda match: '</body>') ]
remove_tags_after= dict(name='div', attrs={'class':'sociable'})
keep_only_tags=[dict(name='div', attrs={'class':'post'})]
remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})]

View File

@ -1,21 +1,33 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Nowa_Fantastyka(BasicNewsRecipe):
title = u'Nowa Fantastyka'
oldest_article = 7
__author__ = 'fenuks'
__modified_by__ = 'zaslav'
language = 'pl'
encoding='latin2'
description ='site for fantasy readers'
category='fantasy'
masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg'
#extra_css='.tytul {font-size: 20px;}' #not working
max_articles_per_feed = 100
INDEX='http://www.fantastyka.pl/'
no_stylesheets=True
needs_subscription = 'optional'
remove_tags_before=dict(attrs={'class':'belka1-tlo-md'})
remove_tags_before=dict(attrs={'class':'naglowek2'})
#remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'})
remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'})
remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})]
remove_tags_after=dict(name='form', attrs={'name':'form1'})
remove_tags=[dict(attrs={'class':['avatar2', 'belka-margin', 'naglowek2']}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'}), dict(name='form')]
preprocess_regexps = [
(re.compile(r'\<table .*?\>'), lambda match: ''),
(re.compile(r'\<td.*?\>'), lambda match: ''),
(re.compile(r'\<center\>'), lambda match: '')]
def find_articles(self, url):
articles = []
@ -41,10 +53,10 @@ class Nowa_Fantastyka(BasicNewsRecipe):
return feeds
def get_cover_url(self):
soup = self.index_to_soup('http://www.fantastyka.pl/1.html')
cover=soup.find(name='img', attrs={'class':'okladka'})
self.cover_url=self.INDEX+ cover['src']
soup = self.index_to_soup('http://www.e-kiosk.pl/nowa_fantastyka')
self.cover_url='http://www.e-kiosk.pl' + soup.find(name='a', attrs={'class':'img'})['href']
return getattr(self, 'cover_url', self.cover_url)
def get_browser(self):
@ -56,3 +68,18 @@ class Nowa_Fantastyka(BasicNewsRecipe):
br['pass'] = self.password
br.submit()
return br
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(font=True):
del item['font']
for item in soup.findAll(align=True):
del item['align']
for item in soup.findAll(name='tr'):
item.name='div'
title=soup.find(attrs={'class':'tytul'})
if title:
title['style']='font-size: 20px; font-weight: bold;'
self.log.warn(soup)
return soup

31
recipes/oclab_pl.recipe Normal file
View File

@ -0,0 +1,31 @@
from calibre.web.feeds.news import BasicNewsRecipe
class OCLab(BasicNewsRecipe):
    # Calibre recipe for OCLab.pl, a Polish overclocking / PC-hardware portal.
    title = u'OCLab.pl'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Portal OCLab.pl jest miejscem przyjaznym pasjonatom sprzętu komputerowego, w szczególności overclockerom, które będzie służyć im za aktualną bazę wiedzy o podkręcaniu komputera, źródło aktualnych informacji z rynku oraz opinii na temat sprzętu komputerowego.'
    category = 'IT'
    language = 'pl'
    cover_url= 'http://www.idealforum.ru/attachment.php?attachmentid=7963&d=1316008118'
    no_stylesheets = True
    keep_only_tags=[dict(id='main')]
    remove_tags_after= dict(attrs={'class':'single-postmetadata'})
    remove_tags=[dict(attrs={'class':['single-postmetadata', 'pagebar']})]
    feeds = [(u'Wpisy', u'http://oclab.pl/feed/')]

    def append_page(self, soup, appendtag):
        """Fetch the remaining pages of a multi-page article (listed in the
        page-jump drop-down) and append their content to *appendtag*."""
        tag=soup.find(attrs={'class':'contentjumpddl'})
        if tag:
            nexturl=tag.findAll('option')
            # Skip the first option (the page already loaded) and the last
            # (presumably a "next"/placeholder entry — TODO confirm on site).
            for nextpage in nexturl[1:-1]:
                soup2 = self.index_to_soup(nextpage['value'])
                pagetext = soup2.find(attrs={'class':'single-entry'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
        # Strip the per-page navigation bars from the stitched document.
        for r in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}):
            r.extract()

    def preprocess_html(self, soup):
        # Stitch all article pages together before normal processing.
        self.append_page(soup, soup.body)
        return soup

View File

@ -0,0 +1,37 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Overclock_pl(BasicNewsRecipe):
    # Calibre recipe for Overclock.pl (Polish hardware / extreme-cooling vortal).
    title = u'Overclock.pl'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Vortal poświęcony tematyce hardware, kładący największy nacisk na podkręcanie / overclocking (włącznie z extreme) i chłodzenie / cooling (air cooling, water cooling, freon cooling, dry ice, liquid nitrogen).'
    category = 'IT'
    language = 'pl'
    masthead_url='http://www.overclock.pl/gfx/logo_m.png'
    cover_url='http://www.overclock.pl/gfx/logo_m.png'
    no_stylesheets = True
    remove_empty_feeds = True
    # Drop the trailing "comments" link block and the navigation header.
    preprocess_regexps = [(re.compile(ur'<b>Komentarze do aktualności:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'<h3>Nawigacja</h3>', re.DOTALL), lambda match: '') ]
    keep_only_tags=[dict(name='div', attrs={'class':'news'}), dict(id='articleContent')]
    remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})]
    feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')]

    def append_page(self, soup, appendtag):
        """Fetch the remaining pages of a multi-page article (from the
        page-navigation drop-down) and append them to *appendtag*."""
        tag=soup.find(id='navigation')
        if tag:
            nexturl=tag.findAll('option')
            tag.extract()
            # Skip the first two options — presumably a label and the page
            # already loaded (TODO confirm against the live markup).
            for nextpage in nexturl[2:]:
                soup2 = self.index_to_soup(nextpage['value'])
                pagetext = soup2.find(id='content')
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
            # Remove the pager row containing the "Pierwsza" (first) link.
            rem=appendtag.find(attrs={'alt':'Pierwsza'})
            if rem:
                rem.parent.extract()

    def preprocess_html(self, soup):
        # Stitch all article pages together before normal processing.
        self.append_page(soup, soup.body)
        return soup

14
recipes/palmtop_pl.recipe Normal file
View File

@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe
class palmtop_pl(BasicNewsRecipe):
    """Calibre recipe for Palmtop.pl, a Polish mobile-technology portal.

    Pure configuration: a single Atom feed, shared logo for cover and
    masthead, stylesheets stripped.
    """
    title = u'Palmtop.pl'
    description = 'wortal technologii mobilnych'
    __author__ = 'fenuks'
    category = 'mobile'
    language = 'pl'

    # The site logo doubles as both the cover and the masthead image.
    masthead_url = 'http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png'
    cover_url = 'http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png'

    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')]

31
recipes/pc_arena.recipe Normal file
View File

@ -0,0 +1,31 @@
from calibre.web.feeds.news import BasicNewsRecipe
class PC_Arena(BasicNewsRecipe):
    # Calibre recipe for PCArena.pl (Polish IT news / reviews site).
    title = u'PCArena'
    # Very large window — effectively "fetch everything the feed offers".
    oldest_article = 18300
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
    category = 'IT'
    language = 'pl'
    masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif'
    cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif'
    no_stylesheets = True
    keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
    remove_tags=[dict(attrs={'class':'pages'})]
    feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')]

    def append_page(self, soup, appendtag):
        """Follow the numbered-page links and append each further page's
        article body to *appendtag*."""
        tag=soup.find(name='div', attrs={'class':'pagNum'})
        if tag:
            nexturl=tag.findAll('a')
            tag.extract()
            # Skip the first link (the page already loaded); hrefs are
            # site-relative, so prepend the host.
            for nextpage in nexturl[1:]:
                nextpage= 'http://pcarena.pl' + nextpage['href']
                soup2 = self.index_to_soup(nextpage)
                pagetext = soup2.find(attrs={'class':'artBody'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

    def preprocess_html(self, soup):
        # Stitch all article pages together before normal processing.
        self.append_page(soup, soup.body)
        return soup

View File

@ -0,0 +1,41 @@
from calibre.web.feeds.news import BasicNewsRecipe
class PC_Centre(BasicNewsRecipe):
    # Calibre recipe for PC Centre (pccentre.pl), a Polish computing portal.
    title = u'PC Centre'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. a także opisy produktów związanych z komputerami.'
    category = 'IT'
    language = 'pl'
    masthead_url= 'http://pccentre.pl/views/images/logo.gif'
    cover_url= 'http://pccentre.pl/views/images/logo.gif'
    no_stylesheets = True
    keep_only_tags= [dict(id='content')]
    remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
    feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]

    def append_page(self, soup, appendtag):
        """Follow the pager links, clean each subsequent page of chrome
        (metadata, pager, comments, repeated headline), then append its
        content to *appendtag*."""
        tag=soup.find(name='div', attrs={'class':'pages'})
        if tag:
            nexturl=tag.findAll('a')
            tag.extract()
            # Skip the last link (presumably a "next" arrow duplicating a
            # numbered page — TODO confirm); hrefs are site-relative.
            for nextpage in nexturl[:-1]:
                nextpage= 'http://pccentre.pl' + nextpage['href']
                soup2 = self.index_to_soup(nextpage)
                pagetext = soup2.find(id='content')
                # Strip per-page boilerplate before appending.
                rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']})
                for r in rem:
                    r.extract()
                rem=pagetext.findAll(id='comments')
                for r in rem:
                    r.extract()
                # Drop the repeated article headline on continuation pages.
                rem=pagetext.findAll('h1')
                for r in rem:
                    r.extract()
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

    def preprocess_html(self, soup):
        # Stitch all article pages together before normal processing.
        self.append_page(soup, soup.body)
        return soup

35
recipes/pc_foster.recipe Normal file
View File

@ -0,0 +1,35 @@
from calibre.web.feeds.news import BasicNewsRecipe
class PC_Foster(BasicNewsRecipe):
    # Calibre recipe for PC Foster (pcfoster.pl), a Polish hardware vortal.
    title = u'PC Foster'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.'
    category = 'IT'
    language = 'pl'
    masthead_url='http://pcfoster.pl/public/images/logo.png'
    cover_url= 'http://pcfoster.pl/public/images/logo.png'
    no_stylesheets= True
    remove_empty_feeds= True
    keep_only_tags= [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})]
    remove_tags=[dict(name='p', attrs={'class':'right'})]
    feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')]

    def append_page(self, soup, appendtag):
        """Follow the "next page" link (identified by its image alt text)
        until exhausted, appending each page's content to *appendtag*."""
        nexturl= appendtag.find(attrs={'alt':u'Następna strona'})
        if nexturl:
            # Remove the pager from the first page; continuation pages are
            # fetched below so it is no longer needed.
            appendtag.find(attrs={'class':'pager more_top'}).extract()
            while nexturl:
                # The alt-tagged element is an <img>; the href lives on its
                # parent <a>, and is site-relative.
                nexturl='http://pcfoster.pl' + nexturl.parent['href']
                soup2 = self.index_to_soup(nexturl)
                nexturl=soup2.find(attrs={'alt':u'Następna strona'})
                pagetext = soup2.find(attrs={'class':'content'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
        # Strip duplicated review-content blocks from the stitched document.
        for r in appendtag.findAll(attrs={'class':'review_content double'}):
            r.extract()

    def preprocess_html(self, soup):
        # Stitch all article pages together before normal processing.
        self.append_page(soup, soup.body)
        return soup

View File

@ -0,0 +1,81 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Polska_times(BasicNewsRecipe):
title = u'Polska Times'
__author__ = 'fenuks'
description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.'
category = 'newspaper'
language = 'pl'
masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17'
oldest_article = 7
max_articles_per_feed = 100
remove_emty_feeds= True
no_stylesheets = True
preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
keep_only_tags= [dict(id=['tytul-artykulu', 'kontent'])]
remove_tags_after= dict(id='material-tagi')
remove_tags=[dict(attrs={'id':'reklama_srodtekst_0'}), dict(attrs={'id':'material-tagi'}), dict(name='div', attrs={'class':'zakladki'}), dict(attrs={'title':u'CZYTAJ TAKŻE'}), dict(attrs={'id':'podobne'}), dict(name='a', attrs={'href':'http://www.dzienniklodzki.pl/newsletter'})]
feeds = [(u'Fakty', u'http://polskatimes.feedsportal.com/c/32980/f/533648/index.rss'), (u'Opinie', u'http://www.polskatimes.pl/rss/opinie.xml'), (u'Sport', u'http://polskatimes.feedsportal.com/c/32980/f/533649/index.rss'), (u'Pieni\u0105dze', u'http://polskatimes.feedsportal.com/c/32980/f/533657/index.rss'), (u'Twoje finanse', u'http://www.polskatimes.pl/rss/twojefinanse.xml'), (u'Kultura', u'http://polskatimes.feedsportal.com/c/32980/f/533650/index.rss'), (u'Dodatki', u'http://www.polskatimes.pl/rss/dodatki.xml')]
def skip_ad_pages(self, soup):
if 'Advertisement' in soup.title:
nexturl=soup.find('a')['href']
return self.index_to_soup(nexturl, raw=True)
def append_page(self, soup, appendtag):
nexturl=soup.find(id='nastepna_strona')
while nexturl:
soup2= self.index_to_soup(nexturl['href'])
nexturl=soup2.find(id='nastepna_strona')
pagetext = soup2.find(id='tresc')
for dictionary in self.remove_tags:
v=pagetext.findAll(attrs=dictionary['attrs'])
for delete in v:
delete.extract()
for b in pagetext.findAll(name='b'):
if b.string:
if u'CZYTAJ TEŻ' in b.string or u'Czytaj także' in b.string or u'Czytaj też' in b.string or u'Zobacz także' in b.string:
b.extract()
for center in pagetext.findAll(name='center'):
if center.h4:
if center.h4.a:
center.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}):
paginator.extract()
def image_article(self, soup, appendtag):
nexturl=soup.find('a', attrs={'class':'nastepna'})
urls=[]
while nexturl:
if nexturl not in urls:
urls.append(nexturl)
else:
break
soup2= self.index_to_soup('http://www.polskatimes.pl/artykul/' + nexturl['href'])
nexturl=soup2.find('a', attrs={'class':'nastepna'})
if nexturl in urls:
break;
pagetext = soup2.find(id='galeria-material')
pos = len(appendtag.contents)
appendtag.insert(pos, '<br />')
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
for rem in appendtag.findAll(attrs={'class':['galeriaNawigator', 'miniaturyPojemnik']}):
rem.extract()
for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}):
paginator.extract()
def preprocess_html(self, soup):
if soup.find('a', attrs={'class':'nastepna'}):
self.image_article(soup, soup.body)
elif soup.find(id='nastepna_strona'):
self.append_page(soup, soup.body)
return soup
def get_cover_url(self):
soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/')
self.cover_url=soup.find(id='pojemnik').img['src']
return getattr(self, 'cover_url', self.cover_url)

33
recipes/pure_pc.recipe Normal file
View File

@ -0,0 +1,33 @@
from calibre.web.feeds.news import BasicNewsRecipe
class PurePC(BasicNewsRecipe):
    # Calibre recipe for PurePC.pl (Polish hardware / modding site).
    title = u'PurePC'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Artykuły, aktualności, sprzęt, forum, chłodzenie, modding, urządzenia mobilne - wszystko w jednym miejscu.'
    category = 'IT'
    language = 'pl'
    masthead_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg'
    cover_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg'
    no_stylesheets = True
    keep_only_tags= [dict(id='content')]
    remove_tags_after= dict(attrs={'class':'fivestar-widget'})
    remove_tags= [dict(id='navigator'), dict(attrs={'class':['box-tools', 'fivestar-widget', 'PageMenuList']})]
    feeds = [(u'Wiadomo\u015bci', u'http://www.purepc.pl/node/feed')]

    def append_page(self, soup, appendtag):
        """Follow the Drupal-style 'pager-next' link chain and append each
        further page's article body to *appendtag*."""
        nexturl= appendtag.find(attrs={'class':'pager-next'})
        if nexturl:
            while nexturl:
                # The pager element wraps an <a> with a site-relative href.
                soup2 = self.index_to_soup('http://www.purepc.pl'+ nexturl.a['href'])
                nexturl=soup2.find(attrs={'class':'pager-next'})
                pagetext = soup2.find(attrs={'class':'article'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
        # Strip pagers and rating widgets from the stitched document.
        for r in appendtag.findAll(attrs={'class':['PageMenuList', 'pager', 'fivestar-widget']}):
            r.extract()

    def preprocess_html(self, soup):
        # Stitch all article pages together before normal processing.
        self.append_page(soup, soup.body)
        return soup

View File

@ -1,14 +1,16 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Tablety_pl(BasicNewsRecipe):
title = u'Tablety.pl'
__author__ = 'fenuks'
description = u'tablety.pl - latest tablet news'
masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
category = 'IT'
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'})
remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})]

37
recipes/tanuki.recipe Normal file
View File

@ -0,0 +1,37 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class tanuki(BasicNewsRecipe):
    # Calibre recipe for Tanuki.pl (Polish anime / manga portal network).
    title = u'Tanuki'
    oldest_article = 7
    __author__ = 'fenuks'
    category = 'anime, manga'
    language = 'pl'
    max_articles_per_feed = 100
    encoding='utf-8'
    extra_css= 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}'
    # Drop screenshot headers and the "see how others rated" link blocks.
    preprocess_regexps = [(re.compile(ur'<h3><a class="screen".*?</h3>', re.DOTALL), lambda match: ''), (re.compile(ur'<div><a href="/strony/((manga)|(anime))/[0-9]+?/oceny(\-redakcji){0,1}">Zobacz jak ocenili</a></div>', re.DOTALL), lambda match: '')]
    remove_empty_feeds= True
    no_stylesheets = True
    keep_only_tags=[dict(attrs={'class':['animename', 'storyname', 'nextarrow','sideinfov', 'sidelinfov', 'sideinfo', 'sidelinfo']}), dict(name='table', attrs={'summary':'Technikalia'}), dict(attrs={'class':['chaptername','copycat']}), dict(id='rightcolumn'), dict(attrs={'class':['headn_tt', 'subtable']})]
    remove_tags=[dict(name='div', attrs={'class':'screen'}), dict(id='randomtoplist'), dict(attrs={'class':'note'})]
    feeds = [(u'Anime', u'http://anime.tanuki.pl/rss_anime.xml'), (u'Manga', u'http://manga.tanuki.pl/rss_manga.xml'), (u'Tomiki', u'http://manga.tanuki.pl/rss_mangabooks.xml'), (u'Artyku\u0142y', u'http://czytelnia.tanuki.pl/rss_czytelnia_artykuly.xml'), (u'Opowiadania', u'http://czytelnia.tanuki.pl/rss_czytelnia.xml')]

    def append_page(self, soup, appendtag):
        """Follow the 'nextarrow' link chain of multi-page stories and
        append each further page's text to *appendtag*."""
        nexturl= appendtag.find(attrs={'class':'nextarrow'})
        if nexturl:
            while nexturl:
                soup2 = self.index_to_soup('http://czytelnia.tanuki.pl'+ nexturl['href'])
                nexturl=soup2.find(attrs={'class':'nextarrow'})
                # First grab whichever of chaptername/copycat appears first...
                pagetext = soup2.find(attrs={'class':['chaptername', 'copycat']})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
                # ...then the copycat block separately. NOTE(review): if the
                # first find already matched 'copycat' this looks like it
                # would insert the same node twice — confirm against the
                # site's markup (presumably chaptername always exists here).
                pagetext = soup2.find(attrs={'class':'copycat'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
            for r in appendtag.findAll(attrs={'class':'nextarrow'}):
                r.extract()

    def preprocess_html(self, soup):
        # Stitch all story pages together before normal processing.
        self.append_page(soup, soup.body)
        return soup

View File

@ -1,49 +1,57 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1268409464(BasicNewsRecipe):
title = u'The Sun'
__author__ = 'Chaz Ralph'
description = 'News from The Sun'
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'The Sun UK'
cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
description = 'A Recipe for The Sun tabloid UK - uses feed43'
__author__ = 'Dave Asbury'
# last updated 20/2/12
language = 'en_GB'
oldest_article = 1
max_articles_per_feed = 100
language = 'en'
max_articles_per_feed = 15
remove_empty_feeds = True
no_stylesheets = True
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
encoding= 'iso-8859-1'
remove_javascript = True
masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
encoding = 'cp1251'
encoding = 'cp1252'
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''
preprocess_regexps = [
(re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL), lambda match: '')]
keep_only_tags = [
dict(id='column-print')
dict(name='h1'),dict(name='h2',attrs={'class' : 'medium centered'}),
dict(name='div',attrs={'class' : 'text-center'}),
dict(name='div',attrs={'id' : 'bodyText'})
# dict(name='p')
]
remove_tags=[
#dict(name='head'),
dict(attrs={'class' : ['mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
dict(name='div',attrs={'class' : 'cf'}),
dict(attrs={'title' : 'download flash'}),
dict(attrs={'style' : 'padding: 5px'})
]
feeds = [
(u'News','http://feed43.com/2517447382644748.xml'),
(u'Sport', u'http://feed43.com/4283846255668687.xml'),
(u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
(u'Film',u'http://feed43.com/1307545221226200.xml'),
(u'Music',u'http://feed43.com/1701513435064132.xml'),
(u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
]
remove_tags = [
dict(name='div', attrs={'class':[
'clear text-center small padding-left-right-5 text-999 padding-top-5 padding-bottom-10 grey-solid-line',
'clear width-625 bg-fff padding-top-10'
]}),
dict(name='video'),
]
def preprocess_html(self, soup):
h1 = soup.find('h1')
if h1 is not None:
text = self.tag_to_string(h1)
nh = Tag(soup, 'h1')
nh.insert(0, text)
h1.replaceWith(nh)
return soup
feeds = [(u'News', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article312900.ece')
,(u'Sport', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247732.ece')
,(u'Football', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247739.ece')
,(u'Gizmo', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247829.ece')
,(u'Bizarre', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247767.ece')]
def print_version(self, url):
return re.sub(r'\?OTC-RSS&ATTR=[-a-zA-Z]+', '?print=yes', url)

24
recipes/tvn24.recipe Normal file
View File

@ -0,0 +1,24 @@
from calibre.web.feeds.news import BasicNewsRecipe
class tvn24(BasicNewsRecipe):
    # Calibre recipe for TVN24 (tvn24.pl), a Polish 24h news channel site.
    title = u'TVN24'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata'
    category = 'news'
    language = 'pl'
    masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
    cover_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
    extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    keep_only_tags=[dict(id='tvn24_wiadomosci_detal'), dict(name='h1', attrs={'class':'standardHeader1'}), dict(attrs={'class':['date60m rd5', 'imageBackground fl rd7', 'contentFromCMS']})]
    remove_tags_after= dict(name='div', attrs={'class':'socialBoxesBottom'})
    remove_tags=[dict(attrs={'class':['tagi_detal', 'socialBoxesBottom', 'twitterBox', 'commentsInfo', 'textSize', 'obj_ukrytydruk obj_ramka1_r', 'related newsNews align-right', 'box', 'newsUserList', 'watchMaterial text']})]
    # FIX: the 'Polska' feed URL was missing its 'http://' scheme (all the
    # sibling feeds carry it), which breaks fetching that feed.
    feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), (u'Polska', u'http://www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]

    def preprocess_html(self, soup):
        """Drop all inline style attributes so the extra_css rules win."""
        for item in soup.findAll(style=True):
            del item['style']
        return soup

View File

@ -4,10 +4,12 @@ class Ubuntu_pl(BasicNewsRecipe):
title = u'UBUNTU.pl'
__author__ = 'fenuks'
description = 'UBUNTU.pl - polish ubuntu community site'
masthead_url= 'http://ubuntu.pl/img/logo.jpg'
cover_url = 'http://ubuntu.pl/img/logo.jpg'
category = 'linux, IT'
language = 'pl'
no_stylesheets = True
remove_empty_feeds = True
oldest_article = 8
max_articles_per_feed = 100
extra_css = '#main {text-align:left;}'

View File

@ -0,0 +1,39 @@
from calibre.web.feeds.news import BasicNewsRecipe
class webhosting_pl(BasicNewsRecipe):
    # Calibre recipe for Webhosting.pl (Polish hosting / internet-services
    # portal); relies entirely on the site's print view for clean content.
    title = u'Webhosting.pl'
    __author__ = 'fenuks'
    description = 'Webhosting.pl to pierwszy na polskim rynku serwis poruszający w szerokim aspekcie tematy związane z hostingiem, globalną Siecią i usługami internetowymi. Głównym celem przedsięwzięcia jest dostarczanie przydatnej i bogatej merytorycznie wiedzy osobom, które chcą tworzyć i efektywnie wykorzystywać współczesny Internet.'
    category = 'web'
    language = 'pl'
    cover_url='http://webhosting.pl/images/logo.png'
    masthead_url='http://webhosting.pl/images/logo.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    # Tag-based cleanup kept for reference; unnecessary with the print view.
    #keep_only_tags= [dict(name='div', attrs={'class':'content_article'}), dict(attrs={'class':'paging'})]
    #remove_tags=[dict(attrs={'class':['tags', 'wykop', 'facebook_button_count', 'article_bottom']})]
    feeds = [(u'Newsy', u'http://webhosting.pl/feed/rss/an'),
        (u'Artyku\u0142y', u'http://webhosting.pl/feed/rss/aa'),
        (u'Software', u'http://webhosting.pl/feed/rss/n/12'),
        (u'Internet', u'http://webhosting.pl/feed/rss/n/9'),
        (u'Biznes', u'http://webhosting.pl/feed/rss/n/13'),
        (u'Bezpiecze\u0144stwo', u'http://webhosting.pl/feed/rss/n/10'),
        (u'Blogi', u'http://webhosting.pl/feed/rss/ab'),
        (u'Programowanie', u'http://webhosting.pl/feed/rss/n/8'),
        (u'Kursy', u'http://webhosting.pl/feed/rss/n/11'),
        (u'Tips&Tricks', u'http://webhosting.pl/feed/rss/n/15'),
        (u'Imprezy', u'http://webhosting.pl/feed/rss/n/22'),
        (u'Wywiady', u'http://webhosting.pl/feed/rss/n/24'),
        (u'Porady', u'http://webhosting.pl/feed/rss/n/3027'),
        (u'Znalezione w sieci', u'http://webhosting.pl/feed/rss/n/6804'),
        (u'Dev area', u'http://webhosting.pl/feed/rss/n/24504'),
        (u"Webmaster's blog", u'http://webhosting.pl/feed/rss/n/29195'),
        (u'Domeny', u'http://webhosting.pl/feed/rss/n/11513'),
        (u'Praktyka', u'http://webhosting.pl/feed/rss/n/2'),
        (u'Serwery', u'http://webhosting.pl/feed/rss/n/11514'),
        (u'Inne', u'http://webhosting.pl/feed/rss/n/24811'),
        (u'Marketing', u'http://webhosting.pl/feed/rss/n/11535')]

    def print_version(self, url):
        # Rewrite the host part to the site's print subtree, which serves
        # the article without navigation chrome.
        return url.replace('webhosting.pl', 'webhosting.pl/print')

View File

@ -190,3 +190,16 @@ def get_windows_username():
return buf.value
return get_unicode_windows_env_var(u'USERNAME')
def get_windows_temp_path():
    """Return the Windows temporary directory as a unicode path, without a
    trailing backslash, or None if it cannot be determined.

    Windows-only: uses the Win32 GetTempPathW API via ctypes. The first
    call with (0, None) asks for the required buffer length; the second
    fills the buffer.
    """
    import ctypes
    n = ctypes.windll.kernel32.GetTempPathW(0, None)
    if n == 0:
        # API failure — no temp path available.
        return None
    buf = ctypes.create_unicode_buffer(u'\0'*n)
    ctypes.windll.kernel32.GetTempPathW(n, buf)
    ans = buf.value
    # GetTempPathW returns the path with a trailing backslash; strip it.
    if ans[-1] == u'\\':
        ans = ans[:-1]
    return ans if ans else None

View File

@ -192,9 +192,13 @@ class InputFormatPlugin(Plugin):
def __call__(self, stream, options, file_ext, log,
accelerators, output_dir):
log('InputFormatPlugin: %s running'%self.name)
if hasattr(stream, 'name'):
log('on', stream.name)
try:
log('InputFormatPlugin: %s running'%self.name)
if hasattr(stream, 'name'):
log('on', stream.name)
except:
# In case stdout is broken
pass
with CurrentDir(output_dir):
for x in os.listdir('.'):

View File

@ -184,14 +184,14 @@ class ANDROID(USBMS):
'ALPANDIGITAL', 'ANDROID_MID', 'VTAB1008', 'EMX51_BBG_ANDROI',
'UMS', '.K080', 'P990', 'LTE', 'MB853', 'GT-S5660_CARD', 'A107',
'GT-I9003_CARD', 'XT912', 'FILE-CD_GADGET', 'RK29_SDK', 'MB855',
'XT910', 'BOOK_A10', 'USB_2.0_DRIVER']
'XT910', 'BOOK_A10', 'USB_2.0_DRIVER', 'I9100T']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
'__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL',
'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD', 'MB853',
'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910', 'BOOK_A10_CARD',
'USB_2.0_DRIVER']
'USB_2.0_DRIVER', 'I9100T']
OSX_MAIN_MEM = 'Android Device Main Memory'

View File

@ -153,7 +153,9 @@ def get_metadata(stream):
mi = MetaInformation(None, [])
if data.has_key('title'):
mi.title = data['title']
if data.has_key('creator'):
if data.get('initial-creator', '').strip():
mi.authors = string_to_authors(data['initial-creator'])
elif data.has_key('creator'):
mi.authors = string_to_authors(data['creator'])
if data.has_key('description'):
mi.comments = data['description']

View File

@ -6,7 +6,6 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
import urllib
from contextlib import closing
@ -36,9 +35,9 @@ class OReillyStore(BasicStoreConfig, StorePlugin):
def search(self, query, max_results=10, timeout=60):
url = 'http://search.oreilly.com/?t1=Books&t2=Format&t3=Ebook&q=' + urllib.quote_plus(query)
br = browser()
counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
@ -49,7 +48,7 @@ class OReillyStore(BasicStoreConfig, StorePlugin):
ebook = ' '.join(data.xpath('.//p[@class="note"]/text()'))
if 'ebook' not in ebook.lower():
continue
id = ''.join(data.xpath('./div[@class="book_text"]//p[@class="title"]/a/@href'))
cover_url = ''.join(data.xpath('./a/img[1]/@src'))
@ -61,7 +60,7 @@ class OReillyStore(BasicStoreConfig, StorePlugin):
# Get the detail here because we need to get the ebook id for the detail_item.
with closing(br.open(id, timeout=timeout)) as nf:
idoc = html.fromstring(nf.read())
for td in idoc.xpath('//td[@class="optionsTd"]'):
if 'ebook' in ''.join(td.xpath('.//text()')).lower():
price = ''.join(td.xpath('.//span[@class="price"]/text()')).strip()
@ -69,7 +68,7 @@ class OReillyStore(BasicStoreConfig, StorePlugin):
break
counter -= 1
s = SearchResult()
s.cover_url = cover_url.strip()
s.title = title.strip()
@ -78,5 +77,5 @@ class OReillyStore(BasicStoreConfig, StorePlugin):
s.price = price.strip()
s.drm = SearchResult.DRM_UNLOCKED
s.formats = formats.upper()
yield s

View File

@ -10,8 +10,6 @@ __docformat__ = 'restructuredtext en'
import traceback, cPickle, copy
from itertools import repeat
from collections import defaultdict
from functools import partial
from PyQt4.Qt import (QAbstractItemModel, QIcon, QVariant, QFont, Qt,
QMimeData, QModelIndex, pyqtSignal, QObject)

View File

@ -16,7 +16,7 @@ from PyQt4.Qt import (QWizard, QWizardPage, QPixmap, Qt, QAbstractListModel,
from calibre import __appname__, patheq
from calibre.library.database2 import LibraryDatabase2
from calibre.library.move import MoveLibrary
from calibre.constants import filesystem_encoding, iswindows
from calibre.constants import filesystem_encoding, iswindows, plugins
from calibre.gui2.wizard.send_email import smtp_prefs
from calibre.gui2.wizard.device_ui import Ui_WizardPage as DeviceUI
from calibre.gui2.wizard.library_ui import Ui_WizardPage as LibraryUI
@ -30,6 +30,9 @@ from calibre.gui2 import NONE, choose_dir, error_dialog
from calibre.gui2.dialogs.progress import ProgressDialog
from calibre.customize.ui import device_plugins
if iswindows:
winutil = plugins['winutil'][0]
# Devices {{{
class Device(object):
@ -302,13 +305,13 @@ class HanlinV5(HanlinV3):
class BeBook(HanlinV3):
name = 'BeBook'
manufacturer = 'Endless Ideas'
manufacturer = 'BeBook'
id = 'bebook'
class BeBookMini(HanlinV5):
name = 'BeBook Mini'
manufacturer = 'Endless Ideas'
manufacturer = 'BeBook'
id = 'bebook_mini'
class EZReader(HanlinV3):
@ -420,9 +423,9 @@ class KindlePage(QWizardPage, KindleUI):
def commit(self):
x = unicode(self.to_address.text()).strip()
parts = x.split('@')
if len(parts) < 2 or not parts[0]: return
if self.send_email_widget.set_email_settings(True):
if (self.send_email_widget.set_email_settings(True) and len(parts) >= 2
and parts[0]):
conf = smtp_prefs()
accounts = conf.parse().accounts
if not accounts: accounts = {}
@ -751,19 +754,20 @@ class LibraryPage(QWizardPage, LibraryUI):
self.default_library_name = None
if not lp:
fname = _('Calibre Library')
if isinstance(fname, unicode):
try:
fname = fname.encode(filesystem_encoding)
except:
fname = 'Calibre Library'
lp = os.path.expanduser('~'+os.sep+fname)
base = os.path.expanduser(u'~')
if iswindows:
x = winutil.special_folder_path(winutil.CSIDL_PERSONAL)
if x and os.access(x, os.W_OK):
base = x
lp = os.path.join(base, fname)
self.default_library_name = lp
if not os.path.exists(lp):
try:
os.makedirs(lp)
except:
traceback.print_exc()
lp = os.path.expanduser('~')
lp = os.path.expanduser(u'~')
self.location.setText(lp)
def isComplete(self):
@ -779,12 +783,10 @@ class LibraryPage(QWizardPage, LibraryUI):
oldloc = prefs['library_path']
newloc = unicode(self.location.text())
try:
newloce = newloc.encode(filesystem_encoding)
if self.default_library_name is not None and \
os.path.exists(self.default_library_name) and \
not os.listdir(self.default_library_name) and \
newloce != self.default_library_name:
os.rmdir(self.default_library_name)
dln = self.default_library_name
if (dln and os.path.exists(dln) and not os.listdir(dln) and newloc
!= dln):
os.rmdir(dln)
except:
pass
if not os.path.exists(newloc):

View File

@ -7,7 +7,8 @@ being closed.
"""
import tempfile, os, atexit, binascii, cPickle
from calibre.constants import __version__, __appname__
from calibre.constants import (__version__, __appname__,
get_unicode_windows_env_var, iswindows, get_windows_temp_path)
def cleanup(path):
try:
@ -47,7 +48,18 @@ def base_dir():
_base_dir = td
else:
base = os.environ.get('CALIBRE_TEMP_DIR', None)
if base is not None and iswindows:
base = get_unicode_windows_env_var('CALIBRE_TEMP_DIR')
prefix = app_prefix(u'tmp_')
if base is None and iswindows:
# On windows always use a unicode temp path, as for some
# localized (east asian) windows builds, there's no reliable
# way to escalate to unicode only when needed. See
# https://bugs.launchpad.net/bugs/937389 Hopefully, by now, the
# rest of calibre can deal with unicode temp paths. We'll leave
# temp paths as bytestring on Unix, as the temp dir on unix is
# very rarely non ascii anyway.
base = get_windows_temp_path()
try:
# First try an ascii path as that is what was done historically
# and we dont want to break working code
@ -66,7 +78,9 @@ def base_dir():
def _make_file(suffix, prefix, base):
try:
fd, name = tempfile.mkstemp(suffix, prefix, dir=base)
except UnicodeDecodeError:
except (UnicodeDecodeError, OSError):
# On some windows systems, we get an OSError because base is not
# unicode and windows cannot find the path pointed to by base
global _base_dir
from calibre.constants import filesystem_encoding
base_dir()
@ -79,7 +93,9 @@ def _make_file(suffix, prefix, base):
def _make_dir(suffix, prefix, base):
try:
tdir = tempfile.mkdtemp(suffix, prefix, base)
except ValueError:
except (ValueError, OSError):
# On some windows systems, we get an OSError because base is not
# unicode and windows cannot find the path pointed to by base
global _base_dir
from calibre.constants import filesystem_encoding
base_dir()

View File

@ -4,20 +4,14 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Builtin recipes.
'''
import re, imp, inspect, time, os
from calibre.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe, \
AutomaticNewsRecipe, CalibrePeriodical
import re, time, io
from calibre.web.feeds.news import (BasicNewsRecipe, CustomIndexRecipe,
AutomaticNewsRecipe, CalibrePeriodical)
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre import __appname__, english_sort
from calibre.utils.config import JSONConfig
BeautifulSoup, time, english_sort
basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, CustomIndexRecipe,
CalibrePeriodical)
_tdir = None
_crep = 0
custom_recipes = JSONConfig('custom_recipes/index.json')
@ -28,39 +22,33 @@ def custom_recipe_filename(id_, title):
def compile_recipe(src):
'''
Compile the code in src and return the first object that is a recipe or profile.
@param src: Python source code
@type src: string
@return: Recipe class or None, if no such class was found in C{src}
Compile the code in src and return a recipe object, if found.
:param src: Python source code as bytestring or unicode object
:return: Recipe class or None, if no such class was found in src
'''
global _tdir, _crep
if _tdir is None or not os.path.exists(_tdir):
_tdir = PersistentTemporaryDirectory('_recipes')
temp = os.path.join(_tdir, 'recipe%d.py'%_crep)
_crep += 1
if not isinstance(src, unicode):
match = re.search(r'coding[:=]\s*([-\w.]+)', src[:200])
enc = match.group(1) if match else 'utf-8'
src = src.decode(enc)
src = re.sub(r'from __future__.*', '', src)
f = open(temp, 'wb')
src = 'from %s.web.feeds.news import BasicNewsRecipe, AutomaticNewsRecipe\n'%__appname__ + src
src = '# coding: utf-8\n' + src
src = 'from __future__ import with_statement\n' + src
# Python complains if there is a coding declaration in a unicode string
src = re.sub(r'^#.*coding\s*[:=]\s*([-\w.]+)', '#', src, flags=re.MULTILINE)
# Translate newlines to \n
src = io.StringIO(src, newline=None).getvalue()
src = src.replace('from libprs500', 'from calibre').encode('utf-8')
f.write(src)
f.close()
module = imp.find_module(os.path.splitext(os.path.basename(temp))[0],
[os.path.dirname(temp)])
module = imp.load_module(os.path.splitext(os.path.basename(temp))[0], *module)
classes = inspect.getmembers(module,
lambda x : inspect.isclass(x) and \
issubclass(x, (BasicNewsRecipe,)) and \
x not in basic_recipes)
if not classes:
return None
namespace = {
'BasicNewsRecipe':BasicNewsRecipe,
'AutomaticNewsRecipe':AutomaticNewsRecipe,
'time':time, 're':re,
'BeautifulSoup':BeautifulSoup
}
exec src in namespace
return classes[0][1]
for x in namespace.itervalues():
if (isinstance(x, type) and issubclass(x, BasicNewsRecipe) and x not
in basic_recipes):
return x
return None