diff --git a/resources/images/news/hitro.png b/resources/images/news/hitro.png
new file mode 100644
index 0000000000..75c08a1c25
Binary files /dev/null and b/resources/images/news/hitro.png differ
diff --git a/resources/images/news/kamikaze.png b/resources/images/news/kamikaze.png
new file mode 100644
index 0000000000..49ef2f50a1
Binary files /dev/null and b/resources/images/news/kamikaze.png differ
diff --git a/resources/images/news/trombon.png b/resources/images/news/trombon.png
new file mode 100644
index 0000000000..641b04f1b7
Binary files /dev/null and b/resources/images/news/trombon.png differ
diff --git a/resources/images/news/wallstreetro.png b/resources/images/news/wallstreetro.png
new file mode 100644
index 0000000000..d72bc70ca0
Binary files /dev/null and b/resources/images/news/wallstreetro.png differ
diff --git a/resources/recipes/el_pais_babelia.recipe b/resources/recipes/el_pais_babelia.recipe
new file mode 100644
index 0000000000..31b983ec0b
--- /dev/null
+++ b/resources/recipes/el_pais_babelia.recipe
@@ -0,0 +1,49 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ElPaisBabelia(BasicNewsRecipe):
+
+    title = 'El Pais Babelia'
+    __author__ = 'oneillpt'
+    description = 'El Pais Babelia'
+    INDEX = 'http://www.elpais.com/suple/babelia/'
+    language = 'es'
+
+    remove_tags_before = dict(name='div', attrs={'class':'estructura_2col'})
+    keep_tags = [dict(name='div', attrs={'class':'estructura_2col'})]
+    remove_tags = [dict(name='div', attrs={'class':'votos estirar'}),
+        dict(name='div', attrs={'id':'utilidades'}),
+        dict(name='div', attrs={'class':'info_relacionada'}),
+        dict(name='div', attrs={'class':'mod_apoyo'}),
+        dict(name='div', attrs={'class':'contorno_f'}),
+        dict(name='div', attrs={'class':'pestanias'}),
+        dict(name='div', attrs={'class':'otros_webs'}),
+        dict(name='div', attrs={'id':'pie'})
+        ]
+    #no_stylesheets = True
+    remove_javascript = True
+
+    def parse_index(self):
+        articles = []
+        soup = self.index_to_soup(self.INDEX)
+        feeds = []
+        for section in soup.findAll('div', attrs={'class':'contenedor_nuevo'}):
+            section_title = self.tag_to_string(section.find('h1'))
+            articles = []
+            for post in section.findAll('a', href=True):
+                url = post['href']
+                if url.startswith('/'):
+                    url = 'http://www.elpais.es'+url
+                title = self.tag_to_string(post)
+                if str(post).find('class=') > 0:
+                    klass = post['class']
+                    if klass != "":
+                        self.log()
+                        self.log('--> post: ', post)
+                        self.log('--> url: ', url)
+                        self.log('--> title: ', title)
+                        self.log('--> class: ', klass)
+                        articles.append({'title':title, 'url':url})
+            if articles:
+                feeds.append((section_title, articles))
+        return feeds
+
diff --git a/resources/recipes/evz.ro.recipe b/resources/recipes/evz.ro.recipe
index bce151d1fc..841dc80429 100644
--- a/resources/recipes/evz.ro.recipe
+++ b/resources/recipes/evz.ro.recipe
@@ -1,52 +1,54 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python
+
 __license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic '
+__copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
 evz.ro
 '''
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
-class EVZ_Ro(BasicNewsRecipe):
-    title = 'evz.ro'
-    __author__ = 'Darko Miletic'
-    description = 'News from Romania'
-    publisher = 'evz.ro'
-    category = 'news, politics, Romania'
-    oldest_article = 2
-    max_articles_per_feed = 200
-    no_stylesheets = True
-    encoding = 'utf8'
-    use_embedded_content = False
+class EvenimentulZilei(BasicNewsRecipe):
+    title = u'Evenimentul Zilei'
+    __author__ = u'Silviu Cotoar\u0103'
+    description = ''
+    publisher = u'Evenimentul Zilei'
+    oldest_article = 5
     language = 'ro'
-    masthead_url = 'http://www.evz.ro/fileadmin/images/logo.gif'
-    extra_css = ' body{font-family: Georgia,Arial,Helvetica,sans-serif } .firstP{font-size: 1.125em} .author,.articleInfo{font-size: small} '
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    category = 'Ziare,Stiri'
+    encoding = 'utf-8'
+    cover_url = 'http://www.evz.ro/fileadmin/images/evzLogo.png'
 
     conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-        }
+        'comments' : description
+        ,'tags' : category
+        ,'language' : language
+        ,'publisher' : publisher
+    }
 
-    preprocess_regexps = [
-        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE),lambda match: '<head><title>')
-        ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
-    ]
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'single'})
+        , dict(name='img', attrs={'id':'placeholder'})
+        , dict(name='a', attrs={'id':'holderlink'})
+    ]
 
-    remove_tags = [
-        dict(name=['form','embed','iframe','object','base','link','script','noscript'])
-        ,dict(attrs={'class':['section','statsInfo','email il']})
-        ,dict(attrs={'id' :'gallery'})
-    ]
+    remove_tags = [
+        dict(name='p', attrs={'class':['articleInfo']})
+        , dict(name='div', attrs={'id':['bannerAddoceansArticleJos']})
+        , dict(name='div', attrs={'id':['bannerAddoceansArticle']})
+    ]
 
-    remove_tags_after = dict(attrs={'class':'section'})
-    keep_only_tags = [dict(attrs={'class':'single'})]
-    remove_attributes = ['height','width']
+    remove_tags_after = [
+        dict(name='div', attrs={'id':['bannerAddoceansArticleJos']})
+    ]
 
-    feeds = [(u'Articles', u'http://www.evz.ro/rss.xml')]
+    feeds = [
+        (u'Feeds', u'http://www.evz.ro/rss.xml')
+    ]
 
     def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+        return self.adeify_images(soup)
 
diff --git a/resources/recipes/hitro.recipe b/resources/recipes/hitro.recipe
new file mode 100644
index 0000000000..3a85847c81
--- /dev/null
+++ b/resources/recipes/hitro.recipe
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = u'2011, Silviu Cotoar\u0103'
+'''
+hit.ro
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Hit(BasicNewsRecipe):
+    title = u'HIT'
+    __author__ = u'Silviu Cotoar\u0103'
+    description = 'IT'
+    publisher = 'HIT'
+    oldest_article = 5
+    language = 'ro'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    category = 'Ziare,Reviste,IT'
+    encoding = 'utf-8'
+    cover_url = 'http://www.hit.ro/lib/images/frontend/hit_logo.png'
+
+    conversion_options = {
+        'comments' : description
+        ,'tags' : category
+        ,'language' : language
+        ,'publisher' : publisher
+    }
+
+    keep_only_tags = [
+        dict(name='h1', attrs={'class':'art_titl'})
+        , dict(name='div', attrs={'id':'continut_articol'})
+    ]
+
+    feeds = [
+        (u'Feeds', u'http://www.hit.ro/rss')
+    ]
+
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)
diff --git a/resources/recipes/kamikaze.recipe b/resources/recipes/kamikaze.recipe
new file mode 100644
index 0000000000..1369cb6f85
--- /dev/null
+++ b/resources/recipes/kamikaze.recipe
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = u'2011, Silviu Cotoar\u0103'
+'''
+kamikazeonline.ro
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Kamikaze(BasicNewsRecipe):
+    title = u'Kamikaze'
+    __author__ = u'Silviu Cotoar\u0103'
+    description = u'S\u0103pt\u0103m\u00e2nal sc\u0103pat de sub control'
+    publisher = 'Kamikaze'
+    oldest_article = 5
+    language = 'ro'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    category = 'Ziare,Reviste'
+    encoding = 'utf-8'
+    cover_url = 'http://www.kamikazeonline.ro/wp-content/themes/kamikaze/images/kamikazeonline_header.gif'
+
+    conversion_options = {
+        'comments' : description
+        ,'tags' : category
+        ,'language' : language
+        ,'publisher' : publisher
+    }
+
+    keep_only_tags = [
+        dict(name='div', attrs={'id':'content'})
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':['connect_confirmation_cell connect_confirmation_cell_no_like']})
+        , dict(name='h3', attrs={'id':['comments']})
+        , dict(name='ul', attrs={'class':['addtoany_list']})
+        , dict(name='p', attrs={'class':['postmetadata']})
+    ]
+
+    remove_tags_after = [
+        dict(name='p', attrs={'class':['postmetadata']})
+    ]
+
+    feeds = [
+        (u'Feeds', u'http://www.kamikazeonline.ro/feed/')
+    ]
+
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)
diff --git a/resources/recipes/kompiutierra.recipe b/resources/recipes/kompiutierra.recipe
index 0d30afa3a7..a82db9aced 100644
--- a/resources/recipes/kompiutierra.recipe
+++ b/resources/recipes/kompiutierra.recipe
@@ -1,36 +1,37 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL v3'
-__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com'
-__author__ = 'Vadim Dyadkin'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class Computerra(BasicNewsRecipe):
-    title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430'
-    recursion = 50
-    oldest_article = 100
-    __author__ = 'Vadim Dyadkin'
-    max_articles_per_feed = 100
-    use_embedded_content = False
-    simultaneous_downloads = 5
-    language = 'ru'
-    description = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u044b, \u043e\u043a\u043e\u043b\u043e\u043d\u0430\u0443\u0447\u043d\u044b\u0435 \u0438 \u043e\u043a\u043e\u043b\u043e\u0444\u0438\u043b\u043e\u0441\u043e\u0444\u0441\u043a\u0438\u0435 \u0441\u0442\u0430\u0442\u044c\u0438, \u0433\u0430\u0434\u0436\u0435\u0442\u044b.'
-
-    keep_only_tags = [dict(name='div', attrs={'id': 'content'}),]
-
-
-    feeds = [(u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430', 'http://feeds.feedburner.com/ct_news/'),]
-
-    remove_tags = [dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}),
-        dict(name='ul', attrs={'class': "related_post"}),
-        dict(name='p', attrs={'class': 'info'}),
-        dict(name='a', attrs={'rel': 'tag', 'class': 'twitter-share-button', 'type': 'button_count'}),
-        dict(name='h2', attrs={}),]
-
-    extra_css = 'body { text-align: justify; }'
-
-    def get_article_url(self, article):
-        return article.get('feedburner:origLink', article.get('guid'))
-
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com'
+__author__ = 'Vadim Dyadkin'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Computerra(BasicNewsRecipe):
+    title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430'
+    oldest_article = 100
+    __author__ = 'Vadim Dyadkin (edited by A. Chewi)'
+    max_articles_per_feed = 50
+    use_embedded_content = False
+    remove_javascript = True
+    no_stylesheets = True
+    conversion_options = {'linearize_tables' : True}
+    simultaneous_downloads = 5
+    language = 'ru'
+    description = u'Компьютерра: все новости про компьютеры, железо, новые технологии, информационные технологии'
+
+    keep_only_tags = [dict(name='div', attrs={'id': 'content'}),]
+
+    feeds = [(u'Компьютерра-Онлайн', 'http://feeds.feedburner.com/ct_news/'),]
+
+    remove_tags = [
+        dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}),
+        dict(name='ul', attrs={'class': "related_post"}),
+        dict(name='p', attrs={'class': 'info'}),
+        dict(name='a', attrs={'class': 'twitter-share-button'}),
+        dict(name='a', attrs={'type': 'button_count'}),
+        dict(name='h2', attrs={})
+    ]
+
+    def print_version(self, url):
+        return url + '?print=true'
diff --git a/resources/recipes/nationalgeoro.recipe b/resources/recipes/nationalgeoro.recipe
index a3c5727d38..8f989be74d 100644
--- a/resources/recipes/nationalgeoro.recipe
+++ b/resources/recipes/nationalgeoro.recipe
@@ -14,7 +14,7 @@ class NationalGeoRo(BasicNewsRecipe):
     __author__ = u'Silviu Cotoar\u0103'
     description = u'S\u0103 avem grij\u0103 de planet\u0103'
     publisher = 'National Geographic'
-    oldest_article = 5
+    oldest_article = 35
     language = 'ro'
     max_articles_per_feed = 100
     no_stylesheets = True
diff --git a/resources/recipes/nrc-nl-epub.recipe b/resources/recipes/nrc-nl-epub.recipe
index da9b9195ce..2d190e4d0a 100644
--- a/resources/recipes/nrc-nl-epub.recipe
+++ b/resources/recipes/nrc-nl-epub.recipe
@@ -1,14 +1,14 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
 # -*- coding: utf-8 -*-
-#Based on Lars Jacob's Taz Digiabo recipe
+#Based on veezh's original recipe and Kovid Goyal's New York Times recipe
 
 __license__ = 'GPL v3'
-__copyright__ = '2010, veezh'
+__copyright__ = '2011, Snaab'
 
 '''
 www.nrc.nl
 '''
-import os, urllib2, zipfile
+import os, zipfile
 import time
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
@@ -17,41 +17,59 @@ from calibre.ptempfile import PersistentTemporaryFile
 
 class NRCHandelsblad(BasicNewsRecipe):
 
     title = u'NRC Handelsblad'
-    description = u'De EPUB-versie van NRC'
+    description = u'De ePaper-versie van NRC'
     language = 'nl'
     lang = 'nl-NL'
+    needs_subscription = True
 
-    __author__ = 'veezh'
+    __author__ = 'Snaab'
 
     conversion_options = {
        'no_default_epub_cover' : True
    }
 
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            br.open('http://login.nrc.nl/login')
+            br.select_form(nr=0)
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+
     def build_index(self):
+
         today = time.strftime("%Y%m%d")
+
         domain = "http://digitaleeditie.nrc.nl"
 
         url = domain + "/digitaleeditie/helekrant/epub/nrc_" + today + ".epub"
-#        print url
+        #print url
 
         try:
-            f = urllib2.urlopen(url)
-        except urllib2.HTTPError:
+            br = self.get_browser()
+            f = br.open(url)
+        except:
             self.report_progress(0,_('Kan niet inloggen om editie te downloaden'))
             raise ValueError('Krant van vandaag nog niet beschikbaar')
+
         tmp = PersistentTemporaryFile(suffix='.epub')
         self.report_progress(0,_('downloading epub'))
         tmp.write(f.read())
-        tmp.close()
-
-        zfile = zipfile.ZipFile(tmp.name, 'r')
-        self.report_progress(0,_('extracting epub'))
-
-        zfile.extractall(self.output_dir)
+        f.close()
+        br.close()
+        if zipfile.is_zipfile(tmp):
+            try:
+                zfile = zipfile.ZipFile(tmp.name, 'r')
+                zfile.extractall(self.output_dir)
+                self.report_progress(0,_('extracting epub'))
+            except zipfile.BadZipfile:
+                self.report_progress(0,_('BadZip error, continuing'))
 
         tmp.close()
-        index = os.path.join(self.output_dir, 'content.opf')
+        index = os.path.join(self.output_dir, 'metadata.opf')
 
         self.report_progress(1,_('epub downloaded and extracted'))
 
diff --git a/resources/recipes/trombon.recipe b/resources/recipes/trombon.recipe
new file mode 100644
index 0000000000..1a4e488a43
--- /dev/null
+++ b/resources/recipes/trombon.recipe
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = u'2011, Silviu Cotoar\u0103'
+'''
+trombon.ro
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Trombon(BasicNewsRecipe):
+    title = u'Trombon'
+    __author__ = u'Silviu Cotoar\u0103'
+    description = u'Parodii si Pamflete'
+    publisher = u'Trombon'
+    oldest_article = 5
+    language = 'ro'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    category = 'Ziare,Reviste,Fun'
+    encoding = 'utf-8'
+    cover_url = 'http://www.trombon.ro/i/trombon.gif'
+
+    conversion_options = {
+        'comments' : description
+        ,'tags' : category
+        ,'language' : language
+        ,'publisher' : publisher
+    }
+
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'articol'})
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':['info_2']})
+        , dict(name='iframe', attrs={'scrolling':['no']})
+    ]
+
+    remove_tags_after = [
+        dict(name='div', attrs={'id':'article_vote'})
+    ]
+
+    feeds = [
+        (u'Feeds', u'http://feeds.feedburner.com/trombon/ABWb?format=xml')
+    ]
+
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)
diff --git a/resources/recipes/wallstreetro.recipe b/resources/recipes/wallstreetro.recipe
new file mode 100644
index 0000000000..8a66aa3673
--- /dev/null
+++ b/resources/recipes/wallstreetro.recipe
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = u'2011, Silviu Cotoar\u0103'
+'''
+wall-street.ro
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class WallStreetRo(BasicNewsRecipe):
+    title = u'Wall Street'
+    __author__ = u'Silviu Cotoar\u0103'
+    description = ''
+    publisher = 'Wall Street'
+    oldest_article = 5
+    language = 'ro'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    category = 'Ziare'
+    encoding = 'utf-8'
+    cover_url = 'http://img.wall-street.ro/images/WS_new_logo.jpg'
+
+    conversion_options = {
+        'comments' : description
+        ,'tags' : category
+        ,'language' : language
+        ,'publisher' : publisher
+    }
+
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'article_header'})
+        , dict(name='div', attrs={'class':'article_text'})
+    ]
+
+    remove_tags = [
+        dict(name='p', attrs={'class':['page_breadcrumbs']})
+        , dict(name='div', attrs={'id':['article_user_toolbox']})
+        , dict(name='p', attrs={'class':['comments_count_container']})
+        , dict(name='div', attrs={'class':['article_left_column']})
+    ]
+
+    remove_tags_after = [
+        dict(name='div', attrs={'class':'clearfloat'})
+    ]
+
+    feeds = [
+        (u'Feeds', u'http://img.wall-street.ro/rssfeeds/wall-street.xml')
+    ]
+
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)
diff --git a/src/calibre/ebooks/comic/input.py b/src/calibre/ebooks/comic/input.py
index 7710d41fb3..56fa123249 100755
--- a/src/calibre/ebooks/comic/input.py
+++ b/src/calibre/ebooks/comic/input.py
@@ -131,9 +131,12 @@ class PageProcessor(list): # {{{
                     newsizey = int(newsizex / aspect)
                     deltax = 0
                     deltay = (SCRHEIGHT - newsizey) / 2
-                wand.size = (newsizex, newsizey)
-                wand.set_border_color(pw)
-                wand.add_border(pw, deltax, deltay)
+                if newsizex < 20000 and newsizey < 20000:
+                    # Too large and resizing fails, so better
+                    # to leave it as original size
+                    wand.size = (newsizex, newsizey)
+                    wand.set_border_color(pw)
+                    wand.add_border(pw, deltax, deltay)
             elif self.opts.wide:
                 # Keep aspect and Use device height as scaled image width so landscape mode is clean
                 aspect = float(sizex) / float(sizey)
@@ -152,11 +155,15 @@
                     newsizey = int(newsizex / aspect)
                     deltax = 0
                     deltay = (wscreeny - newsizey) / 2
-                wand.size = (newsizex, newsizey)
-                wand.set_border_color(pw)
-                wand.add_border(pw, deltax, deltay)
+                if newsizex < 20000 and newsizey < 20000:
+                    # Too large and resizing fails, so better
+                    # to leave it as original size
+                    wand.size = (newsizex, newsizey)
+                    wand.set_border_color(pw)
+                    wand.add_border(pw, deltax, deltay)
             else:
-                wand.size = (SCRWIDTH, SCRHEIGHT)
+                if SCRWIDTH < 20000 and SCRHEIGHT < 20000:
+                    wand.size = (SCRWIDTH, SCRHEIGHT)
 
             if not self.opts.dont_sharpen:
                 wand.sharpen(0.0, 1.0)
diff --git a/src/calibre/ebooks/snb/snbfile.py b/src/calibre/ebooks/snb/snbfile.py
index e42533f241..9a7d65e417 100644
--- a/src/calibre/ebooks/snb/snbfile.py
+++ b/src/calibre/ebooks/snb/snbfile.py
@@ -75,15 +75,20 @@
                 for i in range(self.plainBlock):
                     bzdc = bz2.BZ2Decompressor()
                     if (i < self.plainBlock - 1):
-                        bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset;
+                        bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset
                     else:
-                        bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset;
-                    snbFile.seek(self.blocks[self.binBlock + i].Offset);
+                        bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset
+                    snbFile.seek(self.blocks[self.binBlock + i].Offset)
                     try:
                         data = snbFile.read(bSize)
-                        uncompressedData += bzdc.decompress(data)
+                        if len(data) < 32768:
+                            uncompressedData += bzdc.decompress(data)
+                        else:
+                            uncompressedData += data
                     except Exception, e:
                         print e
+                if len(uncompressedData) != self.plainStreamSizeUncompressed:
+                    raise Exception()
                 f.fileBody = uncompressedData[plainPos:plainPos+f.fileSize]
                 plainPos += f.fileSize
             elif f.attr & 0x01000000 == 0x01000000: