From 60c7c6634936f7c543ff25826630bc425d44e952 Mon Sep 17 00:00:00 2001 From: Li Fanxi Date: Wed, 2 Mar 2011 23:25:53 +0800 Subject: [PATCH 1/8] [Bug] Workaround a strange problem when extracting some SNB files with PDF contents. --- src/calibre/ebooks/snb/snbfile.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/snb/snbfile.py b/src/calibre/ebooks/snb/snbfile.py index e42533f241..10aa6a8715 100644 --- a/src/calibre/ebooks/snb/snbfile.py +++ b/src/calibre/ebooks/snb/snbfile.py @@ -75,15 +75,18 @@ class SNBFile: for i in range(self.plainBlock): bzdc = bz2.BZ2Decompressor() if (i < self.plainBlock - 1): - bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset; + bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset else: - bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset; - snbFile.seek(self.blocks[self.binBlock + i].Offset); + bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset + snbFile.seek(self.blocks[self.binBlock + i].Offset) try: data = snbFile.read(bSize) - uncompressedData += bzdc.decompress(data) + if len(data) < 32768: + uncompressedData += bzdc.decompress(data) except Exception, e: print e + if len(uncompressedData) != self.plainStreamSizeUncompressed: + raise Exception() f.fileBody = uncompressedData[plainPos:plainPos+f.fileSize] plainPos += f.fileSize elif f.attr & 0x01000000 == 0x01000000: From e373c822e2155f7595b1291764cb3030f418f282 Mon Sep 17 00:00:00 2001 From: Li Fanxi Date: Thu, 3 Mar 2011 00:06:34 +0800 Subject: [PATCH 2/8] [Bug] A better way to workaround a strange problem when extracting some SNB files with PDF contents. --- src/calibre/ebooks/snb/snbfile.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/snb/snbfile.py b/src/calibre/ebooks/snb/snbfile.py index 10aa6a8715..9a7d65e417 100644 --- a/src/calibre/ebooks/snb/snbfile.py +++ b/src/calibre/ebooks/snb/snbfile.py @@ -83,6 +83,8 @@ class SNBFile: data = snbFile.read(bSize) if len(data) < 32768: uncompressedData += bzdc.decompress(data) + else: + uncompressedData += data except Exception, e: print e if len(uncompressedData) != self.plainStreamSizeUncompressed: From bf68165605aa3465e4678ecc9abc07c2a020677a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 7 Mar 2011 08:50:14 -0700 Subject: [PATCH 3/8] Update NRC Handelsblad Epub version --- resources/recipes/nrc-nl-epub.recipe | 50 +++++++++++++++++++--------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/resources/recipes/nrc-nl-epub.recipe b/resources/recipes/nrc-nl-epub.recipe index da9b9195ce..2d190e4d0a 100644 --- a/resources/recipes/nrc-nl-epub.recipe +++ b/resources/recipes/nrc-nl-epub.recipe @@ -1,14 +1,14 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # -*- coding: utf-8 -*- -#Based on Lars Jacob's Taz Digiabo recipe +#Based on veezh's original recipe and Kovid Goyal's New York Times recipe __license__ = 'GPL v3' -__copyright__ = '2010, veezh' +__copyright__ = '2011, Snaab' ''' www.nrc.nl ''' -import os, urllib2, zipfile +import os, zipfile import time from calibre.web.feeds.news import BasicNewsRecipe from calibre.ptempfile import PersistentTemporaryFile @@ -17,41 +17,59 @@ from calibre.ptempfile import PersistentTemporaryFile class NRCHandelsblad(BasicNewsRecipe): title = u'NRC Handelsblad' - description = u'De EPUB-versie van NRC' + description = u'De ePaper-versie van NRC' language = 'nl' lang = 'nl-NL' + needs_subscription = True - __author__ = 'veezh' + __author__ = 'Snaab' conversion_options = { 'no_default_epub_cover' : True } + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://login.nrc.nl/login') + br.select_form(nr=0) + br['username'] = self.username + br['password'] = self.password + br.submit() + return br + def build_index(self): + today = time.strftime("%Y%m%d") + domain = "http://digitaleeditie.nrc.nl" url = domain + "/digitaleeditie/helekrant/epub/nrc_" + today + ".epub" -# print url + #print url try: - f = urllib2.urlopen(url) - except urllib2.HTTPError: + br = self.get_browser() + f = br.open(url) + except: self.report_progress(0,_('Kan niet inloggen om editie te downloaden')) raise ValueError('Krant van vandaag nog niet beschikbaar') + tmp = PersistentTemporaryFile(suffix='.epub') self.report_progress(0,_('downloading epub')) tmp.write(f.read()) - tmp.close() - - zfile = zipfile.ZipFile(tmp.name, 'r') - self.report_progress(0,_('extracting epub')) - - zfile.extractall(self.output_dir) + f.close() + br.close() + if zipfile.is_zipfile(tmp): + try: + zfile = zipfile.ZipFile(tmp.name, 'r') + zfile.extractall(self.output_dir) + self.report_progress(0,_('extracting epub')) + except zipfile.BadZipfile: + self.report_progress(0,_('BadZip error, continuing')) tmp.close() - index = os.path.join(self.output_dir, 'content.opf') + index = os.path.join(self.output_dir, 'metadata.opf') self.report_progress(1,_('epub downloaded and extracted')) From 450b8f4341379d69e4f24de34971b591f9dce9f3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 7 Mar 2011 08:56:30 -0700 Subject: [PATCH 4/8] El Pais Babelia by oneillpt --- resources/recipes/el_pais_babelia.recipe | 49 ++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 resources/recipes/el_pais_babelia.recipe diff --git a/resources/recipes/el_pais_babelia.recipe b/resources/recipes/el_pais_babelia.recipe new file mode 100644 index 0000000000..31b983ec0b --- /dev/null +++ b/resources/recipes/el_pais_babelia.recipe @@ -0,0 +1,49 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ElPaisBabelia(BasicNewsRecipe): + + title = 'El Pais Babelia' + __author__ = 'oneillpt' + description = 'El Pais Babelia' + INDEX = 'http://www.elpais.com/suple/babelia/' + language = 'es' + + remove_tags_before = dict(name='div', attrs={'class':'estructura_2col'}) + keep_tags = [dict(name='div', attrs={'class':'estructura_2col'})] + remove_tags = [dict(name='div', attrs={'class':'votos estirar'}), + dict(name='div', attrs={'id':'utilidades'}), + dict(name='div', attrs={'class':'info_relacionada'}), + dict(name='div', attrs={'class':'mod_apoyo'}), + dict(name='div', attrs={'class':'contorno_f'}), + dict(name='div', attrs={'class':'pestanias'}), + dict(name='div', attrs={'class':'otros_webs'}), + dict(name='div', attrs={'id':'pie'}) + ] + #no_stylesheets = True + remove_javascript = True + + def parse_index(self): + articles = [] + soup = self.index_to_soup(self.INDEX) + feeds = [] + for section in soup.findAll('div', attrs={'class':'contenedor_nuevo'}): + section_title = self.tag_to_string(section.find('h1')) + articles = [] + for post in section.findAll('a', href=True): + url = post['href'] + if url.startswith('/'): + url = 'http://www.elpais.es'+url + title = self.tag_to_string(post) + if str(post).find('class=') > 0: + klass = post['class'] + if klass != "": + self.log() + self.log('--> post: ', post) + self.log('--> url: ', url) + self.log('--> title: ', title) + self.log('--> class: ', klass) + articles.append({'title':title, 'url':url}) + if articles: + feeds.append((section_title, articles)) + return feeds + From c384f93e4b0a2859344ff59f8a63b51e3e6f4d84 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 7 Mar 2011 09:08:28 -0700 Subject: [PATCH 5/8] Comic Input: Fix conversion failing when output profile is set to Tablet Output --- src/calibre/ebooks/comic/input.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/comic/input.py b/src/calibre/ebooks/comic/input.py index 7710d41fb3..56fa123249 100755 --- a/src/calibre/ebooks/comic/input.py +++ b/src/calibre/ebooks/comic/input.py @@ -131,9 +131,12 @@ class PageProcessor(list): # {{{ newsizey = int(newsizex / aspect) deltax = 0 deltay = (SCRHEIGHT - newsizey) / 2 - wand.size = (newsizex, newsizey) - wand.set_border_color(pw) - wand.add_border(pw, deltax, deltay) + if newsizex < 20000 and newsizey < 20000: + # Too large and resizing fails, so better + # to leave it as original size + wand.size = (newsizex, newsizey) + wand.set_border_color(pw) + wand.add_border(pw, deltax, deltay) elif self.opts.wide: # Keep aspect and Use device height as scaled image width so landscape mode is clean aspect = float(sizex) / float(sizey) @@ -152,11 +155,15 @@ class PageProcessor(list): # {{{ newsizey = int(newsizex / aspect) deltax = 0 deltay = (wscreeny - newsizey) / 2 - wand.size = (newsizex, newsizey) - wand.set_border_color(pw) - wand.add_border(pw, deltax, deltay) + if newsizex < 20000 and newsizey < 20000: + # Too large and resizing fails, so better + # to leave it as original size + wand.size = (newsizex, newsizey) + wand.set_border_color(pw) + wand.add_border(pw, deltax, deltay) else: - wand.size = (SCRWIDTH, SCRHEIGHT) + if SCRWIDTH < 20000 and SCRHEIGHT < 20000: + wand.size = (SCRWIDTH, SCRHEIGHT) if not self.opts.dont_sharpen: wand.sharpen(0.0, 1.0) From 787ead16a2158af8d1dcee10151823ff2bc865eb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 7 Mar 2011 09:16:18 -0700 Subject: [PATCH 6/8] Updated Kompiuterra --- resources/recipes/kompiutierra.recipe | 73 ++++++++++++++------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/resources/recipes/kompiutierra.recipe b/resources/recipes/kompiutierra.recipe index 0d30afa3a7..528285b26c 100644 --- a/resources/recipes/kompiutierra.recipe +++ b/resources/recipes/kompiutierra.recipe @@ -1,36 +1,37 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com' -__author__ = 'Vadim Dyadkin' - -from calibre.web.feeds.news import BasicNewsRecipe - -class Computerra(BasicNewsRecipe): - title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430' - recursion = 50 - oldest_article = 100 - __author__ = 'Vadim Dyadkin' - max_articles_per_feed = 100 - use_embedded_content = False - simultaneous_downloads = 5 - language = 'ru' - description = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u044b, \u043e\u043a\u043e\u043b\u043e\u043d\u0430\u0443\u0447\u043d\u044b\u0435 \u0438 \u043e\u043a\u043e\u043b\u043e\u0444\u0438\u043b\u043e\u0441\u043e\u0444\u0441\u043a\u0438\u0435 \u0441\u0442\u0430\u0442\u044c\u0438, \u0433\u0430\u0434\u0436\u0435\u0442\u044b.' - - keep_only_tags = [dict(name='div', attrs={'id': 'content'}),] - - - feeds = [(u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430', 'http://feeds.feedburner.com/ct_news/'),] - - remove_tags = [dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}), - dict(name='ul', attrs={'class': "related_post"}), - dict(name='p', attrs={'class': 'info'}), - dict(name='a', attrs={'rel': 'tag', 'class': 'twitter-share-button', 'type': 'button_count'}), - dict(name='h2', attrs={}),] - - extra_css = 'body { text-align: justify; }' - - def get_article_url(self, article): - return article.get('feedburner:origLink', article.get('guid')) - +#!/usr/bin/python +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com' +__author__ = 'Vadim Dyadkin' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Computerra(BasicNewsRecipe): + title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430' + oldest_article = 100 + __author__ = 'Vadim Dyadkin (edited by A. Chewi)' + max_articles_per_feed = 50 + use_embedded_content = False + remove_javascript = True + no_stylesheets = True + conversion_options = {'linearize_tables' : True} + simultaneous_downloads = 5 + language = 'ru' + description = u'Компьютерра: все новости про компьютеры, железо, новые технологии, информационные технологии' + + keep_only_tags = [dict(name='div', attrs={'id': 'content'}),] + + feeds = [(u'Компьютерра-Онлайн', 'http://feeds.feedburner.com/ct_news/'),] + + remove_tags = [ + dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}), + dict(name='ul', attrs={'class': "related_post"}), + dict(name='p', attrs={'class': 'info'}), + dict(name='a', attrs={'class': 'twitter-share-button'}), + dict(name='a', attrs={'type': 'button_count'}), + dict(name='h2', attrs={}) + ] + + def print_version(self, url): + return url + '?print=true' From c9234596bd481e3f73a9e99d18bdbc787f7981bb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 7 Mar 2011 11:22:26 -0700 Subject: [PATCH 7/8] Improve evz.ro --- resources/recipes/evz.ro.recipe | 74 +++++++++++++------------- resources/recipes/nationalgeoro.recipe | 2 +- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/resources/recipes/evz.ro.recipe b/resources/recipes/evz.ro.recipe index bce151d1fc..841dc80429 100644 --- a/resources/recipes/evz.ro.recipe +++ b/resources/recipes/evz.ro.recipe @@ -1,52 +1,54 @@ +# -*- coding: utf-8 -*- +#!/usr/bin/env python + __license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' +__copyright__ = u'2011, Silviu Cotoar\u0103' ''' evz.ro ''' -import re from calibre.web.feeds.news import BasicNewsRecipe -class EVZ_Ro(BasicNewsRecipe): - title = 'evz.ro' - __author__ = 'Darko Miletic' - description = 'News from Romania' - publisher = 'evz.ro' - category = 'news, politics, Romania' - oldest_article = 2 - max_articles_per_feed = 200 - no_stylesheets = True - encoding = 'utf8' - use_embedded_content = False +class EvenimentulZilei(BasicNewsRecipe): + title = u'Evenimentul Zilei' + __author__ = u'Silviu Cotoar\u0103' + description = '' + publisher = u'Evenimentul Zilei' + oldest_article = 5 language = 'ro' - masthead_url = 'http://www.evz.ro/fileadmin/images/logo.gif' - extra_css = ' body{font-family: Georgia,Arial,Helvetica,sans-serif } .firstP{font-size: 1.125em} .author,.articleInfo{font-size: small} ' + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + category = 'Ziare,Stiri' + encoding = 'utf-8' + cover_url = 'http://www.evz.ro/fileadmin/images/evzLogo.png' conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } - preprocess_regexps = [ - (re.compile(r'.*?', re.DOTALL|re.IGNORECASE),lambda match: '<head><title>') - ,(re.compile(r'.*?', re.DOTALL|re.IGNORECASE),lambda match: '') - ] + keep_only_tags = [ + dict(name='div', attrs={'class':'single'}) + , dict(name='img', attrs={'id':'placeholder'}) + , dict(name='a', attrs={'id':'holderlink'}) + ] - remove_tags = [ - dict(name=['form','embed','iframe','object','base','link','script','noscript']) - ,dict(attrs={'class':['section','statsInfo','email il']}) - ,dict(attrs={'id' :'gallery'}) - ] + remove_tags = [ + dict(name='p', attrs={'class':['articleInfo']}) + , dict(name='div', attrs={'id':['bannerAddoceansArticleJos']}) + , dict(name='div', attrs={'id':['bannerAddoceansArticle']}) + ] - remove_tags_after = dict(attrs={'class':'section'}) - keep_only_tags = [dict(attrs={'class':'single'})] - remove_attributes = ['height','width'] + remove_tags_after = [ + dict(name='div', attrs={'id':['bannerAddoceansArticleJos']}) + ] - feeds = [(u'Articles', u'http://www.evz.ro/rss.xml')] + feeds = [ + (u'Feeds', u'http://www.evz.ro/rss.xml') + ] def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - return soup + return self.adeify_images(soup) diff --git a/resources/recipes/nationalgeoro.recipe b/resources/recipes/nationalgeoro.recipe index a3c5727d38..8f989be74d 100644 --- a/resources/recipes/nationalgeoro.recipe +++ b/resources/recipes/nationalgeoro.recipe @@ -14,7 +14,7 @@ class NationalGeoRo(BasicNewsRecipe): __author__ = u'Silviu Cotoar\u0103' description = u'S\u0103 avem grij\u0103 de planet\u0103' publisher = 'National Geographic' - oldest_article = 5 + oldest_article = 35 language = 'ro' max_articles_per_feed = 100 no_stylesheets = True From c1c17aaf9d7149317aef66e3f3918fea58e71eab Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 7 Mar 2011 11:34:55 -0700 Subject: [PATCH 8/8] Various Romanian news sources by Silviu Cotoara --- resources/images/news/hitro.png | Bin 0 -> 521 bytes resources/images/news/kamikaze.png | Bin 0 -> 262 bytes resources/images/news/trombon.png | Bin 0 -> 375 bytes resources/images/news/wallstreetro.png | Bin 0 -> 768 bytes resources/recipes/hitro.recipe | 43 ++++++++++++++++++++ resources/recipes/kamikaze.recipe | 53 ++++++++++++++++++++++++ resources/recipes/kompiutierra.recipe | 2 +- resources/recipes/trombon.recipe | 51 +++++++++++++++++++++++ resources/recipes/wallstreetro.recipe | 54 +++++++++++++++++++++++++ 9 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 resources/images/news/hitro.png create mode 100644 resources/images/news/kamikaze.png create mode 100644 resources/images/news/trombon.png create mode 100644 resources/images/news/wallstreetro.png create mode 100644 resources/recipes/hitro.recipe create mode 100644 resources/recipes/kamikaze.recipe create mode 100644 resources/recipes/trombon.recipe create mode 100644 resources/recipes/wallstreetro.recipe diff --git a/resources/images/news/hitro.png b/resources/images/news/hitro.png new file mode 100644 index 0000000000000000000000000000000000000000..75c08a1c25776dbd9a2e5a12afdb09169a47e4a6 GIT binary patch literal 521 zcmV+k0`~ohP)War0EEZRi(|)rP7?WWx#V>|vy{PP|wdVy|OFw!jNo7s%OImcrpJ}UiB0LYimA3s6rUU1F{fWMZaC?b=| z(1e2#Xz4cB#FMgFfh ztGjU6p#-bgXqvj=oWpfp%w{tbMSmOIBp}^?r6G#me{Z4``ZQxKfK%2x98%997{gZFo-}4VUKL}Z-1d6Og zhT4o|qZycIZ=<{v*~Z_D?tOo>4YnuwJ{@j6cVwNZKLJ4a{{{R3Vt(Gmv>Kd<00000 LNkvXXu0mjfoS5EJ literal 0 HcmV?d00001 diff --git a/resources/images/news/kamikaze.png b/resources/images/news/kamikaze.png new file mode 100644 index 0000000000000000000000000000000000000000..49ef2f50a1601e51f4f29aefc4660b9cad727583 GIT binary patch literal 262 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbL!Yk*IPE0De;rE&#`xOj5-c+LpP zT#-}RGHFT(6U!AYnHDaQJ5n-NfQU^bg^%Tqn93PGpaPjIKn54j6d;;4$A*n1Ogv#0 z&>+T=AirP+hi5m^fSi0!7sn8Zsk#1d`I;Sgv|h~Ftn=>M*MI$5YOm(r{I%Fu)6pF1(mwzbIPgoLZ@qKP5`S4}GCPPJh$s7)%5kbG1w5INn|6y36z~S1)Ti&}s%xS3j3^P6}+amR8myT&djW; zq-~wZ?Tvk-n($u7)q)bLc ztgfv5`ut>GT(q{de0p@;+T1iUETpESM_Taj3wRWlR#BT`b4>to-hDRLrF`|=uP4oy V4|iC{(^voi002ovPDHLkV1n(1rmFw| literal 0 HcmV?d00001 diff --git a/resources/images/news/wallstreetro.png b/resources/images/news/wallstreetro.png new file mode 100644 index 0000000000000000000000000000000000000000..d72bc70ca0e89ed1277b6af8d3c2f8be513df034 GIT binary patch literal 768 zcmV+b1ONPqP))Hk`9gM=1A?$}bO$^3F|AxKu|MEXDCd*>XX3#Jtz|_Gu zn8R%R2uvAl-7cf;z2CDJmz~$=^U2A{^T>bAjtLGUfr$~1AQ(6SH9#TpCT;J_I;hvEP*LRzgh>LSz$h~S)I2#O&t!dqHd<;Ksy^CCE=UMcbQ+*SPA2Y|7= z|C}Fh{>pdPza^1OfkoWo~if`twpi>vdIX6<}&hmZcuFX)dl@J60?yj!$ z)BF-a5TK?cMELybH0AdT^yS~=YZw=kGXWg#~~y<CY=IvI1-3BFo-bT1Z{dUDV#|hP~k1UNgg~HXQCkh0000