Merge from trunk

2025-07-09 03:04:10 -04:00 · 2011-03-07 18:58:28 +00:00 · 2011-03-07 18:58:28 +00:00 · 568ac2a2d1
commit 568ac2a2d1
parent b3c42fc0af c1c17aaf9d
15 changed files with 383 additions and 100 deletions
--- a/resources/images/news/hitro.png
+++ b/resources/images/news/hitro.png
--- a/resources/images/news/kamikaze.png
+++ b/resources/images/news/kamikaze.png
--- a/resources/images/news/trombon.png
+++ b/resources/images/news/trombon.png
--- a/resources/images/news/wallstreetro.png
+++ b/resources/images/news/wallstreetro.png
--- a/resources/recipes/el_pais_babelia.recipe
+++ b/resources/recipes/el_pais_babelia.recipe
@ -0,0 +1,49 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class ElPaisBabelia(BasicNewsRecipe):
    # Recipe for the El Pais "Babelia" supplement. There is no feed list:
    # parse_index() scrapes the supplement index page (INDEX) and builds one
    # feed per section found there.
    title      = 'El Pais Babelia'
    __author__ = 'oneillpt'
    description = 'El Pais Babelia'
    INDEX = 'http://www.elpais.com/suple/babelia/'
    language = 'es'
    remove_tags_before = dict(name='div', attrs={'class':'estructura_2col'})
    # NOTE(review): other recipes spell this option `keep_only_tags`; under
    # the name `keep_tags` it is presumably never read by the framework.
    # Left unchanged because enabling it would alter the downloaded content;
    # confirm the intent before renaming.
    keep_tags = [dict(name='div', attrs={'class':'estructura_2col'})]
    remove_tags = [dict(name='div', attrs={'class':'votos estirar'}),
        dict(name='div', attrs={'id':'utilidades'}),
        dict(name='div', attrs={'class':'info_relacionada'}),
        dict(name='div', attrs={'class':'mod_apoyo'}),
        dict(name='div', attrs={'class':'contorno_f'}),
        dict(name='div', attrs={'class':'pestanias'}),
        dict(name='div', attrs={'class':'otros_webs'}),
        dict(name='div', attrs={'id':'pie'})
        ]
    #no_stylesheets = True
    remove_javascript     = True
    def parse_index(self):
        # Returns a list of (section_title, articles) tuples, where articles
        # is a list of {'title': ..., 'url': ...} dicts. A section is a
        # div.contenedor_nuevo on the index page; an article is any anchor in
        # it with a site-relative href and a non-empty class attribute.
        # Sections without any such anchor are omitted.
        soup = self.index_to_soup(self.INDEX)
        feeds = []
        for section in soup.findAll('div', attrs={'class':'contenedor_nuevo'}):
            section_title = self.tag_to_string(section.find('h1'))
            articles = []
            for post in section.findAll('a', href=True):
                url = post['href']
                # Only site-relative links are considered; others are skipped.
                if not url.startswith('/'):
                    continue
                # Read the attribute from the anchor itself so that anchors
                # without their own class are skipped instead of raising
                # KeyError when a nested tag or the href text contains
                # 'class='.
                klass = post.get('class')
                if not klass:
                    continue
                # NOTE(review): links are rebuilt on elpais.es while INDEX is
                # on elpais.com -- kept as in the original; confirm.
                url = 'http://www.elpais.es'+url
                title = self.tag_to_string(post)
                self.log()
                self.log('--> post:  ', post)
                self.log('--> url:   ', url)
                self.log('--> title: ', title)
                self.log('--> class: ', klass)
                articles.append({'title':title, 'url':url})
            if articles:
                feeds.append((section_title, articles))
        return feeds
--- a/resources/recipes/evz.ro.recipe
+++ b/resources/recipes/evz.ro.recipe
@ -1,52 +1,54 @@
 # -*- coding: utf-8 -*-
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
 evz.ro
 '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
-class EVZ_Ro(BasicNewsRecipe):
+class EvenimentulZilei(BasicNewsRecipe):
-    title                 = 'evz.ro'
+    title                 = u'Evenimentul Zilei'
-    __author__            = 'Darko Miletic'
+    __author__            = u'Silviu Cotoar\u0103'
-    description           = 'News from Romania'
+    description           = ''
-    publisher             = 'evz.ro'
+    publisher             = u'Evenimentul Zilei'
-    category              = 'news, politics, Romania'
+    oldest_article        = 5
    oldest_article        = 2
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf8'
    use_embedded_content  = False
    language              = 'ro'
-    masthead_url          = 'http://www.evz.ro/fileadmin/images/logo.gif'
+    max_articles_per_feed = 100
-    extra_css             = ' body{font-family: Georgia,Arial,Helvetica,sans-serif } .firstP{font-size: 1.125em} .author,.articleInfo{font-size: small} '
+    no_stylesheets        = True
    use_embedded_content  = False
    category              = 'Ziare,Stiri'
    encoding              = 'utf-8'
    cover_url             = 'http://www.evz.ro/fileadmin/images/evzLogo.png'
    conversion_options = {
-                          'comment'   : description
+                             'comments'   : description
-                        , 'tags'      : category
+                            ,'tags'       : category
-                        , 'publisher' : publisher
+                            ,'language'   : language
-                        , 'language'  : language
+                            ,'publisher'  : publisher
-                        }
+                         }
-    preprocess_regexps = [
+    keep_only_tags = [
-         (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE),lambda match: '<head><title>')
+            dict(name='div', attrs={'class':'single'})
-        ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
+            , dict(name='img', attrs={'id':'placeholder'})
-    ]
+			, dict(name='a', attrs={'id':'holderlink'})			
                     ]
-    remove_tags       = [
+    remove_tags = [
-                          dict(name=['form','embed','iframe','object','base','link','script','noscript'])
+             dict(name='p', attrs={'class':['articleInfo']})
-                         ,dict(attrs={'class':['section','statsInfo','email il']})
+           , dict(name='div', attrs={'id':['bannerAddoceansArticleJos']})
-                         ,dict(attrs={'id'   :'gallery'})
+           , dict(name='div', attrs={'id':['bannerAddoceansArticle']})
-                        ]
+                  ]
-    remove_tags_after = dict(attrs={'class':'section'})
+    remove_tags_after = [
-    keep_only_tags    = [dict(attrs={'class':'single'})]
+             dict(name='div', attrs={'id':['bannerAddoceansArticleJos']})			  			 
-    remove_attributes = ['height','width']
+            ]
-    feeds = [(u'Articles', u'http://www.evz.ro/rss.xml')]
+    feeds          = [
            (u'Feeds', u'http://www.evz.ro/rss.xml')
                 ]
    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
+        return self.adeify_images(soup)
            del item['style']
        return soup
--- a/resources/recipes/hitro.recipe
+++ b/resources/recipes/hitro.recipe
@ -0,0 +1,43 @@
 # -*- coding: utf-8 -*-
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
 hit.ro
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class Hit(BasicNewsRecipe):
    # Feed-driven recipe for hit.ro.

    # -- identity / metadata --
    title                 = u'HIT'
    __author__            = u'Silviu Cotoar\u0103'
    description           = 'IT'
    publisher             = 'HIT'
    category              = 'Ziare,Reviste,IT'
    language              = 'ro'
    encoding              = 'utf-8'
    cover_url             = 'http://www.hit.ro/lib/images/frontend/hit_logo.png'

    # -- download behaviour --
    oldest_article        = 5
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False

    # Metadata handed to the conversion pipeline, taken from the values above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Article title plus article body are the only parts kept.
    keep_only_tags = [
        {'name': 'h1', 'attrs': {'class': 'art_titl'}},
        {'name': 'div', 'attrs': {'id': 'continut_articol'}},
    ]

    feeds = [(u'Feeds', u'http://www.hit.ro/rss')]

    def preprocess_html(self, soup):
        # Hand the parsed page to adeify_images() and return its result.
        processed = self.adeify_images(soup)
        return processed
--- a/resources/recipes/kamikaze.recipe
+++ b/resources/recipes/kamikaze.recipe
@ -0,0 +1,53 @@
 # -*- coding: utf-8 -*-
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
 kamikazeonline.ro
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class Kamikaze(BasicNewsRecipe):
    # Feed-driven recipe for kamikazeonline.ro.

    # -- identity / metadata --
    title                 = u'Kamikaze'
    __author__            = u'Silviu Cotoar\u0103'
    description           = u'S\u0103pt\u0103m\u00e2nal sc\u0103pat de sub control'
    publisher             = 'Kamikaze'
    category              = 'Ziare,Reviste'
    language              = 'ro'
    encoding              = 'utf-8'
    cover_url             = 'http://www.kamikazeonline.ro/wp-content/themes/kamikaze/images/kamikazeonline_header.gif'

    # -- download behaviour --
    oldest_article        = 5
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False

    # Metadata handed to the conversion pipeline, taken from the values above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Only the #content container is kept.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'id': 'content'}},
    ]

    # Elements stripped from the kept content.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': ['connect_confirmation_cell connect_confirmation_cell_no_like']}},
        {'name': 'h3', 'attrs': {'id': ['comments']}},
        {'name': 'ul', 'attrs': {'class': ['addtoany_list']}},
        {'name': 'p', 'attrs': {'class': ['postmetadata']}},
    ]

    # Everything following the post metadata paragraph is dropped.
    remove_tags_after = [
        {'name': 'p', 'attrs': {'class': ['postmetadata']}},
    ]

    feeds = [(u'Feeds', u'http://www.kamikazeonline.ro/feed/')]

    def preprocess_html(self, soup):
        # Hand the parsed page to adeify_images() and return its result.
        processed = self.adeify_images(soup)
        return processed
--- a/resources/recipes/kompiutierra.recipe
+++ b/resources/recipes/kompiutierra.recipe
@ -1,36 +1,37 @@
-#!/usr/bin/python
+#!/usr/bin/python
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
-
+
-__license__   = 'GPL v3'
+__license__   = 'GPL v3'
-__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com'
+__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com'
-__author__ = 'Vadim Dyadkin'
+__author__ = 'Vadim Dyadkin'
-
+
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe
-
+
-class Computerra(BasicNewsRecipe):
+class Computerra(BasicNewsRecipe):
-    title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430'
+    title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430'
-    recursion = 50
+    oldest_article = 100
-    oldest_article = 100
+    __author__  = 'Vadim Dyadkin (edited by A. Chewi)'
-    __author__  = 'Vadim Dyadkin'
+    max_articles_per_feed = 50
-    max_articles_per_feed = 100
+    use_embedded_content  = False
-    use_embedded_content  = False
+    remove_javascript = True
-    simultaneous_downloads = 5
+    no_stylesheets = True
-    language = 'ru'
+    conversion_options = {'linearize_tables' : True}
-    description = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u044b, \u043e\u043a\u043e\u043b\u043e\u043d\u0430\u0443\u0447\u043d\u044b\u0435 \u0438 \u043e\u043a\u043e\u043b\u043e\u0444\u0438\u043b\u043e\u0441\u043e\u0444\u0441\u043a\u0438\u0435 \u0441\u0442\u0430\u0442\u044c\u0438, \u0433\u0430\u0434\u0436\u0435\u0442\u044b.'
+    simultaneous_downloads = 5
-
+    language = 'ru'
-    keep_only_tags = [dict(name='div', attrs={'id': 'content'}),]
+    description = u'Компьютерра: все новости про компьютеры, железо, новые технологии, информационные технологии'
-
+
-
+    keep_only_tags = [dict(name='div', attrs={'id': 'content'}),]
-    feeds = [(u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430', 'http://feeds.feedburner.com/ct_news/'),]
+
-
+    feeds = [(u'Компьютерра-Онлайн', 'http://feeds.feedburner.com/ct_news/'),]
-    remove_tags = [dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}),
+
-                   dict(name='ul', attrs={'class': "related_post"}),
+    remove_tags = [
-                   dict(name='p', attrs={'class': 'info'}),
+    dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}),
-                   dict(name='a', attrs={'rel': 'tag', 'class': 'twitter-share-button', 'type': 'button_count'}),
+    dict(name='ul', attrs={'class': "related_post"}),
-                   dict(name='h2', attrs={}),]
+    dict(name='p', attrs={'class': 'info'}),
-
+    dict(name='a', attrs={'class': 'twitter-share-button'}),
-    extra_css = 'body { text-align: justify; }'
+    dict(name='a', attrs={'type': 'button_count'}),
-
+    dict(name='h2', attrs={})
-    def get_article_url(self, article):
+    ]
-        return article.get('feedburner:origLink', article.get('guid'))
+
-
+    def print_version(self, url):
        return url + '?print=true'
--- a/resources/recipes/nationalgeoro.recipe
+++ b/resources/recipes/nationalgeoro.recipe
@ -14,7 +14,7 @@ class NationalGeoRo(BasicNewsRecipe):
    __author__            = u'Silviu Cotoar\u0103'
    description           = u'S\u0103 avem grij\u0103 de planet\u0103'
    publisher             = 'National Geographic'
-    oldest_article        = 5
+    oldest_article        = 35
    language              = 'ro'
    max_articles_per_feed = 100
    no_stylesheets        = True
--- a/resources/recipes/nrc-nl-epub.recipe
+++ b/resources/recipes/nrc-nl-epub.recipe
@ -1,14 +1,14 @@
-#!/usr/bin/env  python
+#!/usr/bin/env  python2
 # -*- coding: utf-8 -*-
-#Based on Lars Jacob's Taz Digiabo recipe
+#Based on veezh's original recipe and Kovid Goyal's New York Times recipe
 __license__   = 'GPL v3'
-__copyright__ = '2010, veezh'
+__copyright__ = '2011, Snaab'
 '''
 www.nrc.nl
 '''
-import os, urllib2, zipfile
+import os, zipfile
 import time
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
@ -17,41 +17,59 @@ from calibre.ptempfile import PersistentTemporaryFile
 class NRCHandelsblad(BasicNewsRecipe):
    title = u'NRC Handelsblad'
-    description = u'De EPUB-versie van NRC'
+    description = u'De ePaper-versie van NRC'
    language = 'nl'
    lang = 'nl-NL'
    needs_subscription = True
-    __author__ = 'veezh'
+    __author__ = 'Snaab'
    conversion_options = {
        'no_default_epub_cover' : True
    }
    def get_browser(self):
        # Return a browser from the base recipe; when both a username and a
        # password are configured, submit them through the first form on the
        # NRC login page before handing the browser back.
        browser = BasicNewsRecipe.get_browser()
        if self.username is None or self.password is None:
            return browser
        browser.open('http://login.nrc.nl/login')
        browser.select_form(nr=0)
        browser['username'] = self.username
        browser['password'] = self.password
        browser.submit()
        return browser
    def build_index(self):
        today = time.strftime("%Y%m%d")
        domain = "http://digitaleeditie.nrc.nl"
        url = domain + "/digitaleeditie/helekrant/epub/nrc_" + today + ".epub"
-#        print url
+        #print url
        try:
-            f = urllib2.urlopen(url)
+            br = self.get_browser()
-        except urllib2.HTTPError:
+            f = br.open(url)
        except:
            self.report_progress(0,_('Kan niet inloggen om editie te downloaden'))
            raise ValueError('Krant van vandaag nog niet beschikbaar')
        tmp = PersistentTemporaryFile(suffix='.epub')
        self.report_progress(0,_('downloading epub'))
        tmp.write(f.read())
-        tmp.close()
+        f.close()
-
+        br.close()
-        zfile = zipfile.ZipFile(tmp.name, 'r')
+        if zipfile.is_zipfile(tmp):
-        self.report_progress(0,_('extracting epub'))
+            try:
-
+                zfile = zipfile.ZipFile(tmp.name, 'r')
-        zfile.extractall(self.output_dir)
+                zfile.extractall(self.output_dir)
                self.report_progress(0,_('extracting epub'))
            except zipfile.BadZipfile:
                self.report_progress(0,_('BadZip error, continuing'))
        tmp.close()
-        index = os.path.join(self.output_dir, 'content.opf')
+        index = os.path.join(self.output_dir, 'metadata.opf')
        self.report_progress(1,_('epub downloaded and extracted'))
--- a/resources/recipes/trombon.recipe
+++ b/resources/recipes/trombon.recipe
@ -0,0 +1,51 @@
 # -*- coding: utf-8 -*-
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
 trombon.ro
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class Trombon(BasicNewsRecipe):
    # Feed-driven recipe for trombon.ro.

    # -- identity / metadata --
    title                 = u'Trombon'
    __author__            = u'Silviu Cotoar\u0103'
    description           = u'Parodii si Pamflete'
    publisher             = u'Trombon'
    category              = 'Ziare,Reviste,Fun'
    language              = 'ro'
    encoding              = 'utf-8'
    cover_url             = 'http://www.trombon.ro/i/trombon.gif'

    # -- download behaviour --
    oldest_article        = 5
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False

    # Metadata handed to the conversion pipeline, taken from the values above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Only the article container is kept.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'articol'}},
    ]

    # Elements stripped from the kept content.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': ['info_2']}},
        {'name': 'iframe', 'attrs': {'scrolling': ['no']}},
    ]

    # Everything following the article vote box is dropped.
    remove_tags_after = [
        {'name': 'div', 'attrs': {'id': 'article_vote'}},
    ]

    feeds = [(u'Feeds', u'http://feeds.feedburner.com/trombon/ABWb?format=xml')]

    def preprocess_html(self, soup):
        # Hand the parsed page to adeify_images() and return its result.
        processed = self.adeify_images(soup)
        return processed
--- a/resources/recipes/wallstreetro.recipe
+++ b/resources/recipes/wallstreetro.recipe
@ -0,0 +1,54 @@
 # -*- coding: utf-8 -*-
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
 wall-street.ro
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class WallStreetRo(BasicNewsRecipe):
    # Feed-driven recipe for wall-street.ro.

    # -- identity / metadata --
    title                 = u'Wall Street'
    __author__            = u'Silviu Cotoar\u0103'
    description           = ''
    publisher             = 'Wall Street'
    category              = 'Ziare'
    language              = 'ro'
    encoding              = 'utf-8'
    cover_url             = 'http://img.wall-street.ro/images/WS_new_logo.jpg'

    # -- download behaviour --
    oldest_article        = 5
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False

    # Metadata handed to the conversion pipeline, taken from the values above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Article header plus article text are the only parts kept.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'article_header'}},
        {'name': 'div', 'attrs': {'class': 'article_text'}},
    ]

    # Elements stripped from the kept content.
    remove_tags = [
        {'name': 'p', 'attrs': {'class': ['page_breadcrumbs']}},
        {'name': 'div', 'attrs': {'id': ['article_user_toolbox']}},
        {'name': 'p', 'attrs': {'class': ['comments_count_container']}},
        {'name': 'div', 'attrs': {'class': ['article_left_column']}},
    ]

    # Everything following the clearfloat div is dropped.
    remove_tags_after = [
        {'name': 'div', 'attrs': {'class': 'clearfloat'}},
    ]

    feeds = [(u'Feeds', u'http://img.wall-street.ro/rssfeeds/wall-street.xml')]

    def preprocess_html(self, soup):
        # Hand the parsed page to adeify_images() and return its result.
        processed = self.adeify_images(soup)
        return processed
--- a/src/calibre/ebooks/comic/input.py
+++ b/src/calibre/ebooks/comic/input.py
@ -131,9 +131,12 @@ class PageProcessor(list): # {{{
                    newsizey = int(newsizex / aspect)
                    deltax = 0
                    deltay = (SCRHEIGHT - newsizey) / 2
-                wand.size = (newsizex, newsizey)
+                if newsizex < 20000 and newsizey < 20000:
-                wand.set_border_color(pw)
+                    # Too large and resizing fails, so better
-                wand.add_border(pw, deltax, deltay)
+                    # to leave it as original size
                    wand.size = (newsizex, newsizey)
                    wand.set_border_color(pw)
                    wand.add_border(pw, deltax, deltay)
            elif self.opts.wide:
                # Keep aspect and Use device height as scaled image width so landscape mode is clean
                aspect = float(sizex) / float(sizey)
@ -152,11 +155,15 @@ class PageProcessor(list): # {{{
                    newsizey = int(newsizex / aspect)
                    deltax = 0
                    deltay = (wscreeny - newsizey) / 2
-                wand.size = (newsizex, newsizey)
+                if newsizex < 20000 and newsizey < 20000:
-                wand.set_border_color(pw)
+                    # Too large and resizing fails, so better
-                wand.add_border(pw, deltax, deltay)
+                    # to leave it as original size
                    wand.size = (newsizex, newsizey)
                    wand.set_border_color(pw)
                    wand.add_border(pw, deltax, deltay)
            else:
-                wand.size = (SCRWIDTH, SCRHEIGHT)
+                if SCRWIDTH < 20000 and SCRHEIGHT < 20000:
                    wand.size = (SCRWIDTH, SCRHEIGHT)
            if not self.opts.dont_sharpen:
                wand.sharpen(0.0, 1.0)
--- a/src/calibre/ebooks/snb/snbfile.py
+++ b/src/calibre/ebooks/snb/snbfile.py
@ -75,15 +75,20 @@ class SNBFile:
                    for i in range(self.plainBlock):
                        bzdc = bz2.BZ2Decompressor()
                        if (i < self.plainBlock - 1):
-                            bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset;
+                            bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset
                        else:
-                            bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset;
+                            bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset
-                        snbFile.seek(self.blocks[self.binBlock + i].Offset);
+                        snbFile.seek(self.blocks[self.binBlock + i].Offset)
                        try:
                            data = snbFile.read(bSize)
-                            uncompressedData += bzdc.decompress(data)
+                            if len(data) < 32768:
                                uncompressedData += bzdc.decompress(data)
                            else:
                                uncompressedData += data
                        except Exception, e:
                            print e
                if len(uncompressedData) != self.plainStreamSizeUncompressed:
                    raise Exception()
                f.fileBody = uncompressedData[plainPos:plainPos+f.fileSize]
                plainPos += f.fileSize
            elif f.attr & 0x01000000 == 0x01000000: