Merge from trunk

Charles Haley 2011-03-07 18:58:28 +00:00
commit 568ac2a2d1
15 changed files with 383 additions and 100 deletions

Binary file added (not shown), size 521 B

Binary file added (not shown), size 262 B

Binary file added (not shown), size 375 B

Binary file added (not shown), size 768 B
View File

@@ -0,0 +1,49 @@
from calibre.web.feeds.news import BasicNewsRecipe

class ElPaisBabelia(BasicNewsRecipe):
    title = 'El Pais Babelia'
    __author__ = 'oneillpt'
    description = 'El Pais Babelia'
    INDEX = 'http://www.elpais.com/suple/babelia/'
    language = 'es'

    remove_tags_before = dict(name='div', attrs={'class':'estructura_2col'})
    keep_tags = [dict(name='div', attrs={'class':'estructura_2col'})]
    remove_tags = [dict(name='div', attrs={'class':'votos estirar'}),
        dict(name='div', attrs={'id':'utilidades'}),
        dict(name='div', attrs={'class':'info_relacionada'}),
        dict(name='div', attrs={'class':'mod_apoyo'}),
        dict(name='div', attrs={'class':'contorno_f'}),
        dict(name='div', attrs={'class':'pestanias'}),
        dict(name='div', attrs={'class':'otros_webs'}),
        dict(name='div', attrs={'id':'pie'})
        ]
    #no_stylesheets = True
    remove_javascript = True

    def parse_index(self):
        articles = []
        soup = self.index_to_soup(self.INDEX)
        feeds = []
        for section in soup.findAll('div', attrs={'class':'contenedor_nuevo'}):
            section_title = self.tag_to_string(section.find('h1'))
            articles = []
            for post in section.findAll('a', href=True):
                url = post['href']
                if url.startswith('/'):
                    url = 'http://www.elpais.es'+url
                title = self.tag_to_string(post)
                if str(post).find('class=') > 0:
                    klass = post['class']
                    if klass != "":
                        self.log()
                        self.log('--> post: ', post)
                        self.log('--> url: ', url)
                        self.log('--> title: ', title)
                        self.log('--> class: ', klass)
                        articles.append({'title':title, 'url':url})
            if articles:
                feeds.append((section_title, articles))
        return feeds

View File

@@ -1,52 +1,54 @@
 # -*- coding: utf-8 -*-
 #!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
 evz.ro
 '''
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
-class EVZ_Ro(BasicNewsRecipe):
-    title = 'evz.ro'
-    __author__ = 'Darko Miletic'
-    description = 'News from Romania'
-    publisher = 'evz.ro'
-    category = 'news, politics, Romania'
-    oldest_article = 2
-    max_articles_per_feed = 200
-    no_stylesheets = True
-    encoding = 'utf8'
-    use_embedded_content = False
+class EvenimentulZilei(BasicNewsRecipe):
+    title = u'Evenimentul Zilei'
+    __author__ = u'Silviu Cotoar\u0103'
+    description = ''
+    publisher = u'Evenimentul Zilei'
+    oldest_article = 5
     language = 'ro'
-    masthead_url = 'http://www.evz.ro/fileadmin/images/logo.gif'
-    extra_css = ' body{font-family: Georgia,Arial,Helvetica,sans-serif } .firstP{font-size: 1.125em} .author,.articleInfo{font-size: small} '
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    category = 'Ziare,Stiri'
+    encoding = 'utf-8'
+    cover_url = 'http://www.evz.ro/fileadmin/images/evzLogo.png'
     conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-    }
+        'comments' : description
+        ,'tags' : category
+        ,'language' : language
+        ,'publisher' : publisher
+    }
-    preprocess_regexps = [
-        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE),lambda match: '<head><title>')
-        ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
-    ]
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'single'})
+        , dict(name='img', attrs={'id':'placeholder'})
+        , dict(name='a', attrs={'id':'holderlink'})
+    ]
-    remove_tags = [
-        dict(name=['form','embed','iframe','object','base','link','script','noscript'])
-        ,dict(attrs={'class':['section','statsInfo','email il']})
-        ,dict(attrs={'id' :'gallery'})
-    ]
+    remove_tags = [
+        dict(name='p', attrs={'class':['articleInfo']})
+        , dict(name='div', attrs={'id':['bannerAddoceansArticleJos']})
+        , dict(name='div', attrs={'id':['bannerAddoceansArticle']})
+    ]
-    remove_tags_after = dict(attrs={'class':'section'})
-    keep_only_tags = [dict(attrs={'class':'single'})]
-    remove_attributes = ['height','width']
+    remove_tags_after = [
+        dict(name='div', attrs={'id':['bannerAddoceansArticleJos']})
+    ]
-    feeds = [(u'Articles', u'http://www.evz.ro/rss.xml')]
+    feeds = [
+        (u'Feeds', u'http://www.evz.ro/rss.xml')
+    ]
     def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+        return self.adeify_images(soup)

View File

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
hit.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe

class Hit(BasicNewsRecipe):
    title = u'HIT'
    __author__ = u'Silviu Cotoar\u0103'
    description = 'IT'
    publisher = 'HIT'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Reviste,IT'
    encoding = 'utf-8'
    cover_url = 'http://www.hit.ro/lib/images/frontend/hit_logo.png'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
    }

    keep_only_tags = [
        dict(name='h1', attrs={'class':'art_titl'})
        , dict(name='div', attrs={'id':'continut_articol'})
    ]

    feeds = [
        (u'Feeds', u'http://www.hit.ro/rss')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

View File

@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
kamikazeonline.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe

class Kamikaze(BasicNewsRecipe):
    title = u'Kamikaze'
    __author__ = u'Silviu Cotoar\u0103'
    description = u'S\u0103pt\u0103m\u00e2nal sc\u0103pat de sub control'
    publisher = 'Kamikaze'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Reviste'
    encoding = 'utf-8'
    cover_url = 'http://www.kamikazeonline.ro/wp-content/themes/kamikaze/images/kamikazeonline_header.gif'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'id':'content'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['connect_confirmation_cell connect_confirmation_cell_no_like']})
        , dict(name='h3', attrs={'id':['comments']})
        , dict(name='ul', attrs={'class':['addtoany_list']})
        , dict(name='p', attrs={'class':['postmetadata']})
    ]

    remove_tags_after = [
        dict(name='p', attrs={'class':['postmetadata']})
    ]

    feeds = [
        (u'Feeds', u'http://www.kamikazeonline.ro/feed/')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

View File

@@ -1,36 +1,37 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-__license__ = 'GPL v3'
-__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com'
-__author__ = 'Vadim Dyadkin'
-from calibre.web.feeds.news import BasicNewsRecipe
-class Computerra(BasicNewsRecipe):
-    title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430'
-    recursion = 50
-    oldest_article = 100
-    __author__ = 'Vadim Dyadkin'
-    max_articles_per_feed = 100
-    use_embedded_content = False
-    simultaneous_downloads = 5
-    language = 'ru'
-    description = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u044b, \u043e\u043a\u043e\u043b\u043e\u043d\u0430\u0443\u0447\u043d\u044b\u0435 \u0438 \u043e\u043a\u043e\u043b\u043e\u0444\u0438\u043b\u043e\u0441\u043e\u0444\u0441\u043a\u0438\u0435 \u0441\u0442\u0430\u0442\u044c\u0438, \u0433\u0430\u0434\u0436\u0435\u0442\u044b.'
-    keep_only_tags = [dict(name='div', attrs={'id': 'content'}),]
-    feeds = [(u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430', 'http://feeds.feedburner.com/ct_news/'),]
-    remove_tags = [dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}),
-        dict(name='ul', attrs={'class': "related_post"}),
-        dict(name='p', attrs={'class': 'info'}),
-        dict(name='a', attrs={'rel': 'tag', 'class': 'twitter-share-button', 'type': 'button_count'}),
-        dict(name='h2', attrs={}),]
-    extra_css = 'body { text-align: justify; }'
-    def get_article_url(self, article):
-        return article.get('feedburner:origLink', article.get('guid'))
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com'
+__author__ = 'Vadim Dyadkin'
+from calibre.web.feeds.news import BasicNewsRecipe
+class Computerra(BasicNewsRecipe):
+    title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430'
+    oldest_article = 100
+    __author__ = 'Vadim Dyadkin (edited by A. Chewi)'
+    max_articles_per_feed = 50
+    use_embedded_content = False
+    remove_javascript = True
+    no_stylesheets = True
+    conversion_options = {'linearize_tables' : True}
+    simultaneous_downloads = 5
+    language = 'ru'
+    description = u'Компьютерра: все новости про компьютеры, железо, новые технологии, информационные технологии'
+    keep_only_tags = [dict(name='div', attrs={'id': 'content'}),]
+    feeds = [(u'Компьютерра-Онлайн', 'http://feeds.feedburner.com/ct_news/'),]
+    remove_tags = [
+        dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}),
+        dict(name='ul', attrs={'class': "related_post"}),
+        dict(name='p', attrs={'class': 'info'}),
+        dict(name='a', attrs={'class': 'twitter-share-button'}),
+        dict(name='a', attrs={'type': 'button_count'}),
+        dict(name='h2', attrs={})
+    ]
+    def print_version(self, url):
+        return url + '?print=true'

View File

@@ -14,7 +14,7 @@ class NationalGeoRo(BasicNewsRecipe):
     __author__ = u'Silviu Cotoar\u0103'
     description = u'S\u0103 avem grij\u0103 de planet\u0103'
    publisher = 'National Geographic'
-    oldest_article = 5
+    oldest_article = 35
     language = 'ro'
     max_articles_per_feed = 100
     no_stylesheets = True

View File

@@ -1,14 +1,14 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
 # -*- coding: utf-8 -*-
-#Based on Lars Jacob's Taz Digiabo recipe
+#Based on veezh's original recipe and Kovid Goyal's New York Times recipe
 __license__ = 'GPL v3'
-__copyright__ = '2010, veezh'
+__copyright__ = '2011, Snaab'
 '''
 www.nrc.nl
 '''
-import os, urllib2, zipfile
+import os, zipfile
 import time
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
@@ -17,41 +17,59 @@ from calibre.ptempfile import PersistentTemporaryFile
 class NRCHandelsblad(BasicNewsRecipe):
     title = u'NRC Handelsblad'
-    description = u'De EPUB-versie van NRC'
+    description = u'De ePaper-versie van NRC'
     language = 'nl'
     lang = 'nl-NL'
     needs_subscription = True
-    __author__ = 'veezh'
+    __author__ = 'Snaab'
     conversion_options = {
         'no_default_epub_cover' : True
     }
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            br.open('http://login.nrc.nl/login')
+            br.select_form(nr=0)
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
     def build_index(self):
         today = time.strftime("%Y%m%d")
         domain = "http://digitaleeditie.nrc.nl"
         url = domain + "/digitaleeditie/helekrant/epub/nrc_" + today + ".epub"
-        # print url
+        #print url
         try:
-            f = urllib2.urlopen(url)
-        except urllib2.HTTPError:
+            br = self.get_browser()
+            f = br.open(url)
+        except:
             self.report_progress(0,_('Kan niet inloggen om editie te downloaden'))
             raise ValueError('Krant van vandaag nog niet beschikbaar')
         tmp = PersistentTemporaryFile(suffix='.epub')
         self.report_progress(0,_('downloading epub'))
         tmp.write(f.read())
-        tmp.close()
-        zfile = zipfile.ZipFile(tmp.name, 'r')
-        self.report_progress(0,_('extracting epub'))
-        zfile.extractall(self.output_dir)
+        f.close()
+        br.close()
+        if zipfile.is_zipfile(tmp):
+            try:
+                zfile = zipfile.ZipFile(tmp.name, 'r')
+                zfile.extractall(self.output_dir)
+                self.report_progress(0,_('extracting epub'))
+            except zipfile.BadZipfile:
+                self.report_progress(0,_('BadZip error, continuing'))
+        tmp.close()
-        index = os.path.join(self.output_dir, 'content.opf')
+        index = os.path.join(self.output_dir, 'metadata.opf')
         self.report_progress(1,_('epub downloaded and extracted'))

View File

@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
trombon.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe

class Trombon(BasicNewsRecipe):
    title = u'Trombon'
    __author__ = u'Silviu Cotoar\u0103'
    description = u'Parodii si Pamflete'
    publisher = u'Trombon'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Reviste,Fun'
    encoding = 'utf-8'
    cover_url = 'http://www.trombon.ro/i/trombon.gif'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'class':'articol'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['info_2']})
        , dict(name='iframe', attrs={'scrolling':['no']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id':'article_vote'})
    ]

    feeds = [
        (u'Feeds', u'http://feeds.feedburner.com/trombon/ABWb?format=xml')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

View File

@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
wall-street.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe

class WallStreetRo(BasicNewsRecipe):
    title = u'Wall Street'
    __author__ = u'Silviu Cotoar\u0103'
    description = ''
    publisher = 'Wall Street'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare'
    encoding = 'utf-8'
    cover_url = 'http://img.wall-street.ro/images/WS_new_logo.jpg'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'class':'article_header'})
        , dict(name='div', attrs={'class':'article_text'})
    ]

    remove_tags = [
        dict(name='p', attrs={'class':['page_breadcrumbs']})
        , dict(name='div', attrs={'id':['article_user_toolbox']})
        , dict(name='p', attrs={'class':['comments_count_container']})
        , dict(name='div', attrs={'class':['article_left_column']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class':'clearfloat'})
    ]

    feeds = [
        (u'Feeds', u'http://img.wall-street.ro/rssfeeds/wall-street.xml')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

View File

@@ -131,9 +131,12 @@ class PageProcessor(list): # {{{
                     newsizey = int(newsizex / aspect)
                     deltax = 0
                     deltay = (SCRHEIGHT - newsizey) / 2
-                wand.size = (newsizex, newsizey)
-                wand.set_border_color(pw)
-                wand.add_border(pw, deltax, deltay)
+                if newsizex < 20000 and newsizey < 20000:
+                    # Too large and resizing fails, so better
+                    # to leave it as original size
+                    wand.size = (newsizex, newsizey)
+                    wand.set_border_color(pw)
+                    wand.add_border(pw, deltax, deltay)
             elif self.opts.wide:
                 # Keep aspect and Use device height as scaled image width so landscape mode is clean
                 aspect = float(sizex) / float(sizey)
@@ -152,11 +155,15 @@ class PageProcessor(list): # {{{
                     newsizey = int(newsizex / aspect)
                     deltax = 0
                     deltay = (wscreeny - newsizey) / 2
-                wand.size = (newsizex, newsizey)
-                wand.set_border_color(pw)
-                wand.add_border(pw, deltax, deltay)
+                if newsizex < 20000 and newsizey < 20000:
+                    # Too large and resizing fails, so better
+                    # to leave it as original size
+                    wand.size = (newsizex, newsizey)
+                    wand.set_border_color(pw)
+                    wand.add_border(pw, deltax, deltay)
             else:
-                wand.size = (SCRWIDTH, SCRHEIGHT)
+                if SCRWIDTH < 20000 and SCRHEIGHT < 20000:
+                    wand.size = (SCRWIDTH, SCRHEIGHT)
             if not self.opts.dont_sharpen:
                 wand.sharpen(0.0, 1.0)

View File

@@ -75,15 +75,20 @@ class SNBFile:
                 for i in range(self.plainBlock):
                     bzdc = bz2.BZ2Decompressor()
                     if (i < self.plainBlock - 1):
-                        bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset;
+                        bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset
                     else:
-                        bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset;
-                    snbFile.seek(self.blocks[self.binBlock + i].Offset);
+                        bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset
+                    snbFile.seek(self.blocks[self.binBlock + i].Offset)
                     try:
                         data = snbFile.read(bSize)
-                        uncompressedData += bzdc.decompress(data)
+                        if len(data) < 32768:
+                            uncompressedData += bzdc.decompress(data)
+                        else:
+                            uncompressedData += data
                     except Exception, e:
                         print e
                 if len(uncompressedData) != self.plainStreamSizeUncompressed:
                     raise Exception()
                 f.fileBody = uncompressedData[plainPos:plainPos+f.fileSize]
                 plainPos += f.fileSize
             elif f.attr & 0x01000000 == 0x01000000: