IGN:Updated diepresse recipe and fixed the PDF metadata reader holding files open

2025-08-30 23:00:21 -04:00 · 2009-04-18 01:36:04 -07:00 · 2009-04-18 01:36:04 -07:00 · 65a3b30a83
commit 65a3b30a83
parent 0cb7a49d11
4 changed files with 70 additions and 18 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -244,6 +244,23 @@ class CurrentDir(object):
        os.chdir(self.cwd)
 class FileWrapper(object):
    '''
    Context manager that re-exposes the ``read``, ``seek`` and ``tell``
    methods of a stream. Used primarily with pyPdf.

    The wrapper never calls ``close()`` on the stream. On leaving the
    ``with`` block it replaces its three method references with ``None``,
    so the wrapper can no longer be used to reach the stream.
    '''

    def __init__(self, stream):
        # Bind in this order; a stream missing one of these attributes
        # raises AttributeError here.
        self.read = stream.read
        self.seek = stream.seek
        self.tell = stream.tell

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Drop the bound methods. Nothing is returned, so an exception
        # raised inside the ``with`` block propagates.
        self.read = None
        self.seek = None
        self.tell = None
 def detect_ncpus():
    """Detects the number of effective CPUs in the system"""
    try:
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@ -1,3 +1,4 @@
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''Read meta information from PDF files'''
@ -5,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 import sys, os, cStringIO
 from threading import Thread
 from calibre import FileWrapper
 from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser
 from pyPdf import PdfFileReader, PdfFileWriter
@ -13,18 +15,19 @@ def get_metadata(stream):
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)
    try:
-        info = PdfFileReader(stream).getDocumentInfo()
+        with FileWrapper(stream) as stream:
-        if info.title:
+            info = PdfFileReader(stream).getDocumentInfo()
-            mi.title = info.title
+            if info.title:
-        if info.author:
+                mi.title = info.title
-            src = info.author.split('&')
+            if info.author:
-            authors = []
+                src = info.author.split('&')
-            for au in src:
+                authors = []
-                authors += au.split(',')
+                for au in src:
-            mi.authors = authors
+                    authors += au.split(',')
-            mi.author = info.author
+                mi.authors = authors
-        if info.subject:
+                mi.author = info.author
-            mi.category = info.subject
+            if info.subject:
                mi.category = info.subject
    except Exception, err:
        msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err))
        print >>sys.stderr, msg.encode('utf8')
--- a/src/calibre/gui2/images/news/diepresse.png
+++ b/src/calibre/gui2/images/news/diepresse.png
--- a/src/calibre/web/feeds/recipes/recipe_diepresse.py
+++ b/src/calibre/web/feeds/recipes/recipe_diepresse.py
@ -1,18 +1,42 @@
-import re
+# -*- coding: utf-8 -*-
 __license__   = 'GPL v3'
 __copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
 ''' http://www.diepresse.at - Austrian Newspaper '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class DiePresseRecipe(BasicNewsRecipe):
-    title          = u'diePresse'
+    title = u'diePresse'
    __author__ = 'Gerhard Aigner'
    description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.' 
    publisher ='DiePresse.com'
    category = 'news, politics, nachrichten, Austria'
    use_embedded_content = False
    remove_empty_feeds = True
    lang = 'de-AT'
    no_stylesheets = True
    encoding = 'ISO-8859-1'
    language = _('German')
    recursions = 0
    oldest_article = 1
    max_articles_per_feed = 100
-    recursions = 0
+  
-    language = _('German')
+    html2lrf_options = [
-    __author__ = 'Gerhard Aigner'
+                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]
    html2epub_options  = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
    preprocess_regexps = [
 	(re.compile(r'Textversion', re.DOTALL), lambda match: ''),
    ]
    remove_tags = [dict(name='hr'),
 	dict(name='br'),
 	dict(name='small'),
@ -21,6 +45,7 @@ class DiePresseRecipe(BasicNewsRecipe):
 	dict(name='h1', attrs={'class':'titel'}),
 	dict(name='a', attrs={'class':'print'}),
 	dict(name='div', attrs={'class':'hline'})]
    feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
 	(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
 	(u'Europa', u'http://diepresse.com/rss/EU'),
@ -29,7 +54,7 @@ class DiePresseRecipe(BasicNewsRecipe):
 	(u'Kultur', u'http://diepresse.com/rss/Kultur'),
 	(u'Leben', u'http://diepresse.com/rss/Leben'),
 	(u'Tech', u'http://diepresse.com/rss/Tech'),
-	(u'Science', u'http://diepresse.com/rss/Science'),
+	(u'Wissenschaft', u'http://diepresse.com/rss/Science'),
 	(u'Bildung', u'http://diepresse.com/rss/Bildung'),
 	(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
 	(u'Recht', u'http://diepresse.com/rss/Recht'),
@ -38,3 +63,10 @@ class DiePresseRecipe(BasicNewsRecipe):
    def print_version(self, url):
        '''
        Return ``url`` with every occurrence of ``home`` replaced by
        ``text/home``.
        '''
        text_url = url.replace('home', 'text/home')
        return text_url
    def preprocess_html(self, soup):
        '''
        Tag the parsed page with the recipe language and a UTF-8
        Content-Type declaration.

        Sets the ``xml:lang`` and ``lang`` entries of ``soup.html`` to
        ``self.lang`` and inserts a ``<meta http-equiv="Content-Type">``
        string at position 0 of ``soup.head``.

        Returns the same ``soup`` object that was passed in.
        '''
        soup.html['xml:lang'] = self.lang
        soup.html['lang']     = self.lang
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        soup.head.insert(0,mtag)
        return soup