IGN:Updated diepresse recipe and fix pdf metadata reader holding files open

2026-01-07 20:50:20 -05:00 · 2009-04-18 01:36:04 -07:00 · 2009-04-18 01:36:04 -07:00 · 65a3b30a83
commit 65a3b30a83
parent 0cb7a49d11
4 changed files with 70 additions and 18 deletions
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -244,6 +244,23 @@ class CurrentDir(object):
        os.chdir(self.cwd)


+class FileWrapper(object):
+    '''
+    Used primarily with pyPdf to ensure the stream is properly closed.
+    '''
+
+    def __init__(self, stream):
+        for x in ('read', 'seek', 'tell'):
+            setattr(self, x, getattr(stream, x))
+
+    def __exit__(self, *args):
+        for x in ('read', 'seek', 'tell'):
+            setattr(self, x, None)
+
+    def __enter__(self):
+        return self
+
+
 def detect_ncpus():
    """Detects the number of effective CPUs in the system"""
    try:
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -1,3 +1,4 @@
+from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''Read meta information from PDF files'''
@@ -5,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 import sys, os, cStringIO
 from threading import Thread

+from calibre import FileWrapper
 from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser
 from pyPdf import PdfFileReader, PdfFileWriter

@@ -13,18 +15,19 @@ def get_metadata(stream):
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)
    try:
-        info = PdfFileReader(stream).getDocumentInfo()
-        if info.title:
-            mi.title = info.title
-        if info.author:
-            src = info.author.split('&')
-            authors = []
-            for au in src:
-                authors += au.split(',')
-            mi.authors = authors
-            mi.author = info.author
-        if info.subject:
-            mi.category = info.subject
+        with FileWrapper(stream) as stream:
+            info = PdfFileReader(stream).getDocumentInfo()
+            if info.title:
+                mi.title = info.title
+            if info.author:
+                src = info.author.split('&')
+                authors = []
+                for au in src:
+                    authors += au.split(',')
+                mi.authors = authors
+                mi.author = info.author
+            if info.subject:
+                mi.category = info.subject
    except Exception, err:
        msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err))
        print >>sys.stderr, msg.encode('utf8')
--- a/src/calibre/gui2/images/news/diepresse.png
+++ b/src/calibre/gui2/images/news/diepresse.png
--- a/src/calibre/web/feeds/recipes/recipe_diepresse.py
+++ b/src/calibre/web/feeds/recipes/recipe_diepresse.py
@@ -1,18 +1,42 @@
-import re
+# -*- coding: utf-8 -*-

+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
+
+''' http://www.diepresse.at - Austrian Newspaper '''
+
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class DiePresseRecipe(BasicNewsRecipe):
-    title          = u'diePresse'
+    title = u'diePresse'
+    __author__ = 'Gerhard Aigner'
+    description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.' 
+    publisher ='DiePresse.com'
+    category = 'news, politics, nachrichten, Austria'
+    use_embedded_content = False
+    remove_empty_feeds = True
+    lang = 'de-AT'
+    no_stylesheets = True
+    encoding = 'ISO-8859-1'
+    language = _('German')
+    recursions = 0
    oldest_article = 1
    max_articles_per_feed = 100
-    recursions = 0
-    language = _('German')
-    __author__ = 'Gerhard Aigner'
+  
+    html2lrf_options = [
+                          '--comment'  , description
+                        , '--category' , category
+                        , '--publisher', publisher
+                        ]

+    html2epub_options  = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
+  
    preprocess_regexps = [
 	(re.compile(r'Textversion', re.DOTALL), lambda match: ''),
    ]
+    
    remove_tags = [dict(name='hr'),
 	dict(name='br'),
 	dict(name='small'),
@@ -21,6 +45,7 @@ class DiePresseRecipe(BasicNewsRecipe):
 	dict(name='h1', attrs={'class':'titel'}),
 	dict(name='a', attrs={'class':'print'}),
 	dict(name='div', attrs={'class':'hline'})]
+	
    feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
 	(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
 	(u'Europa', u'http://diepresse.com/rss/EU'),
@@ -29,7 +54,7 @@ class DiePresseRecipe(BasicNewsRecipe):
 	(u'Kultur', u'http://diepresse.com/rss/Kultur'),
 	(u'Leben', u'http://diepresse.com/rss/Leben'),
 	(u'Tech', u'http://diepresse.com/rss/Tech'),
-	(u'Science', u'http://diepresse.com/rss/Science'),
+	(u'Wissenschaft', u'http://diepresse.com/rss/Science'),
 	(u'Bildung', u'http://diepresse.com/rss/Bildung'),
 	(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
 	(u'Recht', u'http://diepresse.com/rss/Recht'),
@@ -38,3 +63,10 @@ class DiePresseRecipe(BasicNewsRecipe):

    def print_version(self, url):
        return url.replace('home','text/home')
+
+    def preprocess_html(self, soup):
+        soup.html['xml:lang'] = self.lang
+        soup.html['lang']     = self.lang
+	mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
+        soup.head.insert(0,mtag)
+	return soup