Pull from trunk

Kovid Goyal 2009-04-18 01:03:52 -07:00
commit 7dd20f593b
9 changed files with 145 additions and 50 deletions

View File

@ -4,7 +4,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from PDF files'''
import sys, os, StringIO
import sys, os, cStringIO
from threading import Thread
from calibre.ebooks.metadata import MetaInformation, authors_to_string
from calibre.ptempfile import TemporaryDirectory
@ -49,20 +50,42 @@ def get_metadata(stream, extract_cover=True):
print >>sys.stderr, msg.encode('utf8')
return mi
class MetadataWriter(Thread):
def __init__(self, out_pdf, buf):
self.out_pdf = out_pdf
self.buf = buf
Thread.__init__(self)
self.daemon = True
def run(self):
try:
self.out_pdf.write(self.buf)
except RuntimeError:
pass
def set_metadata(stream, mi):
stream.seek(0)
# Use a StringIO object for the PDF because we will want to overwrite it
# later, and working on the stream directly could cause issues.
raw = StringIO.StringIO(stream.read())
raw = cStringIO.StringIO(stream.read())
orig_pdf = PdfFileReader(raw)
title = mi.title if mi.title else orig_pdf.documentInfo.title
author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
out_pdf = PdfFileWriter(title=title, author=author)
out_str = cStringIO.StringIO()
writer = MetadataWriter(out_pdf, out_str)
for page in orig_pdf.pages:
out_pdf.addPage(page)
out_str = StringIO.StringIO()
out_pdf.write(out_str)
writer.start()
writer.join(10) # Wait 10 secs for writing to complete
out_pdf.killed = True
writer.join()
if out_pdf.killed:
print 'Failed to set metadata: took too long'
return
stream.seek(0)
stream.truncate()
out_str.seek(0)
@ -70,7 +93,7 @@ def set_metadata(stream, mi):
stream.seek(0)
def get_cover(stream):
data = StringIO.StringIO()
data = cStringIO.StringIO()
try:
pdf = PdfFileReader(stream)
@ -99,3 +122,4 @@ def get_cover(stream):
traceback.print_exc()
return data.getvalue()
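
The MetadataWriter and killed-flag additions above exist so that a pyPdf write which hangs cannot block calibre indefinitely: the write runs in a daemon thread, the caller waits up to ten seconds, then flips the flag and lets the thread unwind on its own. Below is a minimal, self-contained sketch of that pattern; the Worker and SlowTask names are illustrative stand-ins for MetadataWriter and PdfFileWriter, not calibre code.

import threading, time

class SlowTask(object):
    """Stands in for PdfFileWriter: does chunked work and honours a kill flag."""
    def __init__(self):
        self.killed = False

    def write(self, chunks=100):
        for _ in range(chunks):
            if self.killed:                 # cooperative cancellation point
                raise RuntimeError('killed')
            time.sleep(0.2)                 # pretend to serialize one object

class Worker(threading.Thread):
    """Stands in for MetadataWriter: runs the task and swallows the kill."""
    def __init__(self, task):
        threading.Thread.__init__(self)
        self.daemon = True                  # never keeps the process alive
        self.task = task

    def run(self):
        try:
            self.task.write()
        except RuntimeError:
            pass                            # task noticed task.killed and aborted

task = SlowTask()
worker = Worker(task)
worker.start()
worker.join(10)                             # give the write ten seconds
if worker.is_alive():                       # still running: ask it to stop
    task.killed = True
    worker.join()
    print('write abandoned after timeout')

In the sketch, checking worker.is_alive() before flipping the flag avoids reporting a timeout when the write actually finished in time.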

View File

@ -319,6 +319,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.cover_changed = True
def initialize_series(self):
self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow)
all_series = self.db.all_series()
all_series.sort(cmp=lambda x, y : cmp(x[1], y[1]))
series_id = self.db.series_id(self.row)
@ -335,13 +336,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.series.setCurrentIndex(idx)
self.enable_series_index()
pl = self.series.parentWidget().layout()
for i in range(pl.count()):
l = pl.itemAt(i).layout()
if l:
l.invalidate()
l.activate()
def initialize_series_and_publisher(self):
self.initialize_series()
all_publishers = self.db.all_publishers()
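
initialize_series now relies on QComboBox.AdjustToContentsOnFirstShow instead of manually invalidating and re-activating the parent layouts. A rough PyQt sketch of that size-adjust policy in isolation (PyQt5 is used here so the snippet runs standalone; this is not calibre code):

import sys
from PyQt5.QtWidgets import QApplication, QComboBox, QVBoxLayout, QWidget

app = QApplication(sys.argv)
window = QWidget()
layout = QVBoxLayout(window)

series = QComboBox()
# Let Qt grow the widget to fit its items on first show, instead of
# invalidating and re-activating the parent layouts by hand.
series.setSizeAdjustPolicy(QComboBox.AdjustToContentsOnFirstShow)
series.addItems(['A fairly long series name', 'Short'])
layout.addWidget(series)

window.show()
app.exec_()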

Binary file not shown (new image, 509 B)

Binary file not shown (new image, 746 B)

View File

@ -40,6 +40,7 @@ recipe_modules = ['recipe_' + r for r in (
'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet',
'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
'seattle_times',
)]
import re, imp, inspect, time, os

View File

@ -1,14 +1,37 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.derstandard.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard'
__author__ = 'Gerhard Aigner'
title = u'derStandard'
__author__ = 'Gerhard Aigner'
description = u'Nachrichten aus Österreich'
publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'utf-8'
language = _('German')
recursions = 0
oldest_article = 1
max_articles_per_feed = 100
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
@ -20,14 +43,10 @@ class DerStandardRecipe(BasicNewsRecipe):
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
encoding = 'utf-8'
language = _('German')
recursions = 0
remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
preprocess_regexps = [
(re.compile(r'\[[\d*]\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
]
@ -40,3 +59,10 @@ class DerStandardRecipe(BasicNewsRecipe):
if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
return None
return article.link
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
soup.head.insert(0,mtag)
return soup
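
The second hunk above corrects the footnote-stripping regex: in the old pattern \[[\d*]\] the inner brackets form a character class matching a single digit or a literal '*', so multi-digit references such as [12] slip through, while \[[\d]*\] matches any run of digits. A quick illustration with a made-up sample string:

import re

old = re.compile(r'\[[\d*]\]')   # char class: exactly one digit OR a literal '*'
new = re.compile(r'\[[\d]*\]')   # '[' then zero or more digits then ']'

sample = 'Absatz[1] mit Verweisen[12].'
print(old.sub('', sample))   # Absatz mit Verweisen[12].  (two-digit ref left behind)
print(new.sub('', sample))   # Absatz mit Verweisen.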

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
seattletimes.nwsource.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class SeattleTimes(BasicNewsRecipe):
title = 'The Seattle Times'
__author__ = 'Darko Miletic'
description = 'News from Seattle and USA'
publisher = 'The Seattle Times'
category = 'news, politics, USA'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
language = _('English')
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')]
remove_tags = [
dict(name=['object','link','script'])
,dict(name='p', attrs={'class':'permission'})
]
def print_version(self, url):
start_url, sep, rest_url = url.rpartition('_')
rurl, rsep, article_id = start_url.rpartition('/')
return u'http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=' + article_id
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="en-US"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
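
print_version rebuilds the print URL from the document id embedded in the article URL, which is everything between the last '/' and the last '_'. A small illustration, using a made-up article URL in the shape the recipe assumes:

# Hypothetical URL in the documentid_slug.html shape the recipe expects.
url = 'http://seattletimes.nwsource.com/html/localnews/2009012345_quake18.html'

start_url, sep, rest_url = url.rpartition('_')       # drop the trailing slug
rurl, rsep, article_id = start_url.rpartition('/')   # keep the last path piece
print('http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id='
      + article_id)
# -> http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=2009012345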

View File

@ -299,7 +299,7 @@ def readStringFromStream(stream):
elif tok == "t":
tok = "\t"
elif tok == "b":
tok == "\b"
tok = "\b"
elif tok == "f":
tok = "\f"
elif tok == "(":

View File

@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
import struct
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from cStringIO import StringIO
import filters
import utils
import warnings
from generic import *
from generic import DictionaryObject, NameObject, NumberObject, \
createStringObject, ArrayObject, ByteStringObject, StreamObject, \
IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
RectangleObject, DecodedStreamObject
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
# class (typically {@link #PdfFileReader PdfFileReader}).
class PdfFileWriter(object):
def __init__(self,title=u"Unknown",author=u"Unknown"):
self.killed = False
self._header = "%PDF-1.3"
self._objects = [] # array of indirect objects
@ -162,7 +160,7 @@ class PdfFileWriter(object):
# @param stream An object to write the file to. The object must support
# the write method, and the tell method, similar to a file object.
def write(self, stream):
import struct, md5
import md5
externalReferenceMap = {}
self.stack = []
@ -214,6 +212,8 @@ class PdfFileWriter(object):
stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
def _sweepIndirectReferences(self, externMap, data):
if self.killed:
raise RuntimeError('Writer killed')
if isinstance(data, DictionaryObject):
for key, value in data.items():
origvalue = value
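
The killed flag added to PdfFileWriter is checked at the top of _sweepIndirectReferences, so a writer walking a large object graph bails out quickly once set_metadata gives up on it. Below is a generic sketch of that cooperative-cancellation check in a recursive traversal, using plain dicts and lists rather than pyPdf's object model:

class Sweeper(object):
    def __init__(self):
        self.killed = False          # another thread may set this at any time

    def sweep(self, data):
        if self.killed:              # checked on every recursive call
            raise RuntimeError('Writer killed')
        if isinstance(data, dict):
            for value in data.values():
                self.sweep(value)
        elif isinstance(data, list):
            for item in data:
                self.sweep(item)
        # scalar leaves: nothing to traverse

s = Sweeper()
s.sweep({'pages': [{'contents': ['...']}, {'contents': []}]})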