pdf get_cover returns cover image instead of nothing.

2025-07-09 03:04:10 -04:00 · 2009-04-18 07:54:56 -04:00 · 2009-04-18 07:54:56 -04:00 · b104286f61
commit b104286f61
parent 37b820b046 f969ed39fe
24 changed files with 405 additions and 210 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -280,6 +280,7 @@ from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
 from calibre.ebooks.pdf.input import PDFInput
 from calibre.ebooks.txt.input import TXTInput
+from calibre.ebooks.lit.input import LITInput
 from calibre.ebooks.html.input import HTMLInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.txt.output import TXTOutput
@ -287,7 +288,7 @@ from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles

 plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
-        TXTInput, OEBOutput, TXTOutput, PDFOutput]
+        TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -41,6 +41,11 @@ class ConversionOption(object):
    def __eq__(self, other):
        return hash(self) == hash(other)

+    def clone(self):  # Build a new ConversionOption from this one's name, help, switches and choices.
+        return ConversionOption(name=self.name, help=self.help,
+                long_switch=self.long_switch, short_switch=self.short_switch,
+                choices=self.choices)  # the same choices object is handed to the constructor
+
 class OptionRecommendation(object):
    LOW  = 1
    MED  = 2
@ -59,6 +64,10 @@ class OptionRecommendation(object):

        self.validate_parameters()

+    def clone(self):  # New OptionRecommendation with the same value and level, wrapping a clone of the option.
+        return OptionRecommendation(recommended_value=self.recommended_value,
+                level=self.level, option=self.option.clone())  # option is cloned, not shared
+
    def validate_parameters(self):
        if self.option.choices and self.recommended_value not in \
                                                    self.option.choices:
@ -170,8 +179,14 @@ class InputFormatPlugin(Plugin):
            options.debug_input = os.path.abspath(options.debug_input)
            if not os.path.exists(options.debug_input):
                os.makedirs(options.debug_input)
+            if isinstance(ret, basestring):
                shutil.rmtree(options.debug_input)
                shutil.copytree(output_dir, options.debug_input)
+            else:
+                from calibre.ebooks.oeb.writer import OEBWriter
+                w = OEBWriter(pretty_print=options.pretty_print)
+                w(ret, options.debug_input)
+
            log.info('Input debug saved to:', options.debug_input)

        return ret
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -57,7 +57,7 @@ def check_command_line_options(parser, args, log):
        raise SystemExit(1)

    output = args[2]
-    if output.startswith('.'):
+    if output.startswith('.') and output != '.':
        output = os.path.splitext(os.path.basename(input))[0]+output
    output = os.path.abspath(output)

@ -171,6 +171,7 @@ def main(args=sys.argv):

    plumber.run()

+    if plumber.opts.debug_input is None:
        log(_('Output saved to'), ' ', plumber.output)

    return 0
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -32,8 +32,8 @@ class Plumber(object):
        :param input: Path to input file.
        :param output: Path to output file/directory
        '''
-        self.input = input
-        self.output = output
+        self.input = os.path.abspath(input)
+        self.output = os.path.abspath(output)
        self.log = log

        # Initialize the conversion options that are independent of input and
@ -188,15 +188,15 @@ OptionRecommendation(name='language',
 ]


-        input_fmt = os.path.splitext(input)[1]
+        input_fmt = os.path.splitext(self.input)[1]
        if not input_fmt:
            raise ValueError('Input file must have an extension')
        input_fmt = input_fmt[1:].lower()

-        if os.path.exists(output) and os.path.isdir(output):
+        if os.path.exists(self.output) and os.path.isdir(self.output):
            output_fmt = 'oeb'
        else:
-            output_fmt = os.path.splitext(output)[1]
+            output_fmt = os.path.splitext(self.output)[1]
            if not output_fmt:
                output_fmt = '.oeb'
            output_fmt = output_fmt[1:].lower()
@ -323,6 +323,9 @@ OptionRecommendation(name='language',
        self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
                                    self.input_fmt, self.log,
                                    accelerators, tdir)
+        if self.opts.debug_input is not None:
+            self.log('Debug input called, aborting the rest of the pipeline.')
+            return
        if not hasattr(self.oeb, 'manifest'):
            self.oeb = create_oebbook(self.log, self.oeb, self.opts)

@ -365,18 +368,20 @@ OptionRecommendation(name='language',
        self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
                self.opts, self.log)

-def create_oebbook(log, opfpath, opts):
+def create_oebbook(log, path_or_stream, opts, reader=None):
    '''
-    Create an OEBBook from an OPF file.
+    Create an OEBBook.
    '''
-    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    html_preprocessor = HTMLPreProcessor()
-    reader = OEBReader()
    oeb = OEBBook(log, html_preprocessor=html_preprocessor,
            pretty_print=opts.pretty_print)
    # Read OEB Book into OEBBook
-    log.info('Parsing all content...')
-    reader(oeb, opfpath)
+    log('Parsing all content...')
+    if reader is None:
+        from calibre.ebooks.oeb.reader import OEBReader
+        reader = OEBReader
+
+    reader()(oeb, path_or_stream)
    return oeb

--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -252,6 +252,14 @@ class HTMLInput(InputFormatPlugin):
                   )
        ),

+        OptionRecommendation(name='dont_package',  # when set, convert() returns the OPF path early
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Normally this input plugin re-arranges all the input '
+                'files into a standard folder hierarchy. Only use this option '
+                'if you know what you are doing as it can result in various '
+                'nasty side effects in the rest of the conversion pipeline.'
+                )
+        ),
    ])

    def convert(self, stream, opts, file_ext, log,
@ -276,6 +284,9 @@ class HTMLInput(InputFormatPlugin):
            mi.render(open('metadata.opf', 'wb'))
            opfpath = os.path.abspath('metadata.opf')

+        if opts.dont_package:
+            return opfpath
+
        from calibre.ebooks.conversion.plumber import create_oebbook
        oeb = create_oebbook(log, opfpath, opts)

--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize.conversion import InputFormatPlugin
+
+class LITInput(InputFormatPlugin):  # Input plugin declared for the 'lit' file type.
+
+    name        = 'LIT Input'
+    author      = 'Marshall T. Vandegrift'
+    description = 'Convert LIT files to HTML'
+    file_types  = set(['lit'])
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):  # file_ext and accelerators are accepted but not used in this body
+        from calibre.ebooks.lit.reader import LitReader  # imported at call time, not at module import
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        return create_oebbook(log, stream, options, reader=LitReader)  # the reader class is passed; create_oebbook instantiates it
+
+
--- a/src/calibre/ebooks/lit/reader.py
+++ b/src/calibre/ebooks/lit/reader.py
@ -7,13 +7,12 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
    'and Marshall T. Vandegrift <llasram@gmail.com>'

-import sys, struct, os
+import struct, os
 import functools
 import re
 from urlparse import urldefrag
 from cStringIO import StringIO
 from urllib import unquote as urlunquote
-from lxml import etree
 from calibre.ebooks.lit import LitError
 from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
 import calibre.ebooks.lit.mssha1 as mssha1
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@ -1,10 +1,10 @@
 from __future__ import with_statement
-
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''Read meta information from PDF files'''

 import sys, os, cStringIO
+from threading import Thread

 from calibre import FileWrapper
 from calibre.ebooks.metadata import MetaInformation, authors_to_string
@ -13,7 +13,8 @@ from pyPdf import PdfFileReader, PdfFileWriter
 import Image
 try:
    from calibre.utils.PythonMagickWand import \
-        NewMagickWand, MagickReadImage, MagickSetImageFormat, MagickWriteImage
+        NewMagickWand, MagickReadImage, MagickSetImageFormat, \
+        MagickWriteImage, ImageMagick
    _imagemagick_loaded = True
 except:
    _imagemagick_loaded = False
@ -51,9 +52,23 @@ def get_metadata(stream, extract_cover=True):
        print >>sys.stderr, msg.encode('utf8')
    return mi

+class MetadataWriter(Thread):  # Runs out_pdf.write(buf) on a daemon thread.
+
+    def __init__(self, out_pdf, buf):
+        self.out_pdf = out_pdf
+        self.buf = buf
+        Thread.__init__(self)
+        self.daemon = True  # set after Thread.__init__ and before the thread is started
+
+    def run(self):
+        try:
+            self.out_pdf.write(self.buf)
+        except RuntimeError:  # NOTE(review): presumably raised by write() once the caller sets out_pdf.killed - confirm
+            pass
+
 def set_metadata(stream, mi):
    stream.seek(0)
-    # Use a cStringIO object for the pdf because we will want to over
+    # Use a StringIO object for the pdf because we will want to over
    # write it later and if we are working on the stream directly it
    # could cause some issues.
    raw = cStringIO.StringIO(stream.read())
@ -61,10 +76,18 @@ def set_metadata(stream, mi):
    title = mi.title if mi.title else orig_pdf.documentInfo.title
    author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
    out_pdf = PdfFileWriter(title=title, author=author)
+    out_str = cStringIO.StringIO()
+    writer = MetadataWriter(out_pdf, out_str)
    for page in orig_pdf.pages:
        out_pdf.addPage(page)
-    out_str = cStringIO.StringIO()
-    out_pdf.write(out_str)
+    writer.start()
+    writer.join(10) # Wait 10 secs for writing to complete
+    out_pdf.killed = True
+    writer.join()
+    if out_pdf.killed:
+        print 'Failed to set metadata: took too long'
+        return
+
    stream.seek(0)
    stream.truncate()
    out_str.seek(0)
@ -72,12 +95,9 @@ def set_metadata(stream, mi):
    stream.seek(0)

 def get_cover(stream):
-    stream.seek(0)
-    
    data = cStringIO.StringIO()

    try:
-        with FileWrapper(stream) as stream:
        pdf = PdfFileReader(stream)
        output = PdfFileWriter()

@ -87,20 +107,20 @@ def get_cover(stream):
        with TemporaryDirectory('_pdfmeta') as tdir:
            cover_path = os.path.join(tdir, 'cover.pdf')

-                outputStream = file(cover_path, "wb")
+            with open(cover_path, "wb") as outputStream:
                output.write(outputStream)
-                outputStream.close()
                
+            with ImageMagick():
                wand = NewMagickWand()
                MagickReadImage(wand, cover_path)
                MagickSetImageFormat(wand, 'JPEG')
                MagickWriteImage(wand, '%s.jpg' % cover_path)

                img = Image.open('%s.jpg' % cover_path)
-    
                img.save(data, 'JPEG')
    except:
        import traceback
        traceback.print_exc()

    return data.getvalue()
+
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -272,11 +272,7 @@ def XPath(expr):
 def xpath(elem, expr):
    return elem.xpath(expr, namespaces=XPNSMAP)

-def _prepare_xml_for_serialization(root):
-    pass
-
 def xml2str(root, pretty_print=False, strip_comments=False):
-    _prepare_xml_for_serialization(root)
    ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
                          pretty_print=pretty_print)

@ -287,7 +283,6 @@ def xml2str(root, pretty_print=False, strip_comments=False):


 def xml2unicode(root, pretty_print=False):
-    _prepare_xml_for_serialization(root)
    return etree.tostring(root, pretty_print=pretty_print)

 ASCII_CHARS   = set(chr(x) for x in xrange(128))
@ -321,6 +316,25 @@ def urlnormalize(href):
    parts = (urlquote(part) for part in parts)
    return urlunparse(parts)

+class DummyHandler(logging.Handler):  # Forwards WARNING-and-above records to self.log once one is assigned.
+
+    def __init__(self):
+        logging.Handler.__init__(self, logging.WARNING)
+        self.setFormatter(logging.Formatter('%(message)s'))
+        self.log = None  # emit() does nothing while this is None
+
+    def emit(self, record):
+        if self.log is not None:
+            msg = self.format(record)
+            f = self.log.error if record.levelno >= logging.ERROR \
+                    else self.log.warn  # ERROR and above go to log.error, everything else to log.warn
+            f(msg)
+
+
+_css_logger = logging.getLogger('calibre.css')  # dedicated logger; passed to CSSParser as log= in this module
+_css_logger.setLevel(logging.WARNING)
+_css_log_handler = DummyHandler()  # its .log target is assigned elsewhere in this module (_css_log_handler.log = logger)
+_css_logger.addHandler(_css_log_handler)

 class OEBError(Exception):
    """Generic OEB-processing error."""
@ -778,7 +792,8 @@ class Manifest(object):
            data = self.oeb.css_preprocessor(data)
            data = XHTML_CSS_NAMESPACE + data
            parser = CSSParser(loglevel=logging.WARNING,
-                               fetcher=self._fetch_css)
+                               fetcher=self._fetch_css,
+                               log=_css_logger)
            data = parser.parseString(data, href=self.href)
            data.namespaces['h'] = XHTML_NS
            return data
@ -1435,7 +1450,7 @@ class OEBBook(object):
        :attr:`pages`: List of "pages," such as indexed to a print edition of
            the same text.
        """
-
+        _css_log_handler.log = logger
        self.encoding = encoding
        self.html_preprocessor = html_preprocessor
        self.css_preprocessor = css_preprocessor
@ -1450,6 +1465,7 @@ class OEBBook(object):
        self.guide = Guide(self)
        self.toc = TOC()
        self.pages = PageList()
+        self.auto_generated_toc = True

    @classmethod
    def generate(cls, opts):
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@ -13,13 +13,12 @@ from PyQt4.Qt import QFontDatabase

 from calibre.customize.ui import available_input_formats
 from calibre.ebooks.epub.from_html import TITLEPAGE
-from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
+from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.zipfile import safe_replace, ZipFile
 from calibre.utils.config import DynamicConfig
 from calibre.utils.logging import Log
-from calibre import CurrentDir

 def character_count(html):
    '''
@ -57,31 +56,21 @@ class FakeOpts(object):
    max_levels = 5
    input_encoding = None

-def html2opf(path, tdir, log):
-    from calibre.ebooks.html.input import get_filelist
-    from calibre.ebooks.metadata.meta import get_metadata
-    with CurrentDir(tdir):
-        fl = get_filelist(path, tdir, FakeOpts(), log)
-        mi = get_metadata(open(path, 'rb'), 'html')
-        mi = OPFCreator(os.getcwdu(), mi)
-        mi.guide = None
-        entries = [(f.path, 'application/xhtml+xml') for f in fl]
-        mi.create_manifest(entries)
-        mi.create_spine([f.path for f in fl])
-
-        mi.render(open('metadata.opf', 'wb'))
-        opfpath = os.path.abspath('metadata.opf')
-
-    return opfpath
-
-def opf2opf(path, tdir, opts):
-    return path
-
 def is_supported(path):
    ext = os.path.splitext(path)[1].replace('.', '').lower()
    ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
    return ext in available_input_formats()

+
+def write_oebbook(oeb, path):  # Write oeb under path with OEBWriter, then return the first .opf file that walk(path) yields.
+    from calibre.ebooks.oeb.writer import OEBWriter
+    from calibre import walk
+    w = OEBWriter()
+    w(oeb, path)
+    for f in walk(path):
+        if f.endswith('.opf'):
+            return f  # if no .opf is found the loop ends and the function returns None
+
 class EbookIterator(object):

    CHARACTERS_PER_PAGE = 1000
@ -131,17 +120,16 @@ class EbookIterator(object):
    def __enter__(self):
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base  = self._tdir.__enter__()
-        if self.ebook_ext == 'opf':
-            self.pathtoopf = self.pathtoebook
-        elif self.ebook_ext == 'html':
-            self.pathtoopf = html2opf(self.pathtoebook, self.base, self.log)
-        else:
        from calibre.ebooks.conversion.plumber import Plumber
        plumber = Plumber(self.pathtoebook, self.base, self.log)
        plumber.setup_options()
+        if hasattr(plumber.opts, 'dont_package'):
+            plumber.opts.dont_package = True
        self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
                plumber.opts, plumber.input_fmt, self.log,
                {}, self.base)
+        if hasattr(self.pathtoopf, 'manifest'):  # the plugin returned a book object (has .manifest), not an OPF path
+            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)  # write_oebbook(oeb, path): the book itself and the temp dir path


        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
--- a/src/calibre/ebooks/oeb/output.py
+++ b/src/calibre/ebooks/oeb/output.py
@ -16,7 +16,6 @@ class OEBOutput(OutputFormatPlugin):
    author = 'Kovid Goyal'
    file_type = 'oeb'

-
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        self.log, self.opts = log, opts
        if not os.path.exists(output_path):
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@ -349,6 +349,7 @@ class OEBReader(object):
    def _toc_from_ncx(self, item):
        if item is None:
            return False
+        self.log.debug('Reading TOC from NCX...')
        ncx = item.data
        title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
@ -364,6 +365,7 @@ class OEBReader(object):
        result = xpath(opf, 'o2:tours/o2:tour')
        if not result:
            return False
+        self.log.debug('Reading TOC from tour...')
        tour = result[0]
        toc = self.oeb.toc
        toc.title = tour.get('title')
@ -384,6 +386,7 @@ class OEBReader(object):
    def _toc_from_html(self, opf):
        if 'toc' not in self.oeb.guide:
            return False
+        self.log.debug('Reading TOC from HTML...')
        itempath, frag = urldefrag(self.oeb.guide['toc'].href)
        item = self.oeb.manifest.hrefs[itempath]
        html = item.data
@ -414,6 +417,7 @@ class OEBReader(object):
        return True

    def _toc_from_spine(self, opf):
+        self.log.warn('Generating default TOC from spine...')
        toc = self.oeb.toc
        titles = []
        headers = []
@ -441,11 +445,14 @@ class OEBReader(object):
        return True

    def _toc_from_opf(self, opf, item):
+        self.oeb.auto_generated_toc = False
        if self._toc_from_ncx(item): return
-        if self._toc_from_tour(opf): return
-        self.logger.warn('No metadata table of contents found')
+        # Prefer HTML to tour based TOC, since several LIT files
+        # have good HTML TOCs but bad tour based TOCs
        if self._toc_from_html(opf): return
+        if self._toc_from_tour(opf): return
        self._toc_from_spine(opf)
+        self.oeb.auto_generated_toc = True

    def _pages_from_ncx(self, opf, item):
        if item is None:
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@ -51,8 +51,8 @@ class Split(object):
        self.log = oeb.log
        self.map = {}
        self.page_break_selectors = None
-        for item in self.oeb.manifest.items:
-            if etree.iselement(item.data):
+        for item in list(self.oeb.manifest.items):
+            if item.spine_position is not None and etree.iselement(item.data):
                self.split_item(item)

        self.fix_links()
@ -74,7 +74,6 @@ class Split(object):
            self.page_break_selectors = set([])
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
-        page_break_selectors = set([])
            for rule in rules(stylesheets):
                before = getattr(rule.style.getPropertyCSSValue(
                    'page-break-before'), 'cssText', '').strip().lower()
@ -82,20 +81,24 @@ class Split(object):
                    'page-break-after'), 'cssText', '').strip().lower()
                try:
                    if before and before != 'avoid':
-                    page_break_selectors.add((CSSSelector(rule.selectorText),
+                        self.page_break_selectors.add((CSSSelector(rule.selectorText),
                            True))
                except:
                    pass
                try:
                    if after and after != 'avoid':
-                    page_break_selectors.add((CSSSelector(rule.selectorText),
+                        self.page_break_selectors.add((CSSSelector(rule.selectorText),
                            False))
                except:
                    pass

        page_breaks = set([])
-        for selector, before in page_break_selectors:
-            for elem in selector(item.data):
+        for selector, before in self.page_break_selectors:
+            body = item.data.xpath('//h:body', namespaces=NAMESPACES)
+            if not body:
+                continue
+            for elem in selector(body[0]):
+                if elem not in body:
                    if before:
                        elem.set('pb_before', '1')
                    page_breaks.add(elem)
@ -136,8 +139,10 @@ class Split(object):
        if href in self.map:
            anchor_map = self.map[href]
            nhref = anchor_map[frag if frag else None]
+            nhref = self.current_item.relhref(nhref)
            if frag:
-                nhref = '#'.join(href, frag)
+                nhref = '#'.join((nhref, frag))
+
            return nhref
        return url

@ -153,7 +158,7 @@ class FlowSplitter(object):
        self.page_breaks    = page_breaks
        self.page_break_ids = page_break_ids
        self.max_flow_size  = max_flow_size
-        self.base           = item.abshref(item.href)
+        self.base           = item.href

        base, ext = os.path.splitext(self.base)
        self.base = base.replace('%', '%%')+'_split_%d'+ext
@ -192,9 +197,9 @@ class FlowSplitter(object):
        self.trees = []
        tree = orig_tree
        for pattern, before in ordered_ids:
-            self.log.debug('\t\tSplitting on page-break')
            elem = pattern(tree)
            if elem:
+                self.log.debug('\t\tSplitting on page-break')
                before, after = self.do_split(tree, elem[0], before)
                self.trees.append(before)
                tree = after
@ -414,13 +419,14 @@ class FlowSplitter(object):
                elem.attrib.pop(SPLIT_ATTR, None)
                elem.attrib.pop(SPLIT_POINT_ATTR, '0')

-        spine_pos = self.item.spine_pos
-        for current, tree in zip(map(reversed, (self.files, self.trees))):
+        spine_pos = self.item.spine_position
+        for current, tree in zip(*map(reversed, (self.files, self.trees))):
            for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
                href = a.get('href').strip()
                if href.startswith('#'):
                    anchor = href[1:]
                    file = self.anchor_map[anchor]
+                    file = self.item.relhref(file)
                    if file != current:
                        a.set('href', file+href)

@ -430,12 +436,12 @@ class FlowSplitter(object):
            self.oeb.spine.insert(spine_pos, new_item, self.item.linear)

        if self.oeb.guide:
-            for ref in self.oeb.guide:
+            for ref in self.oeb.guide.values():
                href, frag = urldefrag(ref.href)
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
-                        nhref = '#'.join(nhref, frag)
+                        nhref = '#'.join((nhref, frag))
                    ref.href = nhref

        def fix_toc_entry(toc):
@ -444,7 +450,7 @@ class FlowSplitter(object):
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
-                        nhref = '#'.join(nhref, frag)
+                        nhref = '#'.join((nhref, frag))
                    toc.href = nhref
            for x in toc:
                fix_toc_entry(x)
--- a/src/calibre/ebooks/oeb/writer.py
+++ b/src/calibre/ebooks/oeb/writer.py
@ -49,7 +49,7 @@ class OEBWriter(object):

    def __call__(self, oeb, path):
        """
-        Read the book in the :class:`OEBBook` object :param:`oeb` to a file
+        Write the book in the :class:`OEBBook` object :param:`oeb` to a folder
        at :param:`path`.
        """
        version = int(self.version[0])
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@ -319,6 +319,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
        self.cover_changed = True

    def initialize_series(self):
+        self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow)
        all_series = self.db.all_series()
        all_series.sort(cmp=lambda x, y : cmp(x[1], y[1]))
        series_id = self.db.series_id(self.row)
@ -335,13 +336,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
            self.series.setCurrentIndex(idx)
            self.enable_series_index()

-        pl = self.series.parentWidget().layout()
-        for i in range(pl.count()):
-            l =  pl.itemAt(i).layout()
-            if l:
-                l.invalidate()
-                l.activate()
-
    def initialize_series_and_publisher(self):
        self.initialize_series()
        all_publishers = self.db.all_publishers()
--- a/src/calibre/gui2/images/news/der_standard.png
+++ b/src/calibre/gui2/images/news/der_standard.png
--- a/src/calibre/gui2/images/news/diepresse.png
+++ b/src/calibre/gui2/images/news/diepresse.png
--- a/src/calibre/gui2/images/news/seattle_times.png
+++ b/src/calibre/gui2/images/news/seattle_times.png
--- a/src/calibre/web/feeds/recipes/init.py
+++ b/src/calibre/web/feeds/recipes/init.py
@ -40,6 +40,7 @@ recipe_modules = ['recipe_' + r for r in (
           'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet',
           'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
           'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
+           'seattle_times',
          )]

 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/recipe_der_standard.py
+++ b/src/calibre/web/feeds/recipes/recipe_der_standard.py
@ -1,3 +1,8 @@
+#!/usr/bin/env  python
+# -*- coding: utf-8 -*-
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'

 ''' http://www.derstandard.at - Austrian Newspaper '''
 import re
@ -6,9 +11,27 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class DerStandardRecipe(BasicNewsRecipe):
    title = u'derStandard'
    __author__ = 'Gerhard Aigner'
-
+    description = u'Nachrichten aus Österreich' 
+    publisher ='derStandard.at'
+    category = 'news, politics, nachrichten, Austria'
+    use_embedded_content = False
+    remove_empty_feeds = True
+    lang = 'de-AT'
+    no_stylesheets = True
+    encoding = 'utf-8'
+    language = _('German')
+    recursions = 0
    oldest_article = 1
    max_articles_per_feed = 100
+    
+    html2lrf_options = [
+                          '--comment'  , description
+                        , '--category' , category
+                        , '--publisher', publisher
+                        ]
+
+    html2epub_options  = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
+    
    feeds          = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
        (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
        (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
@ -20,14 +43,10 @@ class DerStandardRecipe(BasicNewsRecipe):
        (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
        (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
        (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
-
-    encoding = 'utf-8'
-    language = _('German')
-    recursions = 0
    remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
        dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
    preprocess_regexps = [
-        (re.compile(r'\[[\d*]\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
+        (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
    ]
    
@ -40,3 +59,10 @@ class DerStandardRecipe(BasicNewsRecipe):
        if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
            return None
        return article.link
+
+    def preprocess_html(self, soup):  # Set lang attributes on <html> and put a Content-Type meta string first in <head>.
+        soup.html['xml:lang'] = self.lang
+        soup.html['lang']     = self.lang
+        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'  # charset follows the recipe's encoding attribute
+        soup.head.insert(0,mtag)  # mtag is inserted as a plain string at position 0
+        return soup  
--- a/src/calibre/web/feeds/recipes/recipe_diepresse.py
+++ b/src/calibre/web/feeds/recipes/recipe_diepresse.py
@ -1,18 +1,42 @@
-import re
+# -*- coding: utf-8 -*-

+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
+
+''' http://www.diepresse.at - Austrian Newspaper '''
+
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class DiePresseRecipe(BasicNewsRecipe):
    title = u'diePresse'
+    __author__ = 'Gerhard Aigner'
+    description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.' 
+    publisher ='DiePresse.com'
+    category = 'news, politics, nachrichten, Austria'
+    use_embedded_content = False
+    remove_empty_feeds = True
+    lang = 'de-AT'
+    no_stylesheets = True
+    encoding = 'ISO-8859-1'
+    language = _('German')
+    recursions = 0
    oldest_article = 1
    max_articles_per_feed = 100
-    recursions = 0
-    language = _('German')
-    __author__ = 'Gerhard Aigner'
+  
+    html2lrf_options = [
+                          '--comment'  , description
+                        , '--category' , category
+                        , '--publisher', publisher
+                        ]
+
+    html2epub_options  = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
  
    preprocess_regexps = [
 	(re.compile(r'Textversion', re.DOTALL), lambda match: ''),
    ]
+    
    remove_tags = [dict(name='hr'),
 	dict(name='br'),
 	dict(name='small'),
@ -21,6 +45,7 @@ class DiePresseRecipe(BasicNewsRecipe):
 	dict(name='h1', attrs={'class':'titel'}),
 	dict(name='a', attrs={'class':'print'}),
 	dict(name='div', attrs={'class':'hline'})]
+	
    feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
 	(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
 	(u'Europa', u'http://diepresse.com/rss/EU'),
@ -29,7 +54,7 @@ class DiePresseRecipe(BasicNewsRecipe):
 	(u'Kultur', u'http://diepresse.com/rss/Kultur'),
 	(u'Leben', u'http://diepresse.com/rss/Leben'),
 	(u'Tech', u'http://diepresse.com/rss/Tech'),
-	(u'Science', u'http://diepresse.com/rss/Science'),
+	(u'Wissenschaft', u'http://diepresse.com/rss/Science'),
 	(u'Bildung', u'http://diepresse.com/rss/Bildung'),
 	(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
 	(u'Recht', u'http://diepresse.com/rss/Recht'),
@ -38,3 +63,10 @@ class DiePresseRecipe(BasicNewsRecipe):

    def print_version(self, url):
        return url.replace('home','text/home')
+
+    def preprocess_html(self, soup):
+        '''Set both lang attributes on <html> to self.lang, put a Content-Type <meta> string first in <head>, return soup.'''
+        soup.html['xml:lang'] = soup.html['lang'] = self.lang
+        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
+        soup.head.insert(0, mtag)
+        return soup
--- a/src/calibre/web/feeds/recipes/recipe_seattle_times.py
+++ b/src/calibre/web/feeds/recipes/recipe_seattle_times.py
@ -0,0 +1,50 @@
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+'''
+seattletimes.nwsource.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class SeattleTimes(BasicNewsRecipe):
+    '''News recipe for The Seattle Times (seattletimes.nwsource.com).'''
+    title = 'The Seattle Times'
+    __author__ = 'Darko Miletic'
+    description = 'News from Seattle and USA'
+    publisher = 'The Seattle Times'
+    category = 'news, politics, USA'
+    oldest_article = 2
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'cp1252'
+    language = _('English')
+
+    # Option values assembled from the description/category/publisher fields above.
+    html2lrf_options = ['--comment', description, '--category', category, '--publisher', publisher]
+    html2epub_options = ''.join(['publisher="', publisher, '"\ncomments="', description, '"\ntags="', category, '"'])
+
+    feeds = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')]
+
+    # Tag selectors: <object>/<link>/<script> elements and <p class="permission">.
+    remove_tags = [
+        dict(name=['object', 'link', 'script']),
+        dict(name='p', attrs={'class': 'permission'}),
+    ]
+
+    def print_version(self, url):
+        '''Return the PrintStory.pl URL for the document id embedded in *url*.'''
+        # The id is whatever follows the last '/' in the part of the URL preceding its last '_'.
+        before_underscore = url.rpartition('_')[0]
+        document_id = before_underscore.rpartition('/')[2]
+        return u'http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=' + document_id
+
+    def preprocess_html(self, soup):
+        '''Put a Content-Language <meta> string first in <head>, strip every style attribute, return soup.'''
+        soup.head.insert(0, '<meta http-equiv="Content-Language" content="en-US"/>')
+        for styled in soup.findAll(style=True):
+            del styled['style']
+        return soup
+
--- a/src/pyPdf/generic.py
+++ b/src/pyPdf/generic.py
@ -299,7 +299,7 @@ def readStringFromStream(stream):
            elif tok == "t":
                tok = "\t"
            elif tok == "b":
-                tok == "\b"
+                tok = "\b"
            elif tok == "f":
                tok = "\f"
            elif tok == "(":
--- a/src/pyPdf/pdf.py
+++ b/src/pyPdf/pdf.py
@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
 __author_email__ = "biziqe@mathieu.fenniak.net"

 import struct
-try:
 from cStringIO import StringIO
-except ImportError:
-    from StringIO import StringIO

-import filters
-import utils
-import warnings
-from generic import *
+from generic import DictionaryObject, NameObject, NumberObject, \
+createStringObject, ArrayObject, ByteStringObject, StreamObject, \
+IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
+RectangleObject, DecodedStreamObject
 from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList


@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
 # class (typically {@link #PdfFileReader PdfFileReader}).
 class PdfFileWriter(object):
    def __init__(self,title=u"Unknown",author=u"Unknown"):
+        self.killed = False
        self._header = "%PDF-1.3"
        self._objects = []  # array of indirect objects

@ -162,7 +160,7 @@ class PdfFileWriter(object):
    # @param stream An object to write the file to.  The object must support
    # the write method, and the tell method, similar to a file object.
    def write(self, stream):
-        import struct, md5
+        import md5

        externalReferenceMap = {}
        self.stack = []
@ -214,6 +212,8 @@ class PdfFileWriter(object):
        stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))

    def _sweepIndirectReferences(self, externMap, data):
+        if self.killed:
+            raise RuntimeError('Writer killed')
        if isinstance(data, DictionaryObject):
            for key, value in data.items():
                origvalue = value