From 36fd295ca12540e73ee7bcde2b3e896a5da53478 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 20 Mar 2009 19:39:26 -0400
Subject: [PATCH 01/16] any2txt converter

---
 src/calibre/ebooks/htmlsymbols.py  | 219 ++++++++++++++++++++++++++
 src/calibre/ebooks/txt/__init__.py |   9 ++
 src/calibre/ebooks/txt/from_any.py |  74 +++++++++
 src/calibre/ebooks/txt/writer.py   | 237 +++++++++++++++++++++++++++++
 4 files changed, 539 insertions(+)
 create mode 100644 src/calibre/ebooks/htmlsymbols.py
 create mode 100644 src/calibre/ebooks/txt/__init__.py
 create mode 100644 src/calibre/ebooks/txt/from_any.py
 create mode 100644 src/calibre/ebooks/txt/writer.py

diff --git a/src/calibre/ebooks/htmlsymbols.py b/src/calibre/ebooks/htmlsymbols.py
new file mode 100644
index 0000000000..9b50f20fcd
--- /dev/null
+++ b/src/calibre/ebooks/htmlsymbols.py
@@ -0,0 +1,219 @@
+# -*- coding: utf-8 -*-
+'''
+Mapping of non-ASCII symbols to their corresponding HTML entity number and name
+'''
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+
+# http://www.w3schools.com/tags/ref_symbols.asp
+HTML_SYMBOLS = {
+                # Math Symbols
+                u'∀' : ['&#8704;', '&forall;'], # for all
+                u'∂' : ['&#8706;', '&part;'], # part
+                u'∃' : ['&#8707;', '&exists;'], # exists
+                u'∅' : ['&#8709;', '&empty;'], # empty
+                u'∇' : ['&#8711;', '&nabla;'], # nabla
+                u'∈' : ['&#8712;', '&isin;'], # isin
+                u'∉' : ['&#8713;', '&notin;'], # notin
+                u'∋' : ['&#8715;', '&ni;'], # ni
+                u'∏' : ['&#8719;', '&prod;'], # prod
+                u'∑' : ['&#8721;', '&sum;'], # sum
+                u'−' : ['&#8722;', '&minus;'], # minus
+                u'∗' : ['&#8727;', '&lowast;'], # lowast
+                u'√' : ['&#8730;', '&radic;'], # square root
+                u'∝' : ['&#8733;', '&prop;'], # proportional to
+                u'∞' : ['&#8734;', '&infin;'], # infinity
+                u'∠' : ['&#8736;', '&ang;'], # angle
+                u'∧' : ['&#8743;', '&and;'], # and
+                u'∨' : ['&#8744;', '&or;'], # or
+                u'∩' : ['&#8745;', '&cap;'], # cap
+                u'∪' : ['&#8746;', '&cup;'], # cup
+                u'∫' : ['&#8747;', '&int;'], # integral
+                u'∴' : ['&#8756;', '&there4;'], # therefore
+                u'∼' : ['&#8764;', '&sim;'], # similar to
+                u'≅' : ['&#8773;', '&cong;'], # approximately equal
+                u'≈' : ['&#8776;', '&asymp;'], # almost equal
+                u'≠' : ['&#8800;', '&ne;'], # not equal
+                u'≡' : ['&#8801;', '&equiv;'], # equivalent
+                u'≤' : ['&#8804;', '&le;'], # less or equal
+                u'≥' : ['&#8805;', '&ge;'], # greater or equal
+                u'⊂' : ['&#8834;', '&sub;'], # subset of
+                u'⊃' : ['&#8835;', '&sup;'], # superset of
+                u'⊄' : ['&#8836;', '&nsub;'], # not subset of
+                u'⊆' : ['&#8838;', '&sube;'], # subset or equal
+                u'⊇' : ['&#8839;', '&supe;'], # superset or equal
+                u'⊕' : ['&#8853;', '&oplus;'], # circled plus
+                u'⊗' : ['&#8855;', '&otimes;'], # circled times
+                u'⊥' : ['&#8869;', '&perp;'], # perpendicular
+                u'⋅' : ['&#8901;', '&sdot;'], # dot operator
+                # Greek Letters
+                u'Α' : ['&#913;', '&Alpha;'], # Alpha
+                u'Β' : ['&#914;', '&Beta;'], # Beta
+                u'Γ' : ['&#915;', '&Gamma;'], # Gamma
+                u'Δ' : ['&#916;', '&Delta;'], # Delta
+                u'Ε' : ['&#917;', '&Epsilon;'], # Epsilon
+                u'Ζ' : ['&#918;', '&Zeta;'], # Zeta
+                u'Η' : ['&#919;', '&Eta;'], # Eta
+                u'Θ' : ['&#920;', '&Theta;'], # Theta
+                u'Ι' : ['&#921;', '&Iota;'], # Iota
+                u'Κ' : ['&#922;', '&Kappa;'], # Kappa
+                u'Λ' : ['&#923;', '&Lambda;'], # Lambda
+                u'Μ' : ['&#924;', '&Mu;'], # Mu
+                u'Ν' : ['&#925;', '&Nu;'], # Nu
+                u'Ξ' : ['&#926;', '&Xi;'], # Xi
+                u'Ο' : ['&#927;', '&Omicron;'], # Omicron
+                u'Π' : ['&#928;', '&Pi;'], # Pi
+                u'Ρ' : ['&#929;', '&Rho;'], # Rho
+                u'Σ' : ['&#931;', '&Sigma;'], # Sigma
+                u'Τ' : ['&#932;', '&Tau;'], # Tau
+                u'Υ' : ['&#933;', '&Upsilon;'], # Upsilon
+                u'Φ' : ['&#934;', '&Phi;'], # Phi
+                u'Χ' : ['&#935;', '&Chi;'], # Chi
+                u'Ψ' : ['&#936;', '&Psi;'], # Psi
+                u'ω' : ['&#969;', '&omega;'], # omega
+                u'ϑ' : ['&#977;', '&thetasym;'], # theta symbol
+                u'ϒ' : ['&#978;', '&upsih;'], # upsilon symbol
+                u'ϖ' : ['&#982;', '&piv;'], # pi symbol
+                # Other
+                u'Œ' : ['&#338;', '&OElig;'], # capital ligature OE
+                u'œ' : ['&#339;', '&oelig;'], # small ligature oe
+                u'Š' : ['&#352;', '&Scaron;'], # capital S with caron
+                u'š' : ['&#353;', '&scaron;'], # small S with caron
+                u'Ÿ' : ['&#376;', '&Yuml;'], # capital Y with diaeresis
+                u'ƒ' : ['&#402;', '&fnof;'], # f with hook
+                u'ˆ' : ['&#710;', '&circ;'], # modifier letter circumflex accent
+                u'˜' : ['&#732;', '&tilde;'], # small tilde
+                u'–' : ['&#8211;', '&ndash;'], # en dash
+                u'—' : ['&#8212;', '&mdash;'], # em dash
+                u'‘' : ['&#8216;', '&lsquo;'], # left single quotation mark
+                u'’' : ['&#8217;', '&rsquo;'], # right single quotation mark
+                u'‚' : ['&#8218;', '&sbquo;'], # single low-9 quotation mark
+                u'“' : ['&#8220;', '&ldquo;'], # left double quotation mark
+                u'”' : ['&#8221;', '&rdquo;'], # right double quotation mark
+                u'„' : ['&#8222;', '&bdquo;'], # double low-9 quotation mark
+                u'†' : ['&#8224;', '&dagger;'], # dagger
+                u'‡' : ['&#8225;', '&Dagger;'], # double dagger
+                u'•' : ['&#8226;', '&bull;'], # bullet
+                u'…' : ['&#8230;', '&hellip;'], # horizontal ellipsis
+                u'‰' : ['&#8240;', '&permil;'], # per mille 
+                u'′' : ['&#8242;', '&prime;'], # minutes
+                u'″' : ['&#8243;', '&Prime;'], # seconds
+                u'‹' : ['&#8249;', '&lsaquo;'], # single left angle quotation
+                u'›' : ['&#8250;', '&rsaquo;'], # single right angle quotation
+                u'‾' : ['&#8254;', '&oline;'], # overline
+                u'€' : ['&#8364;', '&euro;'], # euro
+                u'™' : ['&#8482;', '&trade;'], # trademark
+                u'←' : ['&#8592;', '&larr;'], # left arrow
+                u'↑' : ['&#8593;', '&uarr;'], # up arrow
+                u'→' : ['&#8594;', '&rarr;'], # right arrow
+                u'↓' : ['&#8595;', '&darr;'], # down arrow
+                u'↔' : ['&#8596;', '&harr;'], # left right arrow
+                u'↵' : ['&#8629;', '&crarr;'], # carriage return arrow
+                u'⌈' : ['&#8968;', '&lceil;'], # left ceiling
+                u'⌉' : ['&#8969;', '&rceil;'], # right ceiling
+                u'⌊' : ['&#8970;', '&lfloor;'], # left floor
+                u'⌋' : ['&#8971;', '&rfloor;'], # right floor
+                u'◊' : ['&#9674;', '&loz;'], # lozenge
+                u'♠' : ['&#9824;', '&spades;'], # spade
+                u'♣' : ['&#9827;', '&clubs;'], # club
+                u'♥' : ['&#9829;', '&hearts;'], # heart
+                u'♦' : ['&#9830;', '&diams;'], # diamond
+                # Extra http://www.ascii.cl/htmlcodes.htm
+                u'<' : ['&#60;', '&lt;'], # less than sign
+                u'>' : ['&#62;', '&gt;'], # greater than sign
+                u'¡' : ['&#161;', '&iexcl;'], # inverted exclamation mark
+                u'¢' : ['&#162;', '&cent;'], # cent sign
+                u'£' : ['&#163;', '&pound;'], # pound sign
+                u'¤' : ['&#164;', '&curren;'], # currency sign
+                u'¥' : ['&#165;', '&yen;'], # yen sign
+                u'¦' : ['&#166;', '&brvbar;'], # broken vertical bar
+                u'§' : ['&#167;', '&sect;'], # section sign
+                u'¨' : ['&#168;', '&uml;'], # spacing diaeresis - umlaut
+                u'©' : ['&#169;', '&copy;'], # copyright sign
+                u'ª' : ['&#170;', '&ordf;'], # feminine ordinal indicator
+                u'«' : ['&#171;', '&laquo;'], # left double angle quotes
+                u'¬' : ['&#172;', '&not;'], # not sign
+                u'®' : ['&#174;', '&reg;'], # registered trade mark sign
+                u'¯' : ['&#175;', '&macr;'], # spacing macron - overline
+                u'°' : ['&#176;', '&deg;'], # degree sign
+                u'±' : ['&#177;', '&plusmn;'], # plus-or-minus sign
+                u'²' : ['&#178;', '&sup2;'], # superscript two - squared
+                u'³' : ['&#179;', '&sup3;'], # superscript three - cubed
+                u'´' : ['&#180;', '&acute;'], # acute accent - spacing acute
+                u'µ' : ['&#181;', '&micro;'], # micro sign
+                u'¶' : ['&#182;', '&para;'], # pilcrow sign - paragraph sign
+                u'·' : ['&#183;', '&middot;'], # middle dot - Georgian comma
+                u'¸' : ['&#184;', '&cedil;'], # spacing cedilla
+                u'¹' : ['&#185;', '&sup1;'], # superscript one
+                u'º' : ['&#186;', '&ordm;'], # masculine ordinal indicator
+                u'»' : ['&#187;', '&raquo;'], # right double angle quotes
+                u'¼' : ['&#188;', '&frac14;'], # fraction one quarter
+                u'½' : ['&#189;', '&frac12;'], # fraction one half
+                u'¾' : ['&#190;', '&frac34;'], # fraction three quarters
+                u'¿' : ['&#191;', '&iquest;'], # inverted question mark
+                u'À' : ['&#192;', '&Agrave;'], # latin capital letter A with grave
+                u'Á' : ['&#193;', '&Aacute;'], # latin capital letter A with acute
+                u'Â' : ['&#194;', '&Acirc;'], # latin capital letter A with circumflex
+                u'Ã' : ['&#195;', '&Atilde;'], # latin capital letter A with tilde
+                u'Ä' : ['&#196;', '&Auml;'], # latin capital letter A with diaeresis
+                u'Å' : ['&#197;', '&Aring;'], # latin capital letter A with ring above
+                u'Æ' : ['&#198;', '&AElig;'], # latin capital letter AE
+                u'Ç' : ['&#199;', '&Ccedil;'], # latin capital letter C with cedilla
+                u'È' : ['&#200;', '&Egrave;'], # latin capital letter E with grave
+                u'É' : ['&#201;', '&Eacute;'], # latin capital letter E with acute
+                u'Ê' : ['&#202;', '&Ecirc;'], # latin capital letter E with circumflex
+                u'Ë' : ['&#203;', '&Euml;'], # latin capital letter E with diaeresis
+                u'Ì' : ['&#204;', '&Igrave;'], # latin capital letter I with grave
+                u'Í' : ['&#205;', '&Iacute;'], # latin capital letter I with acute
+                u'Î' : ['&#206;', '&Icirc;'], # latin capital letter I with circumflex
+                u'Ï' : ['&#207;', '&Iuml;'], # latin capital letter I with diaeresis
+                u'Ð' : ['&#208;', '&ETH;'], # latin capital letter ETH
+                u'Ñ' : ['&#209;', '&Ntilde;'], # latin capital letter N with tilde
+                u'Ò' : ['&#210;', '&Ograve;'], # latin capital letter O with grave
+                u'Ó' : ['&#211;', '&Oacute;'], # latin capital letter O with acute
+                u'Ô' : ['&#212;', '&Ocirc;'], # latin capital letter O with circumflex
+                u'Õ' : ['&#213;', '&Otilde;'], # latin capital letter O with tilde
+                u'Ö' : ['&#214;', '&Ouml;'], # latin capital letter O with diaeresis
+                u'×' : ['&#215;', '&times;'], # multiplication sign
+                u'Ø' : ['&#216;', '&Oslash;'], # latin capital letter O with slash
+                u'Ù' : ['&#217;', '&Ugrave;'], # latin capital letter U with grave
+                u'Ú' : ['&#218;', '&Uacute;'], # latin capital letter U with acute
+                u'Û' : ['&#219;', '&Ucirc;'], # latin capital letter U with circumflex
+                u'Ü' : ['&#220;', '&Uuml;'], # latin capital letter U with diaeresis
+                u'Ý' : ['&#221;', '&Yacute;'], # latin capital letter Y with acute
+                u'Þ' : ['&#222;', '&THORN;'], # latin capital letter THORN
+                u'ß' : ['&#223;', '&szlig;'], # latin small letter sharp s - ess-zed
+                u'à' : ['&#224;', '&agrave;'], # latin small letter a with grave
+                u'á' : ['&#225;', '&aacute;'], # latin small letter a with acute
+                u'â' : ['&#226;', '&acirc;'], # latin small letter a with circumflex
+                u'ã' : ['&#227;', '&atilde;'], # latin small letter a with tilde
+                u'ä' : ['&#228;', '&auml;'], # latin small letter a with diaeresis
+                u'å' : ['&#229;', '&aring;'], # latin small letter a with ring above
+                u'æ' : ['&#230;', '&aelig;'], # latin small letter ae
+                u'ç' : ['&#231;', '&ccedil;'], # latin small letter c with cedilla
+                u'è' : ['&#232;', '&egrave;'], # latin small letter e with grave
+                u'é' : ['&#233;', '&eacute;'], # latin small letter e with acute
+                u'ê' : ['&#234;', '&ecirc;'], # latin small letter e with circumflex
+                u'ë' : ['&#235;', '&euml;'], # latin small letter e with diaeresis
+                u'ì' : ['&#236;', '&igrave;'], # latin small letter i with grave
+                u'í' : ['&#237;', '&iacute;'], # latin small letter i with acute
+                u'î' : ['&#238;', '&icirc;'], # latin small letter i with circumflex
+                u'ï' : ['&#239;', '&iuml;'], # latin small letter i with diaeresis
+                u'ð' : ['&#240;', '&eth;'], # latin small letter eth
+                u'ñ' : ['&#241;', '&ntilde;'], # latin small letter n with tilde
+                u'ò' : ['&#242;', '&ograve;'], # latin small letter o with grave
+                u'ó' : ['&#243;', '&oacute;'], # latin small letter o with acute
+                u'ô' : ['&#244;', '&ocirc;'], # latin small letter o with circumflex
+                u'õ' : ['&#245;', '&otilde;'], # latin small letter o with tilde
+                u'ö' : ['&#246;', '&ouml;'], # latin small letter o with diaeresis
+                u'÷' : ['&#247;', '&divide;'], # division sign
+                u'ø' : ['&#248;', '&oslash;'], # latin small letter o with slash
+                u'ù' : ['&#249;', '&ugrave;'], # latin small letter u with grave
+                u'ú' : ['&#250;', '&uacute;'], # latin small letter u with acute
+                u'û' : ['&#251;', '&ucirc;'], # latin small letter u with circumflex
+                u'ü' : ['&#252;', '&uuml;'], # latin small letter u with diaeresis
+                u'ý' : ['&#253;', '&yacute;'], # latin small letter y with acute
+                u'þ' : ['&#254;', '&thorn;'], # latin small letter thorn
+                u'ÿ' : ['&#255;', '&yuml;'], # latin small letter y with diaeresis
+               }
+
diff --git a/src/calibre/ebooks/txt/__init__.py b/src/calibre/ebooks/txt/__init__.py
new file mode 100644
index 0000000000..dfdbbdb5e2
--- /dev/null
+++ b/src/calibre/ebooks/txt/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, John Schember john@nachtimwald.com'
+__docformat__ = 'restructuredtext en'
+
+'''
+Used for txt output
+'''
+
diff --git a/src/calibre/ebooks/txt/from_any.py b/src/calibre/ebooks/txt/from_any.py
new file mode 100644
index 0000000000..caf5364c3c
--- /dev/null
+++ b/src/calibre/ebooks/txt/from_any.py
@@ -0,0 +1,74 @@
+'''
+Convert any ebook format to TXT.
+'''
+
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \
+    'and Marshall T. Vandegrift <llasram@gmail.com>' \
+    'and John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import sys, os, glob, logging
+
+from calibre.ebooks.epub.from_any import any2epub, formats, USAGE
+from calibre.ebooks.epub import config as common_config
+from calibre.ptempfile import TemporaryDirectory
+from calibre.ebooks.txt.writer import oeb2txt, config as txt_config
+
+def config(defaults=None):
+    c = common_config(defaults=defaults, name='txt')
+    c.remove_opt('profile')
+    del c.option_set.groups['metadata']
+    del c.option_set.groups['traversal']
+    del c.option_set.groups['structure detection']
+    del c.option_set.groups['toc']
+    del c.option_set.groups['page layout']
+    txtc = txt_config(defaults=defaults)
+    c.update(txtc)
+    return c 
+
+def option_parser(usage=USAGE):
+    usage = usage % ('TXT', formats())
+    parser = config().option_parser(usage=usage)
+    return parser
+
+def any2txt(opts, path, notification=None):
+    ext = os.path.splitext(path)[1]
+    if not ext:
+        raise ValueError('Unknown file type: '+path)
+    ext = ext.lower()[1:]
+    
+    if opts.output is None:
+        opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt'
+    
+    opts.output = os.path.abspath(opts.output)
+    orig_output = opts.output
+    
+    with TemporaryDirectory('_any2txt') as tdir:
+        oebdir = os.path.join(tdir, 'oeb')
+        os.mkdir(oebdir)
+        opts.output = os.path.join(tdir, 'dummy.epub')
+        opts.profile = 'None'
+        opts.dont_split_on_page_breaks = True
+        orig_bfs = opts.base_font_size2
+        opts.base_font_size2 = 0
+        any2epub(opts, path, create_epub=False, oeb_cover=False, extract_to=oebdir)
+        opts.base_font_size2 = orig_bfs
+        opf = glob.glob(os.path.join(oebdir, '*.opf'))[0]
+        opts.output = orig_output
+        logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...'))
+        oeb2txt(opts, opf)
+
+def main(args=sys.argv):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    if len(args) < 2:
+        parser.print_help()
+        print 'No input file specified.'
+        return 1
+    any2txt(opts, args[1])
+    
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py
new file mode 100644
index 0000000000..0fbf4a634c
--- /dev/null
+++ b/src/calibre/ebooks/txt/writer.py
@@ -0,0 +1,237 @@
+# -*- coding: utf-8 -*-
+'''
+Write content to TXT.
+'''
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+
+import os, logging, re, sys
+
+from BeautifulSoup import BeautifulSoup
+
+from calibre import LoggingInterface
+from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
+from calibre.ebooks.epub.iterator import SpineItem
+from calibre.ebooks.metadata import authors_to_string
+from calibre.ebooks.metadata.meta import metadata_from_formats
+from calibre.ebooks.metadata.opf2 import OPF
+from calibre.customize.ui import run_plugins_on_postprocess
+from calibre.utils.config import Config, StringConfig
+
+class TXTWriter(object):
+    def __init__(self, newline):
+        self.newline = newline
+
+    def dump(self, oebpath, path, metadata):
+        opf = OPF(oebpath, os.path.dirname(oebpath))
+        spine = [SpineItem(i.path) for i in opf.spine]
+
+        tmpout = ''
+        for item in spine:
+            with open(item, 'r') as itemf:
+                content = itemf.read().decode(item.encoding)
+                # Convert newlines to unix style \n for processing. These
+                # will be changed to the specified type later in the process.
+                content = self.unix_newlines(content)
+                content = self.strip_html(content)
+                content = self.replace_html_symbols(content)
+                content = self.cleanup_text(content)
+                content = self.specified_newlines(content)
+                tmpout = tmpout + content
+
+        # Prepend metadata
+        if metadata.author != None and metadata.author != '':
+            tmpout = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + tmpout
+        if metadata.title != None and metadata.title != '':
+            tmpout = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + tmpout
+
+            # Put two blank lines at end of file
+
+            end = tmpout[-3 * len(self.newline):]
+            for i in range(3 - end.count(self.newline)):
+                tmpout = tmpout + self.newline
+
+        os.remove(path)
+        with open(path, 'w+b') as out:
+            out.write(tmpout.encode('utf-8'))
+            
+    def strip_html(self, html):
+        stripped = u''
+        
+        for dom_tree in BeautifulSoup(html).findAll('body'):
+            text = unicode(dom_tree)
+            
+            # Remove unnecessary tags
+            for tag in ['script', 'style']:
+                text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
+            text = re.sub('<!--.*-->', '', text)
+
+            # Headings usually indicate Chapters.
+            # We are going to use a marker to insert the proper number of
+            # newline characters at the end of cleanup_text because cleanup_text
+            # removes excessive newlines (more than 2).
+            for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+                text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
+                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)
+
+            # Separate content with space.
+            for tag in ['td']:
+                text = re.sub('(?imu)</[ ]*%s[ ]*>', ' ', text)
+            
+            # Separate content with empty line.
+            for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
+                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
+            
+            for tag in ['hr', 'br']:
+                text = re.sub('(?imu)<[ ]*%s[ ]*/*?>' % tag, '\n\n', text)
+            
+            # Remove any tags that do not need special processing.
+            text = re.sub('<.*?>', '', text)
+            
+            stripped = stripped + text
+            
+        return stripped
+        
+    def replace_html_symbols(self, content):
+        for symbol in HTML_SYMBOLS:
+            for code in HTML_SYMBOLS[symbol]:
+                content = content.replace(code, symbol)
+        return content
+        
+    def cleanup_text(self, text):
+        # Replace bad characters.
+        text = text.replace(u'\xc2', '')
+        text = text.replace(u'\xa0', ' ')
+    
+        # Replace tabs, vertical tags and form feeds with single space.
+        #text = re.sub('\xc2\xa0', '', text)
+        text = text.replace('\t+', ' ')
+        text = text.replace('\v+', ' ')
+        text = text.replace('\f+', ' ')
+    
+        # Single line paragraph.
+        r = re.compile('.\n.')
+        while True:
+            mo = r.search(text)
+            if mo == None:
+                break
+            text = '%s %s' % (text[:mo.start()+1], text[mo.end()-1:])
+        
+        # Remove multiple spaces.
+        text = re.sub('[  ]+', ' ', text)
+        text = re.sub('(?imu)^[ ]+', '', text)
+        text = re.sub('(?imu)[ ]+$', '', text)
+        
+        # Remove excessive newlines.
+        text = re.sub('\n[ ]+\n', '\n\n', text)
+        text = re.sub('\n{3,}', '\n\n', text)
+        
+        # Replace markers with the proper characters.
+        text = text.replace('-vzxedxy-', '\n\n\n\n\n')
+        text = text.replace('-vlgzxey-', '\n\n\n')
+        
+        return text
+
+    def unix_newlines(self, text):
+        text = text.replace('\r\n', '\n')
+        text = text.replace('\r', '\n')
+        
+        return text
+        
+    def specified_newlines(self, text):
+        if self.newline == '\n':
+            return text
+        
+        return text.replace('\n', self.newline)
+        
+class TxtMetadata(object):
+    def __init__(self):
+        self.author = None
+        self.title = None
+        self.series = None
+        
+
+class TxtNewlines(object):
+    NEWLINE_TYPES = {
+                        'system'  : os.linesep,
+                        'unix'    : '\n',
+                        'old_mac' : '\r',
+                        'windows' : '\r\n'
+                     }
+                     
+    def __init__(self, newline_type):
+        self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)
+
+
+def config(defaults=None):
+    desc = _('Options to control the conversion to TXT')
+    if defaults is None:
+        c = Config('txt', desc)
+    else:
+        c = StringConfig(defaults, desc)
+        
+    txt = c.add_group('TXT', _('TXT options.'))
+            
+    txt('newline', ['--newline'], default='system',
+        help=_('Type of newline to use. Options are %s. Default is \'system\'. '
+            'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
+            'For Mac OS X use \'unix\'. \'system\' will default to the newline '
+            'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys())))
+    txt('prepend_author', ['--prepend-author'], default='true',
+        help=_('Write the author to the beginning of the file. '
+            'Default is \'true\'. Use \'false\' to disable.'))
+    txt('prepend_title', ['--prepend-title'], default='true',
+        help=_('Write the title to the beginning of the file. '
+            'Default is \'true\'. Use \'false\' to disable.'))
+        
+    return c
+
+def option_parser():
+    c = config()
+    parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
+    parser.add_option(
+        '-o', '--output', default=None, 
+        help=_('Output file. Default is derived from input filename.'))
+    parser.add_option(
+        '-v', '--verbose', default=0, action='count',
+        help=_('Useful for debugging.'))        
+    return parser
+
+def oeb2txt(opts, inpath):
+    logger = LoggingInterface(logging.getLogger('oeb2txt'))
+    logger.setup_cli_handler(opts.verbose)
+    
+    outpath = opts.output
+    if outpath is None:
+        outpath = os.path.basename(inpath)
+        outpath = os.path.splitext(outpath)[0] + '.txt'
+
+    mi = metadata_from_formats([inpath])
+    metadata = TxtMetadata()
+    if opts.prepend_author.lower() == 'true':
+        metadata.author = opts.authors if opts.authors else authors_to_string(mi.authors)
+    if opts.prepend_title.lower() == 'true':
+        metadata.title = opts.title if opts.title else mi.title
+
+    newline = TxtNewlines(opts.newline)
+    
+    writer = TXTWriter(newline.newline)
+    writer.dump(inpath, outpath, metadata)
+    run_plugins_on_postprocess(outpath, 'txt')
+    logger.log_info(_('Output written to ') + outpath)
+    
+def main(argv=sys.argv):
+    parser = option_parser()
+    opts, args = parser.parse_args(argv[1:])
+    if len(args) != 1:
+        parser.print_help()
+        return 1
+    inpath = args[0]
+    retval = oeb2txt(opts, inpath)
+    return retval
+
+if __name__ == '__main__':
+    sys.exit(main())
+

From 8d124f92d6ea9ac262542507330b6d19b9a0421c Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 21 Mar 2009 07:37:26 -0400
Subject: [PATCH 02/16] Only remove output file if it exists before writing to
 it in txt output

---
 src/calibre/ebooks/txt/writer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py
index 0fbf4a634c..84376ca2e7 100644
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -53,7 +53,8 @@ class TXTWriter(object):
             for i in range(3 - end.count(self.newline)):
                 tmpout = tmpout + self.newline
 
-        os.remove(path)
+        if os.path.exists(path):
+            os.remove(path)
         with open(path, 'w+b') as out:
             out.write(tmpout.encode('utf-8'))
             

From 11013c26657fe56b2581061a8243981dc3ff0d6a Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 21 Mar 2009 17:31:15 -0400
Subject: [PATCH 03/16] More html symbols

---
 src/calibre/ebooks/htmlsymbols.py | 91 +++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/src/calibre/ebooks/htmlsymbols.py b/src/calibre/ebooks/htmlsymbols.py
index 9b50f20fcd..fa10873845 100644
--- a/src/calibre/ebooks/htmlsymbols.py
+++ b/src/calibre/ebooks/htmlsymbols.py
@@ -119,6 +119,97 @@ HTML_SYMBOLS = {
                 u'♥' : ['&#9829;', '&hearts;'], # heart
                 u'♦' : ['&#9830;', '&diams;'], # diamond
                 # Extra http://www.ascii.cl/htmlcodes.htm
+                u' ' : ['&#32;'], # space
+                u'!' : ['&#33;'], # exclamation point
+                u'#' : ['&#35;'], # number sign
+                u'$' : ['&#36;'], # dollar sign
+                u'%' : ['&#37;'], # percent sign
+                u'\'' : ['&#39;'], # single quote
+                u'(' : ['&#40;'], # opening parenthesis
+                u')' : ['&#41;'], # closing parenthesis
+                u'*' : ['&#42;'], # asterisk
+                u'+' : ['&#43;'], # plus sign
+                u',' : ['&#44;'], # comma
+                u'-' : ['&#45;'], # minus sign - hyphen
+                u'.' : ['&#46;'], # period
+                u'/' : ['&#47;'], # slash
+                u'0' : ['&#48;'], # zero
+                u'1' : ['&#49;'], # one
+                u'2' : ['&#50;'], # two
+                u'3' : ['&#51;'], # three
+                u'4' : ['&#52;'], # four
+                u'5' : ['&#53;'], # five
+                u'6' : ['&#54;'], # six
+                u'7' : ['&#55;'], # seven
+                u'8' : ['&#56;'], # eight
+                u'9' : ['&#57;'], # nine
+                u':' : ['&#58;'], # colon
+                u';' : ['&#59;'], # semicolon
+                u'=' : ['&#61;'], # equal sign
+                u'?' : ['&#63;'], # question mark
+                u'@' : ['&#64;'], # at symbol
+                u'A' : ['&#65;'], # 
+                u'B' : ['&#66;'], # 
+                u'C' : ['&#67;'], # 
+                u'D' : ['&#68;'], # 
+                u'E' : ['&#69;'], # 
+                u'F' : ['&#70;'], # 
+                u'G' : ['&#71;'], # 
+                u'H' : ['&#72;'], # 
+                u'I' : ['&#73;'], # 
+                u'J' : ['&#74;'], # 
+                u'K' : ['&#75;'], # 
+                u'L' : ['&#76;'], # 
+                u'M' : ['&#77;'], # 
+                u'N' : ['&#78;'], # 
+                u'O' : ['&#79;'], # 
+                u'P' : ['&#80;'], # 
+                u'Q' : ['&#81;'], # 
+                u'R' : ['&#82;'], # 
+                u'S' : ['&#83;'], # 
+                u'T' : ['&#84;'], # 
+                u'U' : ['&#85;'], # 
+                u'V' : ['&#86;'], # 
+                u'W' : ['&#87;'], # 
+                u'X' : ['&#88;'], # 
+                u'Y' : ['&#89;'], # 
+                u'Z' : ['&#90;'], # 
+                u'[' : ['&#91;'], # opening bracket
+                u'\\' : ['&#92;'], # backslash
+                u']' : ['&#93;'], # closing bracket
+                u'^' : ['&#94;'], # caret - circumflex
+                u'_' : ['&#95;'], # underscore
+                u'`' : ['&#96;'], # grave accent
+                u'a' : ['&#97;'], # 
+                u'b' : ['&#98;'], # 
+                u'c' : ['&#99;'], # 
+                u'd' : ['&#100;'], # 
+                u'e' : ['&#101;'], # 
+                u'f' : ['&#102;'], # 
+                u'g' : ['&#103;'], # 
+                u'h' : ['&#104;'], # 
+                u'i' : ['&#105;'], # 
+                u'j' : ['&#106;'], # 
+                u'k' : ['&#107;'], # 
+                u'l' : ['&#108;'], # 
+                u'm' : ['&#109;'], # 
+                u'n' : ['&#110;'], # 
+                u'o' : ['&#111;'], # 
+                u'p' : ['&#112;'], # 
+                u'q' : ['&#113;'], # 
+                u'r' : ['&#114;'], # 
+                u's' : ['&#115;'], # 
+                u't' : ['&#116;'], # 
+                u'u' : ['&#117;'], # 
+                u'v' : ['&#118;'], # 
+                u'w' : ['&#119;'], # 
+                u'x' : ['&#120;'], # 
+                u'y' : ['&#121;'], # 
+                u'z' : ['&#122;'], # 
+                u'{' : ['&#123;'], # opening brace
+                u'|' : ['&#124;'], # vertical bar
+                u'}' : ['&#125;'], # closing brace
+                u'~' : ['&#126;'], # equivalency sign - tilde
                 u'<' : ['&#60;', '&lt;'], # less than sign
                 u'>' : ['&#62;', '&gt;'], # greater than sign
                 u'¡' : ['&#161;', '&iexcl;'], # inverted exclamation mark

From 94c5e717a15bf4faf59bf23ab74f7caf6fc161be Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 21 Mar 2009 17:58:53 -0400
Subject: [PATCH 04/16] Txt output: remove more tags, ensure no spaces at
 beginning and end of lines

---
 src/calibre/ebooks/txt/writer.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py
index 84376ca2e7..205d8423e3 100644
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -68,6 +68,9 @@ class TXTWriter(object):
             for tag in ['script', 'style']:
                 text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
             text = re.sub('<!--.*-->', '', text)
+            text = re.sub('<\?.*?\?>', '', text)
+            text = re.sub('<@.*?@>', '', text)
+            text = re.sub('<%.*?%>', '', text)
 
             # Headings usually indicate Chapters.
             # We are going to use a marker to insert the proper number of
@@ -107,7 +110,6 @@ class TXTWriter(object):
         text = text.replace(u'\xa0', ' ')
     
         # Replace tabs, vertical tags and form feeds with single space.
-        #text = re.sub('\xc2\xa0', '', text)
         text = text.replace('\t+', ' ')
         text = text.replace('\v+', ' ')
         text = text.replace('\f+', ' ')
@@ -122,8 +124,6 @@ class TXTWriter(object):
         
         # Remove multiple spaces.
         text = re.sub('[  ]+', ' ', text)
-        text = re.sub('(?imu)^[ ]+', '', text)
-        text = re.sub('(?imu)[ ]+$', '', text)
         
         # Remove excessive newlines.
         text = re.sub('\n[ ]+\n', '\n\n', text)
@@ -133,6 +133,10 @@ class TXTWriter(object):
         text = text.replace('-vzxedxy-', '\n\n\n\n\n')
         text = text.replace('-vlgzxey-', '\n\n\n')
         
+        # Replace spaces at the beginning and end of lines
+        text = re.sub('(?imu)^[ ]+', '', text)
+        text = re.sub('(?imu)[ ]+$', '', text)
+        
         return text
 
     def unix_newlines(self, text):

From 9abca9d60feb6896ef11c0da04a755fd24feb867 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 23 Mar 2009 19:07:14 -0400
Subject: [PATCH 05/16] Do not enable edit rows in device tab for devices that
 do not support editing ebook metadata.

---
 src/calibre/devices/interface.py    |  2 ++
 src/calibre/devices/usbms/driver.py |  1 +
 src/calibre/gui2/library.py         | 12 +++++++++---
 src/calibre/gui2/main.py            |  2 ++
 4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py
index ed51962236..21790e3c46 100644
--- a/src/calibre/devices/interface.py
+++ b/src/calibre/devices/interface.py
@@ -24,6 +24,8 @@ class Device(object):
     # it can be a list of the BCD numbers of all devices supported by this driver.
     BCD         = None
     THUMBNAIL_HEIGHT = 68 # Height for thumbnails on device
+    # Whether the metadata on books can be set via the GUI.
+    CAN_SET_METADATA = True
     
     def __init__(self, key='-1', log_packets=False, report_progress=None) :
         """ 
diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py
index 4285881447..68041a19cd 100644
--- a/src/calibre/devices/usbms/driver.py
+++ b/src/calibre/devices/usbms/driver.py
@@ -35,6 +35,7 @@ class USBMS(Device):
     EBOOK_DIR_MAIN = ''
     EBOOK_DIR_CARD = ''
     SUPPORTS_SUB_DIRS = False
+    CAN_SET_METADATA = False
 
     def __init__(self, key='-1', log_packets=False, report_progress=None):
         Device.__init__(self, key=key, log_packets=log_packets,
diff --git a/src/calibre/gui2/library.py b/src/calibre/gui2/library.py
index d7581bf458..9f82b3b318 100644
--- a/src/calibre/gui2/library.py
+++ b/src/calibre/gui2/library.py
@@ -708,6 +708,9 @@ class BooksView(TableView):
 
     def close(self):
         self._model.close()
+        
+    def set_editable(self, editable):
+        self._model.set_editable(editable)
 
     def connect_to_search_box(self, sb):
         QObject.connect(sb, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'),
@@ -785,7 +788,7 @@ class DeviceBooksModel(BooksModel):
         self.unknown = str(self.trUtf8('Unknown'))
         self.marked_for_deletion = {}
         self.search_engine = OnDeviceSearch(self)
-
+        self.editable = True
 
     def mark_for_deletion(self, job, rows):
         self.marked_for_deletion[job] = self.indices(rows)
@@ -793,7 +796,6 @@ class DeviceBooksModel(BooksModel):
             indices = self.row_indices(row)
             self.emit(SIGNAL('dataChanged(QModelIndex, QModelIndex)'), indices[0], indices[-1])
 
-
     def deletion_done(self, job, succeeded=True):
         if not self.marked_for_deletion.has_key(job):
             return
@@ -818,7 +820,7 @@ class DeviceBooksModel(BooksModel):
         if self.map[index.row()] in self.indices_to_be_deleted():
             return Qt.ItemIsUserCheckable  # Can't figure out how to get the disabled flag in python
         flags = QAbstractTableModel.flags(self, index)
-        if index.isValid():
+        if index.isValid() and self.editable:
             if index.column() in [0, 1] or (index.column() == 4 and self.db.supports_tags()):
                 flags |= Qt.ItemIsEditable
         return flags
@@ -999,6 +1001,10 @@ class DeviceBooksModel(BooksModel):
                 self.sort(col, self.sorted_on[1])
             done = True
         return done
+        
+    def set_editable(self, editable):
+        self.editable = editable
+        
 
 class SearchBox(QLineEdit):
 
diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py
index 76775ae9bf..f297d1465c 100644
--- a/src/calibre/gui2/main.py
+++ b/src/calibre/gui2/main.py
@@ -585,7 +585,9 @@ class Main(MainWindow, Ui_MainWindow):
             return
         mainlist, cardlist = job.result
         self.memory_view.set_database(mainlist)
+        self.memory_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA)
         self.card_view.set_database(cardlist)
+        self.card_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA)
         for view in (self.memory_view, self.card_view):
             view.sortByColumn(3, Qt.DescendingOrder)
             if not view.restore_column_widths():

From 4579b1057130ae27d6f9b312b3a94ab1a1e86107 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 28 Mar 2009 14:39:17 -0400
Subject: [PATCH 06/16] PDF merging utility

---
 src/calibre/ebooks/pdf/pdfmerge.py | 94 ++++++++++++++++++++++++++++++
 src/calibre/linux.py               |  3 +-
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 src/calibre/ebooks/pdf/pdfmerge.py

diff --git a/src/calibre/ebooks/pdf/pdfmerge.py b/src/calibre/ebooks/pdf/pdfmerge.py
new file mode 100644
index 0000000000..e8554dbc6b
--- /dev/null
+++ b/src/calibre/ebooks/pdf/pdfmerge.py
@@ -0,0 +1,94 @@
+'''
+Merge PDF files into a single PDF document.
+'''
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os, sys, re
+
+from calibre.ebooks.metadata.meta import metadata_from_formats
+from calibre.ebooks.metadata import authors_to_string
+from calibre.utils.config import Config, StringConfig
+
+from pyPdf import PdfFileWriter, PdfFileReader
+
+def config(defaults=None):
+    desc = _('Options to control the transformation of pdf')
+    default_crop=10
+    if defaults is None:
+        c = Config('trimpdf', desc)
+    else:
+        c = StringConfig(defaults, desc)
+    c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
+          help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
+    c.add_opt('output', ['-o', '--output'],default='merged.pdf',
+          help=_('Path to output file. By default a file is created in the current directory.'))
+    return c
+
+def option_parser():
+    c = config()
+    return c.option_parser(usage=_('''\
+	%prog [options] file1.pdf file2.pdf ...
+
+	Merges individual pdfs. Metadata will be used from the first PDF specified.
+	'''))
+
+def merge_files(in_paths, out_path, metadata=None):
+    if metadata == None:
+        title = _('Unknown')
+        author = _('Unknown')
+    else:
+        title = metadata.title
+        author = authors_to_string(metadata.authors)
+
+    out_pdf = PdfFileWriter(title=title, author=author)
+
+    for pdf_path in in_paths:
+        pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
+        for page in pdf.pages:
+            out_pdf.addPage(page)
+
+    with open(out_path, 'wb') as out_file:
+        out_pdf.write(out_file)
+    
+def verify_files(files):
+    invalid = []
+
+    for pdf_path in files:
+        try:
+            with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
+                pdf = PdfFileReader(pdf_file)
+                if pdf.isEncrypted or pdf.numPages <= 0:
+                    raise Exception
+        except:
+            invalid.append(pdf_path)
+    return invalid
+
+def main(args=sys.argv):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    args = args[1:]
+    
+    if len(args) < 2:
+        print 'Error: Two or more PDF files are required.\n\n'
+        print parser.get_usage()
+        return 2
+    
+    bad_pdfs = verify_files(args)
+    if bad_pdfs != []:
+        for pdf in bad_pdfs:
+            print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf
+        return 2
+        
+    mi = metadata_from_formats([args[0]])
+
+    merge_files(args, opts.output, mi)
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
+
diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index 15dcb6fed9..c7a6099623 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -39,7 +39,8 @@ entry_points = {
              'calibre-fontconfig = calibre.utils.fontconfig:main',
              'calibre-parallel   = calibre.parallel:main',
              'calibre-customize  = calibre.customize.ui:main',
-             'pdftrim            = calibre.ebooks.pdf.pdftrim:main' ,
+             'pdftrim            = calibre.ebooks.pdf.pdftrim:main',
+             'pdfmerge           = calibre.ebooks.pdf.pdfmerge:main',
              'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main',
         ],
         'gui_scripts'    : [

From a5228d56d2b04fedba4c77791ac2893c5bd1c6b7 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 28 Mar 2009 21:57:42 -0400
Subject: [PATCH 07/16] Add PDF splitting utility

---
 src/calibre/ebooks/pdf/pdfmerge.py |   6 +-
 src/calibre/ebooks/pdf/pdfsplit.py | 189 +++++++++++++++++++++++++++++
 src/calibre/linux.py               |   1 +
 3 files changed, 193 insertions(+), 3 deletions(-)
 create mode 100644 src/calibre/ebooks/pdf/pdfsplit.py

diff --git a/src/calibre/ebooks/pdf/pdfmerge.py b/src/calibre/ebooks/pdf/pdfmerge.py
index e8554dbc6b..4a741c4f5a 100644
--- a/src/calibre/ebooks/pdf/pdfmerge.py
+++ b/src/calibre/ebooks/pdf/pdfmerge.py
@@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import os, sys, re
+import os, sys
 
 from calibre.ebooks.metadata.meta import metadata_from_formats
 from calibre.ebooks.metadata import authors_to_string
@@ -24,7 +24,7 @@ def config(defaults=None):
         c = StringConfig(defaults, desc)
     c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
           help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
-    c.add_opt('output', ['-o', '--output'],default='merged.pdf',
+    c.add_opt('output', ['-o', '--output'], default='merged.pdf',
           help=_('Path to output file. By default a file is created in the current directory.'))
     return c
 
@@ -33,7 +33,7 @@ def option_parser():
     return c.option_parser(usage=_('''\
 	%prog [options] file1.pdf file2.pdf ...
 
-	Merges individual pdfs. Metadata will be used from the first PDF specified.
+	Merges individual PDFs. Metadata will be used from the first PDF specified.
 	'''))
 
 def merge_files(in_paths, out_path, metadata=None):
diff --git a/src/calibre/ebooks/pdf/pdfsplit.py b/src/calibre/ebooks/pdf/pdfsplit.py
new file mode 100644
index 0000000000..460dbef148
--- /dev/null
+++ b/src/calibre/ebooks/pdf/pdfsplit.py
@@ -0,0 +1,189 @@
+'''
+Split PDF file into multiple PDF documents.
+'''
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os, sys, re
+
+from calibre.ebooks.metadata.meta import metadata_from_formats
+from calibre.ebooks.metadata import authors_to_string
+from calibre.utils.config import Config, StringConfig
+
+from pyPdf import PdfFileWriter, PdfFileReader
+
+def config(defaults=None):
+    desc = _('Options to control the transformation of pdf')
+    default_crop=10
+    if defaults is None:
+        c = Config('trimpdf', desc)
+    else:
+        c = StringConfig(defaults, desc)
+    c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
+          help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
+    c.add_opt('output', ['-o', '--output'], default='split.pdf',
+          help=_('Path to output file. By default a file is created in the current directory. \
+            The file name will be the base name for the output.'))
+    return c
+
+def option_parser():
+    c = config()
+    return c.option_parser(usage=_('''\
+    
+	%prog [options] file.pdf page_to_split_on ...
+	%prog [options] file.pdf page_range_to_split_on ...
+	
+	Ex.
+	
+	%prog file.pdf 6
+	%prog file.pdf 6-12
+	%prog file.pdf 6-12 8 10 9-20
+
+	Split a PDF.
+	'''))
+
+def split_pdf(in_path, pages, page_ranges, out_name, metadata=None):
+    pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb'))
+    total_pages = pdf.numPages - 1
+
+    for index in pages+page_ranges:
+        if index in pages:
+            write_pdf(pdf, out_name, '%s' % (index + 1), index, total_pages, metadata)
+        else:
+            
+            write_pdf(pdf, out_name, '%s-%s' % (index[0] + 1, index[1] + 1), index[0], index[1], metadata)
+        
+def write_pdf(pdf, name, suffix, start, end, metadata=None):
+    if metadata == None:
+        title = _('Unknown')
+        author = _('Unknown')
+    else:
+        title = metadata.title
+        author = authors_to_string(metadata.authors)
+    
+    out_pdf = PdfFileWriter(title=title, author=author)
+    for page_num in range(start, end + 1):
+        out_pdf.addPage(pdf.getPage(page_num))
+    with open('%s%s.pdf' % (name, suffix), 'wb') as out_file:
+        out_pdf.write(out_file)
+    
+def split_args(args):
+    pdf = ''
+    pages = []
+    page_ranges = []
+    bad = []
+
+    for arg in args:
+        arg = arg.strip()
+        # Find the pdf input
+        if re.search('(?iu)^.*?\.pdf[ ]*$', arg) != None:
+            if pdf == '':
+                pdf = arg
+            else:
+                bad.append(arg)
+        # Find single indexes
+        elif re.search('^[ ]*\d+[ ]*$', arg) != None:
+            pages.append(arg)
+        # Find index ranges
+        elif re.search('^[ ]*\d+[ ]*-[ ]*\d+[ ]*$', arg) != None:
+            mo = re.search('^[ ]*(?P<start>\d+)[ ]*-[ ]*(?P<end>\d+)[ ]*$', arg)
+            start = mo.group('start')
+            end = mo.group('end')
+            
+            # check to see if the range is really a single index
+            if start == end:
+                pages.append(start)
+            else:
+                page_ranges.append([start, end])
+        else:
+            bad.append(arg)
+        
+    bad = sorted(list(set(bad)))
+    
+    return pdf, pages, page_ranges, bad
+
+# Remove duplicates from pages and page_ranges.
+# Set pages higher than the total number of pages in the pdf to the last page.
+# Return pages and page_ranges as lists of ints.
+def clean_page_list(pdf_path, pages, page_ranges):
+    pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
+    
+    total_pages = pdf.numPages
+    sorted_pages = []
+    sorted_ranges = []
+
+    for index in pages:
+        index = int(index)
+        if index > total_pages:
+            sorted_pages.append(total_pages - 1)
+        else:
+            sorted_pages.append(index - 1)
+    
+    for start, end in page_ranges:
+        start = int(start)
+        end = int(end)
+        
+        if start > total_pages and end > total_pages:
+            sorted_pages.append(total_pages - 1)
+            continue
+            
+        if start > total_pages:
+            start = total_pages
+        if end > total_pages:
+            end = total_pages
+        page_range = sorted([start - 1, end - 1])
+        if page_range not in sorted_ranges:
+            sorted_ranges.append(page_range)
+    
+    # Remove duplicates and sort
+    pages = sorted(list(set(sorted_pages)))
+    page_ranges = sorted(sorted_ranges)
+    
+    return pages, page_ranges
+
+# Return True if the pdf is valid.
+def valid_pdf(pdf_path):
+    try:
+        with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
+            pdf = PdfFileReader(pdf_file)
+            if pdf.isEncrypted or pdf.numPages <= 0:
+                raise Exception
+    except:
+        return False
+    return True
+
+def main(args=sys.argv):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    
+    pdf, pages, page_ranges, unknown = split_args(args[1:])
+    
+    # Both a PDF and at least one page or page range are required.
+    if pdf == '' or (pages == [] and page_ranges == []):
+        print 'Error: PDF and where to split is required.\n\n'
+        print parser.get_usage()
+        return 2
+    
+    if unknown != []:
+        for arg in unknown:
+            print 'Error: Unknown argument `%s`' % arg
+        print parser.get_usage()
+        return 2
+    
+    if not valid_pdf(pdf):
+        print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf
+        return 2
+        
+    pages, page_ranges = clean_page_list(pdf, pages, page_ranges)
+        
+    mi = metadata_from_formats([pdf])
+
+    split_pdf(pdf, pages, page_ranges, os.path.splitext(opts.output)[0], mi)
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
+
diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index c7a6099623..3ba6f55bc8 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -41,6 +41,7 @@ entry_points = {
              'calibre-customize  = calibre.customize.ui:main',
              'pdftrim            = calibre.ebooks.pdf.pdftrim:main',
              'pdfmerge           = calibre.ebooks.pdf.pdfmerge:main',
+             'pdfsplit           = calibre.ebooks.pdf.pdfsplit:main',
              'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main',
         ],
         'gui_scripts'    : [

From ffa5f36fae29af536d8eb4a8eb8082eefc917f86 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 29 Mar 2009 09:10:47 -0400
Subject: [PATCH 08/16] bzr 'command sub-command' style wrapper for pdf
 manipulation

---
 src/calibre/ebooks/pdf/manipulate.py          | 67 +++++++++++++++++++
 .../ebooks/pdf/{pdfmerge.py => merge.py}      | 13 ++--
 .../ebooks/pdf/{pdfsplit.py => split.py}      | 21 +++---
 .../ebooks/pdf/{pdftrim.py => trim.py}        | 10 +--
 src/calibre/linux.py                          |  5 +-
 5 files changed, 89 insertions(+), 27 deletions(-)
 create mode 100644 src/calibre/ebooks/pdf/manipulate.py
 rename src/calibre/ebooks/pdf/{pdfmerge.py => merge.py} (92%)
 rename src/calibre/ebooks/pdf/{pdfsplit.py => split.py} (93%)
 rename src/calibre/ebooks/pdf/{pdftrim.py => trim.py} (95%)

diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py
new file mode 100644
index 0000000000..0e75734bb9
--- /dev/null
+++ b/src/calibre/ebooks/pdf/manipulate.py
@@ -0,0 +1,67 @@
+'''
+Command line interface to run pdf manipulation commands.
+'''
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import string, sys
+
+from calibre.utils.config import Config, StringConfig
+from calibre.ebooks.pdf import merge, split, trim
+
+COMMANDS = {
+             'merge' : merge,
+             'split' : split,
+             'trim'  : trim,
+           }
+
+def config(defaults=None):
+    desc = _('Options to control the transformation of pdf')
+    if defaults is None:
+        c = Config('trimpdf', desc)
+    else:
+        c = StringConfig(defaults, desc)
+    return c
+
+def option_parser():
+    c = config()
+    return c.option_parser(usage=_('''\
+    
+	%prog command ...
+	
+	command can be one of the following:
+	[%%commands]
+	
+	Use %prog command --help to get more information about a specific command
+	
+	Manipulate a PDF.
+	'''.replace('%%commands', string.join(sorted(COMMANDS.keys()), ', '))))
+
+def main(args=sys.argv):
+    parser = option_parser()
+
+    if len(args) < 2:
+        print 'Error: No command sepecified.\n'
+        print parser.get_usage()
+        return 2
+    
+    command = args[1].lower().strip()
+    
+    if command in COMMANDS.keys():    
+        del args[1]
+        return COMMANDS[command].main(args, command)
+    else:
+        parser.parse_args(args)
+        print 'Unknown command %s.\n' % command
+        print parser.get_usage()
+        return 2
+    
+    # We should never get here.
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
+
diff --git a/src/calibre/ebooks/pdf/pdfmerge.py b/src/calibre/ebooks/pdf/merge.py
similarity index 92%
rename from src/calibre/ebooks/pdf/pdfmerge.py
rename to src/calibre/ebooks/pdf/merge.py
index 4a741c4f5a..7ae35d1065 100644
--- a/src/calibre/ebooks/pdf/pdfmerge.py
+++ b/src/calibre/ebooks/pdf/merge.py
@@ -17,9 +17,8 @@ from pyPdf import PdfFileWriter, PdfFileReader
 
 def config(defaults=None):
     desc = _('Options to control the transformation of pdf')
-    default_crop=10
     if defaults is None:
-        c = Config('trimpdf', desc)
+        c = Config('mergepdf', desc)
     else:
         c = StringConfig(defaults, desc)
     c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
@@ -28,13 +27,13 @@ def config(defaults=None):
           help=_('Path to output file. By default a file is created in the current directory.'))
     return c
 
-def option_parser():
+def option_parser(name):
     c = config()
     return c.option_parser(usage=_('''\
-	%prog [options] file1.pdf file2.pdf ...
+	%prog %%name [options] file1.pdf file2.pdf ...
 
 	Merges individual PDFs. Metadata will be used from the first PDF specified.
-	'''))
+	'''.replace('%%name', name)))
 
 def merge_files(in_paths, out_path, metadata=None):
     if metadata == None:
@@ -67,8 +66,8 @@ def verify_files(files):
             invalid.append(pdf_path)
     return invalid
 
-def main(args=sys.argv):
-    parser = option_parser()
+def main(args=sys.argv, name=''):
+    parser = option_parser(name)
     opts, args = parser.parse_args(args)
     args = args[1:]
     
diff --git a/src/calibre/ebooks/pdf/pdfsplit.py b/src/calibre/ebooks/pdf/split.py
similarity index 93%
rename from src/calibre/ebooks/pdf/pdfsplit.py
rename to src/calibre/ebooks/pdf/split.py
index 460dbef148..36517fb704 100644
--- a/src/calibre/ebooks/pdf/pdfsplit.py
+++ b/src/calibre/ebooks/pdf/split.py
@@ -17,9 +17,8 @@ from pyPdf import PdfFileWriter, PdfFileReader
 
 def config(defaults=None):
     desc = _('Options to control the transformation of pdf')
-    default_crop=10
     if defaults is None:
-        c = Config('trimpdf', desc)
+        c = Config('splitpdf', desc)
     else:
         c = StringConfig(defaults, desc)
     c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
@@ -29,21 +28,21 @@ def config(defaults=None):
             The file name will be the base name for the output.'))
     return c
 
-def option_parser():
+def option_parser(name):
     c = config()
     return c.option_parser(usage=_('''\
     
-	%prog [options] file.pdf page_to_split_on ...
-	%prog [options] file.pdf page_range_to_split_on ...
+	%prog %%name [options] file.pdf page_to_split_on ...
+	%prog %%name [options] file.pdf page_range_to_split_on ...
 	
 	Ex.
 	
-	%prog file.pdf 6
-	%prog file.pdf 6-12
-	%prog file.pdf 6-12 8 10 9-20
+	%prog %%name file.pdf 6
+	%prog %%name file.pdf 6-12
+	%prog %%name file.pdf 6-12 8 10 9-20
 
 	Split a PDF.
-	'''))
+	'''.replace('%%name', name)))
 
 def split_pdf(in_path, pages, page_ranges, out_name, metadata=None):
     pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb'))
@@ -155,8 +154,8 @@ def valid_pdf(pdf_path):
         return False
     return True
 
-def main(args=sys.argv):
-    parser = option_parser()
+def main(args=sys.argv, name=''):
+    parser = option_parser(name)
     opts, args = parser.parse_args(args)
     
     pdf, pages, page_ranges, unknown = split_args(args[1:])
diff --git a/src/calibre/ebooks/pdf/pdftrim.py b/src/calibre/ebooks/pdf/trim.py
similarity index 95%
rename from src/calibre/ebooks/pdf/pdftrim.py
rename to src/calibre/ebooks/pdf/trim.py
index c1e8fa2494..c999d24a46 100644
--- a/src/calibre/ebooks/pdf/pdftrim.py
+++ b/src/calibre/ebooks/pdf/trim.py
@@ -33,16 +33,16 @@ def config(defaults=None):
     return c
 
 
-def option_parser():
+def option_parser(name):
     c = config()
     return c.option_parser(usage=_('''\
-	%prog [options] file.pdf
+	%prog %%name [options] file.pdf
 
 	Crops a pdf. 
-	'''))
+	'''.replace('%%name', name)))
 
-def main(args=sys.argv):
-    parser = option_parser()
+def main(args=sys.argv, name=''):
+    parser = option_parser(name)
     opts, args = parser.parse_args(args)
     try:
         source = os.path.abspath(args[1])
diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index 3ba6f55bc8..6bfe665557 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -39,10 +39,7 @@ entry_points = {
              'calibre-fontconfig = calibre.utils.fontconfig:main',
              'calibre-parallel   = calibre.parallel:main',
              'calibre-customize  = calibre.customize.ui:main',
-             'pdftrim            = calibre.ebooks.pdf.pdftrim:main',
-             'pdfmerge           = calibre.ebooks.pdf.pdfmerge:main',
-             'pdfsplit           = calibre.ebooks.pdf.pdfsplit:main',
-             'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main',
+             'pdfmanipulate      = calibre.ebooks.pdf.manipulate:main',
         ],
         'gui_scripts'    : [
             __appname__+' = calibre.gui2.main:main',

From 9a81882d4f9b306289ffb0dd564e2a1f2f006f9e Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 29 Mar 2009 09:34:43 -0400
Subject: [PATCH 09/16] Remove unnecessary options from pdf manipulation
 routines

---
 src/calibre/ebooks/pdf/manipulate.py | 2 +-
 src/calibre/ebooks/pdf/merge.py      | 2 --
 src/calibre/ebooks/pdf/split.py      | 2 --
 src/calibre/ebooks/pdf/trim.py       | 2 --
 4 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py
index 0e75734bb9..15c9404e25 100644
--- a/src/calibre/ebooks/pdf/manipulate.py
+++ b/src/calibre/ebooks/pdf/manipulate.py
@@ -21,7 +21,7 @@ COMMANDS = {
 def config(defaults=None):
     desc = _('Options to control the transformation of pdf')
     if defaults is None:
-        c = Config('trimpdf', desc)
+        c = Config('manipulatepdf', desc)
     else:
         c = StringConfig(defaults, desc)
     return c
diff --git a/src/calibre/ebooks/pdf/merge.py b/src/calibre/ebooks/pdf/merge.py
index 7ae35d1065..c0385080ad 100644
--- a/src/calibre/ebooks/pdf/merge.py
+++ b/src/calibre/ebooks/pdf/merge.py
@@ -21,8 +21,6 @@ def config(defaults=None):
         c = Config('mergepdf', desc)
     else:
         c = StringConfig(defaults, desc)
-    c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
-          help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
     c.add_opt('output', ['-o', '--output'], default='merged.pdf',
           help=_('Path to output file. By default a file is created in the current directory.'))
     return c
diff --git a/src/calibre/ebooks/pdf/split.py b/src/calibre/ebooks/pdf/split.py
index 36517fb704..cc6965dd68 100644
--- a/src/calibre/ebooks/pdf/split.py
+++ b/src/calibre/ebooks/pdf/split.py
@@ -21,8 +21,6 @@ def config(defaults=None):
         c = Config('splitpdf', desc)
     else:
         c = StringConfig(defaults, desc)
-    c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
-          help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
     c.add_opt('output', ['-o', '--output'], default='split.pdf',
           help=_('Path to output file. By default a file is created in the current directory. \
             The file name will be the base name for the output.'))
diff --git a/src/calibre/ebooks/pdf/trim.py b/src/calibre/ebooks/pdf/trim.py
index c999d24a46..b32312fee8 100644
--- a/src/calibre/ebooks/pdf/trim.py
+++ b/src/calibre/ebooks/pdf/trim.py
@@ -16,8 +16,6 @@ def config(defaults=None):
         c = Config('trimpdf', desc)
     else:
         c = StringConfig(defaults, desc)
-    c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
-          help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
     c.add_opt('output', ['-o', '--output'],default='cropped.pdf',
           help=_('Path to output file. By default a file is created in the current directory.'))
     c.add_opt('bottom_left_x', [ '-x', '--leftx'], default=default_crop,

From 9e15e485883c6b589967a5ebe4d9f8bc58ae0982 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 29 Mar 2009 10:18:29 -0400
Subject: [PATCH 10/16] PDF info command

---
 src/calibre/ebooks/pdf/info.py       | 89 ++++++++++++++++++++++++++++
 src/calibre/ebooks/pdf/manipulate.py |  3 +-
 2 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 src/calibre/ebooks/pdf/info.py

diff --git a/src/calibre/ebooks/pdf/info.py b/src/calibre/ebooks/pdf/info.py
new file mode 100644
index 0000000000..46f1f11681
--- /dev/null
+++ b/src/calibre/ebooks/pdf/info.py
@@ -0,0 +1,89 @@
+'''
+Print information about one or more PDF files.
+'''
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os, re, sys, time
+
+from calibre.utils.config import Config, StringConfig
+
+from pyPdf import PdfFileWriter, PdfFileReader
+
+
+def config(defaults=None):
+    desc = _('Options to control the transformation of pdf')
+    if defaults is None:
+        c = Config('manipulatepdf', desc)
+    else:
+        c = StringConfig(defaults, desc)
+    return c
+
+def option_parser(name):
+    c = config()
+    return c.option_parser(usage=_('''\
+	%prog %%name [options] file.pdf ...
+
+	Get info about a PDF.
+	'''.replace('%%name', name)))
+
+def print_info(pdf_path):
+    with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
+        pdf = PdfFileReader(pdf_file)
+        print _('Title:                 %s' % pdf.documentInfo.title)
+        print _('Author:                %s' % pdf.documentInfo.author)
+        print _('Creator:               %s' % pdf.documentInfo.creator)
+        print _('Producer:              %s' % pdf.documentInfo.producer)
+        print _('Creation Date:         %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getctime(pdf_path))))
+        print _('Modification Date:     %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getmtime(pdf_path))))
+        print _('Pages:                 %s' % pdf.numPages)
+        print _('Encrypted:             %s' % pdf.isEncrypted)
+        try:
+            print _('File Size:             %s bytes' % os.path.getsize(pdf_path))
+        except: pass
+        try:
+            pdf_file.seek(0)
+            vline = pdf_file.readline()
+            mo = re.search('(?iu)^%...-(?P<version>\d+\.\d+)', vline)
+            if mo != None:
+                print _('PDF Version:           %s' % mo.group('version'))
+        except: pass
+
+def verify_files(files):
+    invalid = []
+
+    for pdf_path in files:
+        try:
+            with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
+                pdf = PdfFileReader(pdf_file)
+        except:
+            invalid.append(pdf_path)
+    return invalid
+
+def main(args=sys.argv, name=''):
+    parser = option_parser(name)
+    opts, args = parser.parse_args(args)
+    args = args[1:]
+    
+    if len(args) < 1:
+        print 'Error: No PDF sepecified.\n'
+        print parser.get_usage()
+        return 2
+    
+    bad_pdfs = verify_files(args)
+    if bad_pdfs != []:
+        for pdf in bad_pdfs:
+            print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf
+        return 2
+        
+    for pdf in args:
+        print_info(pdf)
+    
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
+
diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py
index 15c9404e25..262aaf78d4 100644
--- a/src/calibre/ebooks/pdf/manipulate.py
+++ b/src/calibre/ebooks/pdf/manipulate.py
@@ -10,9 +10,10 @@ __docformat__ = 'restructuredtext en'
 import string, sys
 
 from calibre.utils.config import Config, StringConfig
-from calibre.ebooks.pdf import merge, split, trim
+from calibre.ebooks.pdf import info, merge, split, trim
 
 COMMANDS = {
+             'info'  : info,
              'merge' : merge,
              'split' : split,
              'trim'  : trim,

From 1ed9efeb3904075310e05d3c18b1475617428f19 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 29 Mar 2009 10:20:21 -0400
Subject: [PATCH 11/16] Added subject to pdf info command

---
 src/calibre/ebooks/pdf/info.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/calibre/ebooks/pdf/info.py b/src/calibre/ebooks/pdf/info.py
index 46f1f11681..115e411ce4 100644
--- a/src/calibre/ebooks/pdf/info.py
+++ b/src/calibre/ebooks/pdf/info.py
@@ -35,6 +35,7 @@ def print_info(pdf_path):
         pdf = PdfFileReader(pdf_file)
         print _('Title:                 %s' % pdf.documentInfo.title)
         print _('Author:                %s' % pdf.documentInfo.author)
+        print _('Subject:               %s' % pdf.documentInfo.subject)
         print _('Creator:               %s' % pdf.documentInfo.creator)
         print _('Producer:              %s' % pdf.documentInfo.producer)
         print _('Creation Date:         %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getctime(pdf_path))))

From 87580e27ba9e270e1c104b32b9fdc6d0b41fd283 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 30 Mar 2009 19:03:49 -0400
Subject: [PATCH 12/16] TXT metadata reader

---
 src/calibre/customize/builtins.py  | 10 ++++++++++
 src/calibre/ebooks/metadata/txt.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 src/calibre/ebooks/metadata/txt.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index d37e241891..2cbf036c1f 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -159,6 +159,16 @@ class ODTMetadataReader(MetadataReaderPlugin):
     def get_metadata(self, stream, ftype):
         from calibre.ebooks.metadata.odt import get_metadata
         return get_metadata(stream)
+        
+class TXTMetadataReader(MetaReaderPlugin):
+    
+    name        = 'Read TXT metadata'
+    file_types  = set(['txt'])
+    description = _('Read metadata from %s files') % 'TXT'
+    
+    def get_metadata(self, stream, ftype):
+        from calibre.ebooks.metadata.txt import get_metadata
+        return get_metadata(stream)
 
 class LRXMetadataReader(MetadataReaderPlugin):
     
diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py
new file mode 100644
index 0000000000..5a5ab13ae9
--- /dev/null
+++ b/src/calibre/ebooks/metadata/txt.py
@@ -0,0 +1,30 @@
+'''Read meta information from TXT files'''
+
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+
+import re
+
+from calibre.ebooks.metadata import MetaInformation
+
+def get_metadata(stream, extract_cover=True):
+    """ Return metadata as a L{MetaInformation} object """
+    mi = MetaInformation(_('Unknown'), [_('Unknown')])
+    stream.seek(0)
+
+    mdata = ''
+    for x in range(0, 4):
+        line = stream.readline()
+        if line == '':
+            break
+        else:
+            mdata += line
+    
+    mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*\n\n\n[ ]*(?P<author>.+)[ ]*\n$', mdata)
+    if mo != None:
+        mi.title = mo.group('title')
+        mi.authors = mo.group('author').split(',')
+
+    return mi

From 90362ab56ae0594651571117c0e934e108c7b877 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 31 Mar 2009 18:41:49 -0400
Subject: [PATCH 13/16] txt output now uses new conversion pipeline

---
 src/calibre/customize/builtins.py        |   5 +-
 src/calibre/ebooks/conversion/plumber.py |   2 +-
 src/calibre/ebooks/metadata/txt.py       |   2 +-
 src/calibre/ebooks/txt/from_any.py       |  74 -------------
 src/calibre/ebooks/txt/output.py         |  62 +++++++++++
 src/calibre/ebooks/txt/writer.py         | 130 ++++-------------------
 6 files changed, 90 insertions(+), 185 deletions(-)
 delete mode 100644 src/calibre/ebooks/txt/from_any.py
 create mode 100644 src/calibre/ebooks/txt/output.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 2cbf036c1f..acc7ba71ec 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -160,7 +160,7 @@ class ODTMetadataReader(MetadataReaderPlugin):
         from calibre.ebooks.metadata.odt import get_metadata
         return get_metadata(stream)
         
-class TXTMetadataReader(MetaReaderPlugin):
+class TXTMetadataReader(MetadataReaderPlugin):
     
     name        = 'Read TXT metadata'
     file_types  = set(['txt'])
@@ -266,9 +266,10 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
 from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
 from calibre.ebooks.oeb.output import OEBOutput
+from calibre.ebooks.txt.output import TXTOutput
 from calibre.customize.profiles import input_profiles, output_profiles
 
-plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput]
+plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                         x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 5393aaf034..da41423750 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -195,7 +195,7 @@ OptionRecommendation(name='language',
         self.input_fmt = input_fmt
         self.output_fmt = output_fmt
 
-        # Build set of all possible options. Two options are equal iff their
+        # Build set of all possible options. Two options are equal if their
         # names are the same.
         self.input_options  = self.input_plugin.options.union(
                                     self.input_plugin.common_options)
diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py
index 5a5ab13ae9..6283c72256 100644
--- a/src/calibre/ebooks/metadata/txt.py
+++ b/src/calibre/ebooks/metadata/txt.py
@@ -22,7 +22,7 @@ def get_metadata(stream, extract_cover=True):
         else:
             mdata += line
     
-    mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*\n\n\n[ ]*(?P<author>.+)[ ]*\n$', mdata)
+    mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata)
     if mo != None:
         mi.title = mo.group('title')
         mi.authors = mo.group('author').split(',')
diff --git a/src/calibre/ebooks/txt/from_any.py b/src/calibre/ebooks/txt/from_any.py
deleted file mode 100644
index caf5364c3c..0000000000
--- a/src/calibre/ebooks/txt/from_any.py
+++ /dev/null
@@ -1,74 +0,0 @@
-'''
-Convert any ebook format to TXT.
-'''
-
-from __future__ import with_statement
-
-__license__   = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \
-    'and Marshall T. Vandegrift <llasram@gmail.com>' \
-    'and John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
-
-import sys, os, glob, logging
-
-from calibre.ebooks.epub.from_any import any2epub, formats, USAGE
-from calibre.ebooks.epub import config as common_config
-from calibre.ptempfile import TemporaryDirectory
-from calibre.ebooks.txt.writer import oeb2txt, config as txt_config
-
-def config(defaults=None):
-    c = common_config(defaults=defaults, name='txt')
-    c.remove_opt('profile')
-    del c.option_set.groups['metadata']
-    del c.option_set.groups['traversal']
-    del c.option_set.groups['structure detection']
-    del c.option_set.groups['toc']
-    del c.option_set.groups['page layout']
-    txtc = txt_config(defaults=defaults)
-    c.update(txtc)
-    return c 
-
-def option_parser(usage=USAGE):
-    usage = usage % ('TXT', formats())
-    parser = config().option_parser(usage=usage)
-    return parser
-
-def any2txt(opts, path, notification=None):
-    ext = os.path.splitext(path)[1]
-    if not ext:
-        raise ValueError('Unknown file type: '+path)
-    ext = ext.lower()[1:]
-    
-    if opts.output is None:
-        opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt'
-    
-    opts.output = os.path.abspath(opts.output)
-    orig_output = opts.output
-    
-    with TemporaryDirectory('_any2txt') as tdir:
-        oebdir = os.path.join(tdir, 'oeb')
-        os.mkdir(oebdir)
-        opts.output = os.path.join(tdir, 'dummy.epub')
-        opts.profile = 'None'
-        opts.dont_split_on_page_breaks = True
-        orig_bfs = opts.base_font_size2
-        opts.base_font_size2 = 0
-        any2epub(opts, path, create_epub=False, oeb_cover=False, extract_to=oebdir)
-        opts.base_font_size2 = orig_bfs
-        opf = glob.glob(os.path.join(oebdir, '*.opf'))[0]
-        opts.output = orig_output
-        logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...'))
-        oeb2txt(opts, opf)
-
-def main(args=sys.argv):
-    parser = option_parser()
-    opts, args = parser.parse_args(args)
-    if len(args) < 2:
-        parser.print_help()
-        print 'No input file specified.'
-        return 1
-    any2txt(opts, args[1])
-    
-if __name__ == '__main__':
-    sys.exit(main())
diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py
new file mode 100644
index 0000000000..21498074ac
--- /dev/null
+++ b/src/calibre/ebooks/txt/output.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata
+from calibre.ebooks.metadata import authors_to_string
+
+class TXTOutput(OutputFormatPlugin):
+
+    name = 'TXT Output'
+    author = 'John Schember'
+    file_type = 'txt'
+
+    options = set([
+                    OptionRecommendation(name='newline', recommended_value='system',
+                        level=OptionRecommendation.LOW, long_switch='newline',
+                        short_switch='n', choices=TxtNewlines.NEWLINE_TYPES.keys(),
+                        help=_('Type of newline to use. Options are %s. Default is \'system\'. '
+                            'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
+                            'For Mac OS X use \'unix\'. \'system\' will default to the newline '
+                            'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))),
+                    OptionRecommendation(name='prepend_author', recommended_value='true',
+                        level=OptionRecommendation.LOW, long_switch='prepend_author',
+                        choices=['true', 'false'],
+                        help=_('Write the author to the beginning of the file. '
+                            'Default is \'true\'. Use \'false\' to disable.')),
+                    OptionRecommendation(name='prepend_title', recommended_value='true',
+                        choices=['true', 'false'],
+                        level=OptionRecommendation.LOW, long_switch='prepend_title',
+                        help=_('Write the title to the beginning of the file. '
+                            'Default is \'true\'. Use \'false\' to disable.'))
+                 ])
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        metadata = TxtMetadata()
+        if opts.prepend_author.lower() == 'true':
+            metadata.author = opts.authors if opts.authors else authors_to_string(oeb_book.metadata.authors)
+        if opts.prepend_title.lower() == 'true':
+            metadata.title = opts.title if opts.title else oeb_book.metadata.title
+
+        writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
+        txt = writer.dump(oeb_book.spine, metadata)
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = open(output_path, 'wb')
+        else:
+            out_stream = output_path
+        
+        out_stream.seek(0)
+        out_stream.write(txt)
+        
+        if close:
+            out_stream.close()
diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py
index 205d8423e3..eabc2d64ed 100644
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -1,34 +1,26 @@
 # -*- coding: utf-8 -*-
+from __future__ import with_statement
 '''
 Write content to TXT.
 '''
-from __future__ import with_statement
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
 
-import os, logging, re, sys
+import os, re, sys
+
+from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
 
 from BeautifulSoup import BeautifulSoup
 
-from calibre import LoggingInterface
-from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
-from calibre.ebooks.epub.iterator import SpineItem
-from calibre.ebooks.metadata import authors_to_string
-from calibre.ebooks.metadata.meta import metadata_from_formats
-from calibre.ebooks.metadata.opf2 import OPF
-from calibre.customize.ui import run_plugins_on_postprocess
-from calibre.utils.config import Config, StringConfig
-
-class TXTWriter(object):
-    def __init__(self, newline):
+class TxtWriter(object):
+    def __init__(self, newline, log):
         self.newline = newline
+        self.log = log
 
-    def dump(self, oebpath, path, metadata):
-        opf = OPF(oebpath, os.path.dirname(oebpath))
-        spine = [SpineItem(i.path) for i in opf.spine]
-
-        tmpout = ''
+    def dump(self, spine, metadata):
+        out = u''
         for item in spine:
             with open(item, 'r') as itemf:
                 content = itemf.read().decode(item.encoding)
@@ -39,25 +31,21 @@ class TXTWriter(object):
                 content = self.replace_html_symbols(content)
                 content = self.cleanup_text(content)
                 content = self.specified_newlines(content)
-                tmpout = tmpout + content
+                out += content
 
         # Prepend metadata
         if metadata.author != None and metadata.author != '':
-            tmpout = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + tmpout
+            out = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + out
         if metadata.title != None and metadata.title != '':
-            tmpout = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + tmpout
+            out = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + out
 
             # Put two blank lines at end of file
-
-            end = tmpout[-3 * len(self.newline):]
+            end = out[-3 * len(self.newline):]
             for i in range(3 - end.count(self.newline)):
-                tmpout = tmpout + self.newline
+                out += self.newline
+
+        return out
 
-        if os.path.exists(path):
-            os.remove(path)
-        with open(path, 'w+b') as out:
-            out.write(tmpout.encode('utf-8'))
-            
     def strip_html(self, html):
         stripped = u''
         
@@ -149,14 +137,8 @@ class TXTWriter(object):
         if self.newline == '\n':
             return text
         
-        return text.replace('\n', self.newline)
-        
-class TxtMetadata(object):
-    def __init__(self):
-        self.author = None
-        self.title = None
-        self.series = None
-        
+        return text.replace('\n', self.newline)        
+
 
 class TxtNewlines(object):
     NEWLINE_TYPES = {
@@ -170,73 +152,7 @@ class TxtNewlines(object):
         self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)
 
 
-def config(defaults=None):
-    desc = _('Options to control the conversion to TXT')
-    if defaults is None:
-        c = Config('txt', desc)
-    else:
-        c = StringConfig(defaults, desc)
-        
-    txt = c.add_group('TXT', _('TXT options.'))
-            
-    txt('newline', ['--newline'], default='system',
-        help=_('Type of newline to use. Options are %s. Default is \'system\'. '
-            'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
-            'For Mac OS X use \'unix\'. \'system\' will default to the newline '
-            'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys())))
-    txt('prepend_author', ['--prepend-author'], default='true',
-        help=_('Write the author to the beginning of the file. '
-            'Default is \'true\'. Use \'false\' to disable.'))
-    txt('prepend_title', ['--prepend-title'], default='true',
-        help=_('Write the title to the beginning of the file. '
-            'Default is \'true\'. Use \'false\' to disable.'))
-        
-    return c
-
-def option_parser():
-    c = config()
-    parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
-    parser.add_option(
-        '-o', '--output', default=None, 
-        help=_('Output file. Default is derived from input filename.'))
-    parser.add_option(
-        '-v', '--verbose', default=0, action='count',
-        help=_('Useful for debugging.'))        
-    return parser
-
-def oeb2txt(opts, inpath):
-    logger = LoggingInterface(logging.getLogger('oeb2txt'))
-    logger.setup_cli_handler(opts.verbose)
-    
-    outpath = opts.output
-    if outpath is None:
-        outpath = os.path.basename(inpath)
-        outpath = os.path.splitext(outpath)[0] + '.txt'
-
-    mi = metadata_from_formats([inpath])
-    metadata = TxtMetadata()
-    if opts.prepend_author.lower() == 'true':
-        metadata.author = opts.authors if opts.authors else authors_to_string(mi.authors)
-    if opts.prepend_title.lower() == 'true':
-        metadata.title = opts.title if opts.title else mi.title
-
-    newline = TxtNewlines(opts.newline)
-    
-    writer = TXTWriter(newline.newline)
-    writer.dump(inpath, outpath, metadata)
-    run_plugins_on_postprocess(outpath, 'txt')
-    logger.log_info(_('Output written to ') + outpath)
-    
-def main(argv=sys.argv):
-    parser = option_parser()
-    opts, args = parser.parse_args(argv[1:])
-    if len(args) != 1:
-        parser.print_help()
-        return 1
-    inpath = args[0]
-    retval = oeb2txt(opts, inpath)
-    return retval
-
-if __name__ == '__main__':
-    sys.exit(main())
-
+class TxtMetadata(object):
+    def __init__(self):
+        self.title = None
+        self.author = None

From 79e509eeb48bf7156e62bae9ca9291311dd25778 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 31 Mar 2009 20:23:49 -0400
Subject: [PATCH 14/16] Move PDF output to use new conversion framework

---
 src/calibre/customize/builtins.py  |  3 +-
 src/calibre/ebooks/pdf/from_any.py | 69 ---------------------
 src/calibre/ebooks/pdf/output.py   | 62 +++++++++++++++++++
 src/calibre/ebooks/pdf/writer.py   | 99 +++++-------------------------
 src/calibre/ebooks/txt/output.py   |  1 +
 5 files changed, 79 insertions(+), 155 deletions(-)
 delete mode 100644 src/calibre/ebooks/pdf/from_any.py
 create mode 100644 src/calibre/ebooks/pdf/output.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index acc7ba71ec..932261c45d 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -267,9 +267,10 @@ from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.txt.output import TXTOutput
+from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles
 
-plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput]
+plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput, PDFOutput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                         x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
diff --git a/src/calibre/ebooks/pdf/from_any.py b/src/calibre/ebooks/pdf/from_any.py
deleted file mode 100644
index e4fb937cdb..0000000000
--- a/src/calibre/ebooks/pdf/from_any.py
+++ /dev/null
@@ -1,69 +0,0 @@
-'''
-Convert any ebook format to PDF.
-'''
-
-from __future__ import with_statement
-
-__license__   = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \
-    'and Marshall T. Vandegrift <llasram@gmail.com>' \
-    'and John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
-
-import sys, os, glob, logging
-
-from calibre.ebooks.epub.from_any import any2epub, formats, USAGE
-from calibre.ebooks.epub import config as common_config
-from calibre.ptempfile import TemporaryDirectory
-from calibre.ebooks.pdf.writer import oeb2pdf, config as pdf_config
-
-def config(defaults=None):
-    c = common_config(defaults=defaults, name='pdf')
-    c.remove_opt('profile')
-    pdfc = pdf_config(defaults=defaults)
-    c.update(pdfc)
-    return c 
-
-def option_parser(usage=USAGE):
-    usage = usage % ('PDF', formats())
-    parser = config().option_parser(usage=usage)
-    return parser
-
-def any2pdf(opts, path, notification=None):
-    ext = os.path.splitext(path)[1]
-    if not ext:
-        raise ValueError('Unknown file type: '+path)
-    ext = ext.lower()[1:]
-    
-    if opts.output is None:
-        opts.output = os.path.splitext(os.path.basename(path))[0]+'.pdf'
-    
-    opts.output = os.path.abspath(opts.output)
-    orig_output = opts.output
-    
-    with TemporaryDirectory('_any2pdf') as tdir:
-        oebdir = os.path.join(tdir, 'oeb')
-        os.mkdir(oebdir)
-        opts.output = os.path.join(tdir, 'dummy.epub')
-        opts.profile = 'None'
-        opts.dont_split_on_page_breaks = True
-        orig_bfs = opts.base_font_size2
-        opts.base_font_size2 = 0
-        any2epub(opts, path, create_epub=False, oeb_cover=True, extract_to=oebdir)
-        opts.base_font_size2 = orig_bfs
-        opf = glob.glob(os.path.join(oebdir, '*.opf'))[0]
-        opts.output = orig_output
-        logging.getLogger('html2epub').info(_('Creating PDF file from EPUB...'))
-        oeb2pdf(opts, opf)
-
-def main(args=sys.argv):
-    parser = option_parser()
-    opts, args = parser.parse_args(args)
-    if len(args) < 2:
-        parser.print_help()
-        print 'No input file specified.'
-        return 1
-    any2pdf(opts, args[1])
-    
-if __name__ == '__main__':
-    sys.exit(main())
diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py
new file mode 100644
index 0000000000..71bd77ee73
--- /dev/null
+++ b/src/calibre/ebooks/pdf/output.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Convert OEB ebook format to PDF.
+'''
+
+#unit, papersize, orientation, custom_size, profile
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+from calibre.ebooks.pdf.writer import PDFWriter, PDFMargins
+
+class PDFOutput(OutputFormatPlugin):
+
+    name = 'PDF Output'
+    author = 'John Schember'
+    file_type = 'pdf'
+
+    options = set([
+                    OptionRecommendation(name='margin_top', recommended_value='1',
+                        level=OptionRecommendation.LOW, long_switch='margin_top',
+                        help=_('The top margin around the document.')),
+                    OptionRecommendation(name='margin_bottom', recommended_value='1',
+                        level=OptionRecommendation.LOW, long_switch='margin_bottom',
+                        help=_('The bottom margin around the document.')),
+                    OptionRecommendation(name='margin_left', recommended_value='1',
+                        level=OptionRecommendation.LOW, long_switch='margin_left',
+                        help=_('The left margin around the document.')),
+                    OptionRecommendation(name='margin_right', recommended_value='1',
+                        level=OptionRecommendation.LOW, long_switch='margin_right',
+                        help=_('The right margin around the document.')),
+                 ])
+                 
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        margins = PDFMargins()
+        margins.top = opts.margin_top
+        margins.bottom = opts.margin_bottom
+        margins.left = opts.margin_left
+        margins.right = opts.margin_right
+    
+        writer = PDFWriter(log, margins)
+        
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = open(output_path, 'wb')
+        else:
+            out_stream = output_path
+        
+        out_stream.seek(0)
+        out_stream.truncate()
+        writer.dump(oeb_book.spine, out_stream)
+        
+        if close:
+            out_stream.close()
diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py
index c189407dac..511c968a20 100644
--- a/src/calibre/ebooks/pdf/writer.py
+++ b/src/calibre/ebooks/pdf/writer.py
@@ -1,20 +1,17 @@
-'''
-Write content to PDF.
-'''
+# -*- coding: utf-8 -*-
 from __future__ import with_statement
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
 
-import os, logging, shutil, sys
+'''
+Write content to PDF.
+'''
+
+import os, shutil, sys
 
-from calibre import LoggingInterface
-from calibre.ebooks.epub.iterator import SpineItem
-from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ptempfile import PersistentTemporaryDirectory
-from calibre.customize.ui import run_plugins_on_postprocess
-from calibre.utils.config import Config, StringConfig
-
 from PyQt4 import QtCore
 from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \
     QMetaObject, Qt
@@ -29,13 +26,14 @@ class PDFMargins:
         self.left   = margin
         self.right  = margin
         
+        
 class PDFWriter(QObject):
-    def __init__(self, margins=PDFMargins()):
+    def __init__(self, log, margins=PDFMargins()):
         if QApplication.instance() is None:
             QApplication([])
         QObject.__init__(self)
         
-        self.logger = logging.getLogger('oeb2pdf')
+        self.logger = log
         
         self.loop = QEventLoop()
         self.view = QWebView()
@@ -45,13 +43,12 @@ class PDFWriter(QObject):
         self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts')
         self.margins = margins
 
-    def dump(self, oebpath, path):
+    def dump(self, spine, out_stream):
         self._delete_tmpdir()
         
-        opf = OPF(oebpath, os.path.dirname(oebpath))
-        self.render_queue = [SpineItem(i.path) for i in opf.spine]
+        self.render_queue = spine[:]
         self.combine_queue = []
-        self.path = path
+        self.out_stream = out_stream
         
         QMetaObject.invokeMethod(self, "_render_book", Qt.QueuedConnection)
         self.loop.exec_()
@@ -98,75 +95,7 @@ class PDFWriter(QObject):
                 inputPDF = PdfFileReader(file(item, 'rb'))
                 for page in inputPDF.pages:
                     outPDF.addPage(page)
-            outputStream = file(self.path, 'wb')
-            outPDF.write(outputStream)
-            outputStream.close()
+            outPDF.write(self.out_stream)
         finally:
             self._delete_tmpdir()
             self.loop.exit(0)
-
-
-def config(defaults=None):
-    desc = _('Options to control the conversion to PDF')
-    if defaults is None:
-        c = Config('pdf', desc)
-    else:
-        c = StringConfig(defaults, desc)
-        
-    pdf = c.add_group('PDF', _('PDF options.'))
-            
-    pdf('margin_top', ['--margin_top'], default=1,
-         help=_('The top margin around the document in inches.'))
-    pdf('margin_bottom', ['--margin_bottom'], default=1,
-         help=_('The bottom margin around the document in inches.'))
-    pdf('margin_left', ['--margin_left'], default=1,
-         help=_('The left margin around the document in inches.'))
-    pdf('margin_right', ['--margin_right'], default=1,
-         help=_('The right margin around the document in inches.'))
-    
-    return c
-
-def option_parser():
-    c = config()
-    parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
-    parser.add_option(
-        '-o', '--output', default=None, 
-        help=_('Output file. Default is derived from input filename.'))
-    parser.add_option(
-        '-v', '--verbose', default=0, action='count',
-        help=_('Useful for debugging.'))        
-    return parser
-
-def oeb2pdf(opts, inpath):
-    logger = LoggingInterface(logging.getLogger('oeb2pdf'))
-    logger.setup_cli_handler(opts.verbose)
-    
-    outpath = opts.output
-    if outpath is None:
-        outpath = os.path.basename(inpath)
-        outpath = os.path.splitext(outpath)[0] + '.pdf'
-
-    margins = PDFMargins()
-    margins.top = opts.margin_top
-    margins.bottom = opts.margin_bottom
-    margins.left = opts.margin_left
-    margins.right = opts.margin_right
-
-    writer = PDFWriter(margins)
-    writer.dump(inpath, outpath)
-    run_plugins_on_postprocess(outpath, 'pdf')
-    logger.log_info(_('Output written to ') + outpath)
-    
-def main(argv=sys.argv):
-    parser = option_parser()
-    opts, args = parser.parse_args(argv[1:])
-    if len(args) != 1:
-        parser.print_help()
-        return 1
-    inpath = args[0]
-    retval = oeb2pdf(opts, inpath)
-    return retval
-
-if __name__ == '__main__':
-    sys.exit(main())
-    
diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py
index 21498074ac..7d44172b3f 100644
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@@ -56,6 +56,7 @@ class TXTOutput(OutputFormatPlugin):
             out_stream = output_path
         
         out_stream.seek(0)
+        out_stream.truncate()
         out_stream.write(txt)
         
         if close:

From 596e3f71388cef57c1e7593c796431a984e66233 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Wed, 1 Apr 2009 07:39:41 -0400
Subject: [PATCH 15/16] More robust pdf output

---
 src/calibre/ebooks/pdf/output.py      | 37 ++++++++--
 src/calibre/ebooks/pdf/pageoptions.py | 98 +++++++++++++++++++++++++++
 src/calibre/ebooks/pdf/writer.py      | 18 ++---
 3 files changed, 135 insertions(+), 18 deletions(-)
 create mode 100644 src/calibre/ebooks/pdf/pageoptions.py

diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py
index 71bd77ee73..5af4e4bed7 100644
--- a/src/calibre/ebooks/pdf/output.py
+++ b/src/calibre/ebooks/pdf/output.py
@@ -13,7 +13,9 @@ import os
 
 from calibre.customize.conversion import OutputFormatPlugin, \
     OptionRecommendation
-from calibre.ebooks.pdf.writer import PDFWriter, PDFMargins
+from calibre.ebooks.pdf.writer import PDFWriter
+from calibre.ebooks.pdf.pageoptions import UNITS, unit, PAPER_SIZES, \
+    paper_size, ORIENTATIONS, orientation, PageOptions
 
 class PDFOutput(OutputFormatPlugin):
 
@@ -34,16 +36,37 @@ class PDFOutput(OutputFormatPlugin):
                     OptionRecommendation(name='margin_right', recommended_value='1',
                         level=OptionRecommendation.LOW, long_switch='margin_right',
                         help=_('The right margin around the document.')),
+                        
+                    OptionRecommendation(name='unit', recommended_value='inch',
+                        level=OptionRecommendation.LOW, short_switch='u',
+                        long_switch='unit', choices=UNITS.keys(),
+                        help=_('The unit of measure. Default is inch. Choices '
+                        'are %s' % UNITS.keys())),
+                    OptionRecommendation(name='paper_size', recommended_value='letter',
+                        level=OptionRecommendation.LOW,
+                        long_switch='paper_size', choices=PAPER_SIZES.keys(),
+                        help=_('The size of the paper. Default is letter. Choices '
+                        'are %s' % PAPER_SIZES.keys())),
+                    OptionRecommendation(name='orientation', recommended_value='portrait',
+                        level=OptionRecommendation.LOW,
+                        long_switch='orientation', choices=ORIENTATIONS.keys(),
+                        help=_('The orientation of the page. Default is portrait. Choices '
+                        'are %s' % ORIENTATIONS.keys())),
                  ])
                  
     def convert(self, oeb_book, output_path, input_plugin, opts, log):
-        margins = PDFMargins()
-        margins.top = opts.margin_top
-        margins.bottom = opts.margin_bottom
-        margins.left = opts.margin_left
-        margins.right = opts.margin_right
+        popts = PageOptions()
+        
+        popts.set_margin_top(opts.margin_top)
+        popts.set_margin_bottom(opts.margin_bottom)
+        popts.set_margin_left(opts.margin_left)
+        popts.set_margin_right(opts.margin_right)
+        
+        popts.unit = unit(opts.unit)
+        popts.paper_size = paper_size(opts.paper_size)
+        popts.orientation = orientation(opts.orientation)
     
-        writer = PDFWriter(log, margins)
+        writer = PDFWriter(log, popts)
         
         close = False
         if not hasattr(output_path, 'write'):
diff --git a/src/calibre/ebooks/pdf/pageoptions.py b/src/calibre/ebooks/pdf/pageoptions.py
new file mode 100644
index 0000000000..26fae81662
--- /dev/null
+++ b/src/calibre/ebooks/pdf/pageoptions.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from PyQt4.Qt import QPrinter
+
+UNITS = {  # unit name (as given to the 'unit' option) -> QPrinter unit constant
+            'millimeter' : QPrinter.Millimeter,
+            'point' : QPrinter.Point,
+            'inch' : QPrinter.Inch,
+            'pica' : QPrinter.Pica,
+            'didot' : QPrinter.Didot,
+            'cicero' : QPrinter.Cicero,
+            'devicepixel' : QPrinter.DevicePixel,
+        }
+
+def unit(unit):  # Look up the QPrinter unit for a name; unknown names give QPrinter.Inch.
+    return UNITS.get(unit, QPrinter.Inch)
+
+PAPER_SIZES = {  # paper size name (as given to the 'paper_size' option) -> QPrinter constant
+                'a0' : QPrinter.A0, # 841 x 1189 mm
+                'a1' : QPrinter.A1, # 594 x 841 mm
+                'a2' : QPrinter.A2, # 420 x 594 mm
+                'a3' : QPrinter.A3, # 297 x 420 mm
+                'a4' : QPrinter.A4, # 210 x 297 mm, 8.26 x 11.69 inches
+                'a5' : QPrinter.A5, # 148 x 210 mm
+                'a6' : QPrinter.A6, # 105 x 148 mm
+                'a7' : QPrinter.A7, # 74 x 105 mm
+                'a8' : QPrinter.A8, # 52 x 74 mm
+                'a9' : QPrinter.A9, # 37 x 52 mm
+                'b0' : QPrinter.B0, # 1030 x 1456 mm
+                'b1' : QPrinter.B1, # 728 x 1030 mm
+                'b2' : QPrinter.B2, # 515 x 728 mm
+                'b3' : QPrinter.B3, # 364 x 515 mm
+                'b4' : QPrinter.B4, # 257 x 364 mm
+                'b5' : QPrinter.B5, # 182 x 257 mm, 7.17 x 10.13 inches
+                'b6' : QPrinter.B6, # 128 x 182 mm
+                'b7' : QPrinter.B7, # 91 x 128 mm
+                'b8' : QPrinter.B8, # 64 x 91 mm
+                'b9' : QPrinter.B9, # 45 x 64 mm
+                'b10' : QPrinter.B10, # 32 x 45 mm
+                'c5e' : QPrinter.C5E, # 163 x 229 mm
+                'comm10e' : QPrinter.Comm10E, # 105 x 241 mm, U.S. Common 10 Envelope
+                'dle' : QPrinter.DLE, # 110 x 220 mm
+                'executive' : QPrinter.Executive, # 7.5 x 10 inches, 191 x 254 mm
+                'folio' : QPrinter.Folio, # 210 x 330 mm
+                'ledger' : QPrinter.Ledger, # 432 x 279 mm
+                'legal' : QPrinter.Legal, # 8.5 x 14 inches, 216 x 356 mm
+                'letter' : QPrinter.Letter, # 8.5 x 11 inches, 216 x 279 mm
+                'tabloid' : QPrinter.Tabloid, # 279 x 432 mm
+                #'custom' : QPrinter.Custom, # Unknown, or a user defined size.
+             }
+
+def paper_size(size):  # Look up the QPrinter paper size for a name; unknown names give QPrinter.Letter.
+    return PAPER_SIZES.get(size, QPrinter.Letter)
+
+ORIENTATIONS = {  # orientation name (as given to the 'orientation' option) -> QPrinter constant
+                'portrait' : QPrinter.Portrait,
+                'landscape' : QPrinter.Landscape,
+               }
+
+def orientation(orientation):  # Look up the QPrinter orientation for a name; unknown names give QPrinter.Portrait.
+    return ORIENTATIONS.get(orientation, QPrinter.Portrait)
+
+
+class PageOptions(object):
+    # Page layout settings handed to the PDF writer: margins (measured in
+    # `unit`), paper size and orientation. The class attributes below are
+    # the defaults used when a value is never set.
+    margin_top = 1
+    margin_bottom = 1
+    margin_left = 1
+    margin_right = 1
+    unit = QPrinter.Inch
+    paper_size = QPrinter.Letter
+    orientation = QPrinter.Portrait
+
+    @staticmethod
+    def _to_margin(size):
+        # Accept fractional sizes such as '0.5'; anything that is not a
+        # number falls back to the default margin of 1.
+        try:
+            return float(size)
+        except (TypeError, ValueError):
+            return 1
+
+    def set_margin_top(self, size):
+        self.margin_top = self._to_margin(size)
+
+    def set_margin_bottom(self, size):
+        self.margin_bottom = self._to_margin(size)
+
+    def set_margin_left(self, size):
+        self.margin_left = self._to_margin(size)
+
+    def set_margin_right(self, size):
+        self.margin_right = self._to_margin(size)
diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py
index 511c968a20..cf77aebc14 100644
--- a/src/calibre/ebooks/pdf/writer.py
+++ b/src/calibre/ebooks/pdf/writer.py
@@ -12,23 +12,17 @@ Write content to PDF.
 import os, shutil, sys
 
 from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.ebooks.pdf.pageoptions import PageOptions
+
 from PyQt4 import QtCore
 from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \
     QMetaObject, Qt
 from PyQt4.QtWebKit import QWebView
 
 from pyPdf import PdfFileWriter, PdfFileReader
-    
-class PDFMargins:
-    def __init__(self, margin=1):
-        self.top    = margin
-        self.bottom = margin
-        self.left   = margin
-        self.right  = margin
-        
         
 class PDFWriter(QObject):
-    def __init__(self, log, margins=PDFMargins()):
+    def __init__(self, log, popts=PageOptions()):
         if QApplication.instance() is None:
             QApplication([])
         QObject.__init__(self)
@@ -41,7 +35,7 @@ class PDFWriter(QObject):
         self.render_queue = []
         self.combine_queue = []
         self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts')
-        self.margins = margins
+        self.popts = popts
 
     def dump(self, spine, out_stream):
         self._delete_tmpdir()
@@ -75,7 +69,9 @@ class PDFWriter(QObject):
             self.logger.debug('\tRendering item as %s' % item_path)
         
             printer = QPrinter(QPrinter.HighResolution)
-            printer.setPageMargins(self.margins.left, self.margins.top, self.margins.right, self.margins.bottom, QPrinter.Inch)
+            printer.setPageMargins(self.popts.margin_left, self.popts.margin_top, self.popts.margin_right, self.popts.margin_bottom, self.popts.unit)
+            printer.setPaperSize(self.popts.paper_size)
+            printer.setOrientation(self.popts.orientation)
             printer.setOutputFormat(QPrinter.PdfFormat)
             printer.setOutputFileName(item_path)
             self.view.print_(printer)

From 118fd6ece0625f9bb95657df74401abe46f775ad Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Wed, 1 Apr 2009 08:08:03 -0400
Subject: [PATCH 16/16] reverse pdfmanipulate command

---
 src/calibre/ebooks/pdf/manipulate.py | 11 ++--
 src/calibre/ebooks/pdf/reverse.py    | 88 ++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 5 deletions(-)
 create mode 100644 src/calibre/ebooks/pdf/reverse.py

diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py
index 262aaf78d4..8c49650730 100644
--- a/src/calibre/ebooks/pdf/manipulate.py
+++ b/src/calibre/ebooks/pdf/manipulate.py
@@ -10,13 +10,14 @@ __docformat__ = 'restructuredtext en'
 import string, sys
 
 from calibre.utils.config import Config, StringConfig
-from calibre.ebooks.pdf import info, merge, split, trim
+from calibre.ebooks.pdf import info, merge, reverse, split, trim
 
 COMMANDS = {
-             'info'  : info,
-             'merge' : merge,
-             'split' : split,
-             'trim'  : trim,
+             'info'    : info,
+             'merge'   : merge,
+             'reverse' : reverse,
+             'split'   : split,
+             'trim'    : trim,
            }
 
 def config(defaults=None):
diff --git a/src/calibre/ebooks/pdf/reverse.py b/src/calibre/ebooks/pdf/reverse.py
new file mode 100644
index 0000000000..87bb9018c1
--- /dev/null
+++ b/src/calibre/ebooks/pdf/reverse.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Reverse content of PDF.
+'''
+
+import os, sys
+
+from calibre.ebooks.metadata.meta import metadata_from_formats
+from calibre.ebooks.metadata import authors_to_string
+from calibre.utils.config import Config, StringConfig
+
+from pyPdf import PdfFileWriter, PdfFileReader
+
+def config(defaults=None):
+    # Build the option set for the reverse command. Without defaults the
+    # options live in the 'reversepdf' Config; otherwise they are held in a
+    # StringConfig created from the supplied defaults.
+    desc = _('Options to control the transformation of pdf')
+    conf = Config('reversepdf', desc) if defaults is None else StringConfig(defaults, desc)
+    conf.add_opt('output', ['-o', '--output'], default='reversed.pdf',
+          help=_('Path to output file. By default a file is created in the current directory.'))
+    return conf
+
+def option_parser(name):  # Translate the usage template first, then substitute the command name.
+    c = config()
+    return c.option_parser(usage=_('''\
+	%prog %%name [options] file1.pdf
+
+	Reverse PDF.
+	''').replace('%%name', name))
+
+def reverse(pdf_path, out_path, metadata=None):
+    # Write the pages of pdf_path to out_path in reverse order.
+    # metadata (optional) supplies the title/authors stored in the output.
+    if metadata is None:
+        title = _('Unknown')
+        author = _('Unknown')
+    else:
+        title = metadata.title
+        author = authors_to_string(metadata.authors)
+    out_pdf = PdfFileWriter(title=title, author=author)
+    # Keep the input open until the write is done (page objects may still
+    # reference the input stream), then close it instead of leaking it.
+    with open(os.path.abspath(pdf_path), 'rb') as in_file:
+        for page in reversed(PdfFileReader(in_file).pages):
+            out_pdf.addPage(page)
+        with open(out_path, 'wb') as out_file:
+            out_pdf.write(out_file)
+
+# Return True if the pdf can be opened, is not encrypted and has pages.
+def valid_pdf(pdf_path):
+    try:
+        with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
+            pdf = PdfFileReader(pdf_file)
+            # Encrypted or empty documents are reported as invalid.
+            return not pdf.isEncrypted and pdf.numPages > 0
+    except Exception:
+        # Unreadable or malformed file; KeyboardInterrupt still propagates.
+        return False
+
+
+def main(args=sys.argv, name=''):  # Returns 0 on success, 2 on a usage or input error.
+    parser = option_parser(name)
+    opts, args = parser.parse_args(args)
+    args = args[1:]  # first positional is the program/command name (sys.argv[0] by default)
+
+    if len(args) < 1:
+        print 'Error: A PDF file is required.\n\n'
+        print parser.get_usage()
+        return 2
+
+    if not valid_pdf(args[0]):
+        print 'Error: Could not read file `%s`. Is it a valid PDF file or is it encrypted/DRMed?' % args[0]
+        return 2
+
+    # Metadata read from the source file is passed on to the reversed copy.
+    mi = metadata_from_formats([args[0]])
+    reverse(args[0], opts.output, mi)
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())