any2txt converter

2025-07-09 03:04:10 -04:00 · 2009-03-20 19:39:26 -04:00 · 2009-03-20 19:39:26 -04:00 · 36fd295ca1
commit 36fd295ca1
parent 94b7149180
4 changed files with 539 additions and 0 deletions
--- a/src/calibre/ebooks/htmlsymbols.py
+++ b/src/calibre/ebooks/htmlsymbols.py
@ -0,0 +1,219 @@
+# -*- coding: utf-8 -*-
+'''
+Mapping of non-ASCII symbols to their corresponding HTML entity number and name
+'''
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+
+# Maps a unicode character to the HTML entities that represent it, as
+# [numeric entity, named entity]. Every entity includes its terminating
+# ';', so no entity is a prefix of another.
+#
+# http://www.w3schools.com/tags/ref_symbols.asp
+HTML_SYMBOLS = {
+                # Math Symbols
+                u'∀' : ['&#8704;', '&forall;'], # for all
+                u'∂' : ['&#8706;', '&part;'], # part
+                u'∃' : ['&#8707;', '&exists;'], # exists
+                u'∅' : ['&#8709;', '&empty;'], # empty
+                u'∇' : ['&#8711;', '&nabla;'], # nabla
+                u'∈' : ['&#8712;', '&isin;'], # isin
+                u'∉' : ['&#8713;', '&notin;'], # notin
+                u'∋' : ['&#8715;', '&ni;'], # ni
+                u'∏' : ['&#8719;', '&prod;'], # prod
+                u'∑' : ['&#8721;', '&sum;'], # sum
+                u'−' : ['&#8722;', '&minus;'], # minus
+                u'∗' : ['&#8727;', '&lowast;'], # lowast
+                u'√' : ['&#8730;', '&radic;'], # square root
+                u'∝' : ['&#8733;', '&prop;'], # proportional to
+                u'∞' : ['&#8734;', '&infin;'], # infinity
+                u'∠' : ['&#8736;', '&ang;'], # angle
+                u'∧' : ['&#8743;', '&and;'], # and
+                u'∨' : ['&#8744;', '&or;'], # or
+                u'∩' : ['&#8745;', '&cap;'], # cap
+                u'∪' : ['&#8746;', '&cup;'], # cup
+                u'∫' : ['&#8747;', '&int;'], # integral
+                u'∴' : ['&#8756;', '&there4;'], # therefore
+                u'∼' : ['&#8764;', '&sim;'], # similar to
+                u'≅' : ['&#8773;', '&cong;'], # approximately equal
+                u'≈' : ['&#8776;', '&asymp;'], # almost equal
+                u'≠' : ['&#8800;', '&ne;'], # not equal
+                u'≡' : ['&#8801;', '&equiv;'], # equivalent
+                u'≤' : ['&#8804;', '&le;'], # less or equal
+                u'≥' : ['&#8805;', '&ge;'], # greater or equal
+                u'⊂' : ['&#8834;', '&sub;'], # subset of
+                u'⊃' : ['&#8835;', '&sup;'], # superset of
+                u'⊄' : ['&#8836;', '&nsub;'], # not subset of
+                u'⊆' : ['&#8838;', '&sube;'], # subset or equal
+                u'⊇' : ['&#8839;', '&supe;'], # superset or equal
+                u'⊕' : ['&#8853;', '&oplus;'], # circled plus
+                u'⊗' : ['&#8855;', '&otimes;'], # circled times
+                u'⊥' : ['&#8869;', '&perp;'], # perpendicular
+                u'⋅' : ['&#8901;', '&sdot;'], # dot operator
+                # Greek Letters
+                u'Α' : ['&#913;', '&Alpha;'], # Alpha
+                u'Β' : ['&#914;', '&Beta;'], # Beta
+                u'Γ' : ['&#915;', '&Gamma;'], # Gamma
+                u'Δ' : ['&#916;', '&Delta;'], # Delta
+                u'Ε' : ['&#917;', '&Epsilon;'], # Epsilon
+                u'Ζ' : ['&#918;', '&Zeta;'], # Zeta
+                u'Η' : ['&#919;', '&Eta;'], # Eta
+                u'Θ' : ['&#920;', '&Theta;'], # Theta
+                u'Ι' : ['&#921;', '&Iota;'], # Iota
+                u'Κ' : ['&#922;', '&Kappa;'], # Kappa
+                u'Λ' : ['&#923;', '&Lambda;'], # Lambda
+                u'Μ' : ['&#924;', '&Mu;'], # Mu
+                u'Ν' : ['&#925;', '&Nu;'], # Nu
+                u'Ξ' : ['&#926;', '&Xi;'], # Xi
+                u'Ο' : ['&#927;', '&Omicron;'], # Omicron
+                u'Π' : ['&#928;', '&Pi;'], # Pi
+                u'Ρ' : ['&#929;', '&Rho;'], # Rho
+                u'Σ' : ['&#931;', '&Sigma;'], # Sigma
+                u'Τ' : ['&#932;', '&Tau;'], # Tau
+                u'Υ' : ['&#933;', '&Upsilon;'], # Upsilon
+                u'Φ' : ['&#934;', '&Phi;'], # Phi
+                u'Χ' : ['&#935;', '&Chi;'], # Chi
+                u'Ψ' : ['&#936;', '&Psi;'], # Psi
+                u'Ω' : ['&#937;', '&Omega;'], # Omega
+                u'α' : ['&#945;', '&alpha;'], # alpha
+                u'β' : ['&#946;', '&beta;'], # beta
+                u'γ' : ['&#947;', '&gamma;'], # gamma
+                u'δ' : ['&#948;', '&delta;'], # delta
+                u'ε' : ['&#949;', '&epsilon;'], # epsilon
+                u'ζ' : ['&#950;', '&zeta;'], # zeta
+                u'η' : ['&#951;', '&eta;'], # eta
+                u'θ' : ['&#952;', '&theta;'], # theta
+                u'ι' : ['&#953;', '&iota;'], # iota
+                u'κ' : ['&#954;', '&kappa;'], # kappa
+                u'λ' : ['&#955;', '&lambda;'], # lambda
+                u'μ' : ['&#956;', '&mu;'], # mu
+                u'ν' : ['&#957;', '&nu;'], # nu
+                u'ξ' : ['&#958;', '&xi;'], # xi
+                u'ο' : ['&#959;', '&omicron;'], # omicron
+                u'π' : ['&#960;', '&pi;'], # pi
+                u'ρ' : ['&#961;', '&rho;'], # rho
+                u'ς' : ['&#962;', '&sigmaf;'], # final sigma
+                u'σ' : ['&#963;', '&sigma;'], # sigma
+                u'τ' : ['&#964;', '&tau;'], # tau
+                u'υ' : ['&#965;', '&upsilon;'], # upsilon
+                u'φ' : ['&#966;', '&phi;'], # phi
+                u'χ' : ['&#967;', '&chi;'], # chi
+                u'ψ' : ['&#968;', '&psi;'], # psi
+                u'ω' : ['&#969;', '&omega;'], # omega
+                u'ϑ' : ['&#977;', '&thetasym;'], # theta symbol
+                u'ϒ' : ['&#978;', '&upsih;'], # upsilon symbol
+                u'ϖ' : ['&#982;', '&piv;'], # pi symbol
+                # Other
+                u'Œ' : ['&#338;', '&OElig;'], # capital ligature OE
+                u'œ' : ['&#339;', '&oelig;'], # small ligature oe
+                u'Š' : ['&#352;', '&Scaron;'], # capital S with caron
+                u'š' : ['&#353;', '&scaron;'], # small s with caron
+                u'Ÿ' : ['&#376;', '&Yuml;'], # capital Y with diaeresis
+                u'ƒ' : ['&#402;', '&fnof;'], # f with hook
+                u'ˆ' : ['&#710;', '&circ;'], # modifier letter circumflex accent
+                u'˜' : ['&#732;', '&tilde;'], # small tilde
+                u'–' : ['&#8211;', '&ndash;'], # en dash
+                u'—' : ['&#8212;', '&mdash;'], # em dash
+                u'‘' : ['&#8216;', '&lsquo;'], # left single quotation mark
+                u'’' : ['&#8217;', '&rsquo;'], # right single quotation mark
+                u'‚' : ['&#8218;', '&sbquo;'], # single low-9 quotation mark
+                u'“' : ['&#8220;', '&ldquo;'], # left double quotation mark
+                u'”' : ['&#8221;', '&rdquo;'], # right double quotation mark
+                u'„' : ['&#8222;', '&bdquo;'], # double low-9 quotation mark
+                u'†' : ['&#8224;', '&dagger;'], # dagger
+                u'‡' : ['&#8225;', '&Dagger;'], # double dagger
+                u'•' : ['&#8226;', '&bull;'], # bullet
+                u'…' : ['&#8230;', '&hellip;'], # horizontal ellipsis
+                u'‰' : ['&#8240;', '&permil;'], # per mille
+                u'′' : ['&#8242;', '&prime;'], # minutes
+                u'″' : ['&#8243;', '&Prime;'], # seconds
+                u'‹' : ['&#8249;', '&lsaquo;'], # single left angle quotation
+                u'›' : ['&#8250;', '&rsaquo;'], # single right angle quotation
+                u'‾' : ['&#8254;', '&oline;'], # overline
+                u'€' : ['&#8364;', '&euro;'], # euro
+                u'™' : ['&#8482;', '&trade;'], # trademark
+                u'←' : ['&#8592;', '&larr;'], # left arrow
+                u'↑' : ['&#8593;', '&uarr;'], # up arrow
+                u'→' : ['&#8594;', '&rarr;'], # right arrow
+                u'↓' : ['&#8595;', '&darr;'], # down arrow
+                u'↔' : ['&#8596;', '&harr;'], # left right arrow
+                u'↵' : ['&#8629;', '&crarr;'], # carriage return arrow
+                u'⌈' : ['&#8968;', '&lceil;'], # left ceiling
+                u'⌉' : ['&#8969;', '&rceil;'], # right ceiling
+                u'⌊' : ['&#8970;', '&lfloor;'], # left floor
+                u'⌋' : ['&#8971;', '&rfloor;'], # right floor
+                u'◊' : ['&#9674;', '&loz;'], # lozenge
+                u'♠' : ['&#9824;', '&spades;'], # spade
+                u'♣' : ['&#9827;', '&clubs;'], # club
+                u'♥' : ['&#9829;', '&hearts;'], # heart
+                u'♦' : ['&#9830;', '&diams;'], # diamond
+                # Extra http://www.ascii.cl/htmlcodes.htm
+                u'<' : ['&#60;', '&lt;'], # less than sign
+                u'>' : ['&#62;', '&gt;'], # greater than sign
+                u'¡' : ['&#161;', '&iexcl;'], # inverted exclamation mark
+                u'¢' : ['&#162;', '&cent;'], # cent sign
+                u'£' : ['&#163;', '&pound;'], # pound sign
+                u'¤' : ['&#164;', '&curren;'], # currency sign
+                u'¥' : ['&#165;', '&yen;'], # yen sign
+                u'¦' : ['&#166;', '&brvbar;'], # broken vertical bar
+                u'§' : ['&#167;', '&sect;'], # section sign
+                u'¨' : ['&#168;', '&uml;'], # spacing diaeresis - umlaut
+                u'©' : ['&#169;', '&copy;'], # copyright sign
+                u'ª' : ['&#170;', '&ordf;'], # feminine ordinal indicator
+                u'«' : ['&#171;', '&laquo;'], # left double angle quotes
+                u'¬' : ['&#172;', '&not;'], # not sign
+                u'®' : ['&#174;', '&reg;'], # registered trade mark sign
+                u'¯' : ['&#175;', '&macr;'], # spacing macron - overline
+                u'°' : ['&#176;', '&deg;'], # degree sign
+                u'±' : ['&#177;', '&plusmn;'], # plus-or-minus sign
+                u'²' : ['&#178;', '&sup2;'], # superscript two - squared
+                u'³' : ['&#179;', '&sup3;'], # superscript three - cubed
+                u'´' : ['&#180;', '&acute;'], # acute accent - spacing acute
+                u'µ' : ['&#181;', '&micro;'], # micro sign
+                u'¶' : ['&#182;', '&para;'], # pilcrow sign - paragraph sign
+                u'·' : ['&#183;', '&middot;'], # middle dot - Georgian comma
+                u'¸' : ['&#184;', '&cedil;'], # spacing cedilla
+                u'¹' : ['&#185;', '&sup1;'], # superscript one
+                u'º' : ['&#186;', '&ordm;'], # masculine ordinal indicator
+                u'»' : ['&#187;', '&raquo;'], # right double angle quotes
+                u'¼' : ['&#188;', '&frac14;'], # fraction one quarter
+                u'½' : ['&#189;', '&frac12;'], # fraction one half
+                u'¾' : ['&#190;', '&frac34;'], # fraction three quarters
+                u'¿' : ['&#191;', '&iquest;'], # inverted question mark
+                u'À' : ['&#192;', '&Agrave;'], # latin capital letter A with grave
+                u'Á' : ['&#193;', '&Aacute;'], # latin capital letter A with acute
+                u'Â' : ['&#194;', '&Acirc;'], # latin capital letter A with circumflex
+                u'Ã' : ['&#195;', '&Atilde;'], # latin capital letter A with tilde
+                u'Ä' : ['&#196;', '&Auml;'], # latin capital letter A with diaeresis
+                u'Å' : ['&#197;', '&Aring;'], # latin capital letter A with ring above
+                u'Æ' : ['&#198;', '&AElig;'], # latin capital letter AE
+                u'Ç' : ['&#199;', '&Ccedil;'], # latin capital letter C with cedilla
+                u'È' : ['&#200;', '&Egrave;'], # latin capital letter E with grave
+                u'É' : ['&#201;', '&Eacute;'], # latin capital letter E with acute
+                u'Ê' : ['&#202;', '&Ecirc;'], # latin capital letter E with circumflex
+                u'Ë' : ['&#203;', '&Euml;'], # latin capital letter E with diaeresis
+                u'Ì' : ['&#204;', '&Igrave;'], # latin capital letter I with grave
+                u'Í' : ['&#205;', '&Iacute;'], # latin capital letter I with acute
+                u'Î' : ['&#206;', '&Icirc;'], # latin capital letter I with circumflex
+                u'Ï' : ['&#207;', '&Iuml;'], # latin capital letter I with diaeresis
+                u'Ð' : ['&#208;', '&ETH;'], # latin capital letter ETH
+                u'Ñ' : ['&#209;', '&Ntilde;'], # latin capital letter N with tilde
+                u'Ò' : ['&#210;', '&Ograve;'], # latin capital letter O with grave
+                u'Ó' : ['&#211;', '&Oacute;'], # latin capital letter O with acute
+                u'Ô' : ['&#212;', '&Ocirc;'], # latin capital letter O with circumflex
+                u'Õ' : ['&#213;', '&Otilde;'], # latin capital letter O with tilde
+                u'Ö' : ['&#214;', '&Ouml;'], # latin capital letter O with diaeresis
+                u'×' : ['&#215;', '&times;'], # multiplication sign
+                u'Ø' : ['&#216;', '&Oslash;'], # latin capital letter O with slash
+                u'Ù' : ['&#217;', '&Ugrave;'], # latin capital letter U with grave
+                u'Ú' : ['&#218;', '&Uacute;'], # latin capital letter U with acute
+                u'Û' : ['&#219;', '&Ucirc;'], # latin capital letter U with circumflex
+                u'Ü' : ['&#220;', '&Uuml;'], # latin capital letter U with diaeresis
+                u'Ý' : ['&#221;', '&Yacute;'], # latin capital letter Y with acute
+                u'Þ' : ['&#222;', '&THORN;'], # latin capital letter THORN
+                u'ß' : ['&#223;', '&szlig;'], # latin small letter sharp s - ess-zed
+                u'à' : ['&#224;', '&agrave;'], # latin small letter a with grave
+                u'á' : ['&#225;', '&aacute;'], # latin small letter a with acute
+                u'â' : ['&#226;', '&acirc;'], # latin small letter a with circumflex
+                u'ã' : ['&#227;', '&atilde;'], # latin small letter a with tilde
+                u'ä' : ['&#228;', '&auml;'], # latin small letter a with diaeresis
+                u'å' : ['&#229;', '&aring;'], # latin small letter a with ring above
+                u'æ' : ['&#230;', '&aelig;'], # latin small letter ae
+                u'ç' : ['&#231;', '&ccedil;'], # latin small letter c with cedilla
+                u'è' : ['&#232;', '&egrave;'], # latin small letter e with grave
+                u'é' : ['&#233;', '&eacute;'], # latin small letter e with acute
+                u'ê' : ['&#234;', '&ecirc;'], # latin small letter e with circumflex
+                u'ë' : ['&#235;', '&euml;'], # latin small letter e with diaeresis
+                u'ì' : ['&#236;', '&igrave;'], # latin small letter i with grave
+                u'í' : ['&#237;', '&iacute;'], # latin small letter i with acute
+                u'î' : ['&#238;', '&icirc;'], # latin small letter i with circumflex
+                u'ï' : ['&#239;', '&iuml;'], # latin small letter i with diaeresis
+                u'ð' : ['&#240;', '&eth;'], # latin small letter eth
+                u'ñ' : ['&#241;', '&ntilde;'], # latin small letter n with tilde
+                u'ò' : ['&#242;', '&ograve;'], # latin small letter o with grave
+                u'ó' : ['&#243;', '&oacute;'], # latin small letter o with acute
+                u'ô' : ['&#244;', '&ocirc;'], # latin small letter o with circumflex
+                u'õ' : ['&#245;', '&otilde;'], # latin small letter o with tilde
+                u'ö' : ['&#246;', '&ouml;'], # latin small letter o with diaeresis
+                u'÷' : ['&#247;', '&divide;'], # division sign
+                u'ø' : ['&#248;', '&oslash;'], # latin small letter o with slash
+                u'ù' : ['&#249;', '&ugrave;'], # latin small letter u with grave
+                u'ú' : ['&#250;', '&uacute;'], # latin small letter u with acute
+                u'û' : ['&#251;', '&ucirc;'], # latin small letter u with circumflex
+                u'ü' : ['&#252;', '&uuml;'], # latin small letter u with diaeresis
+                u'ý' : ['&#253;', '&yacute;'], # latin small letter y with acute
+                u'þ' : ['&#254;', '&thorn;'], # latin small letter thorn
+                u'ÿ' : ['&#255;', '&yuml;'], # latin small letter y with diaeresis
+               }
+
--- a/src/calibre/ebooks/txt/init.py
+++ b/src/calibre/ebooks/txt/init.py
@ -0,0 +1,9 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, John Schember john@nachtimwald.com'
+__docformat__ = 'restructuredtext en'
+
+'''
+Used for txt output
+'''
+
--- a/src/calibre/ebooks/txt/from_any.py
+++ b/src/calibre/ebooks/txt/from_any.py
@ -0,0 +1,74 @@
+'''
+Convert any ebook format to TXT.
+'''
+
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+# Adjacent literals are concatenated as-is, so each piece carries its own
+# trailing space to keep the author names separated.
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \
+    'and Marshall T. Vandegrift <llasram@gmail.com> ' \
+    'and John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import sys, os, glob, logging
+
+from calibre.ebooks.epub.from_any import any2epub, formats, USAGE
+from calibre.ebooks.epub import config as common_config
+from calibre.ptempfile import TemporaryDirectory
+from calibre.ebooks.txt.writer import oeb2txt, config as txt_config
+
+def config(defaults=None):
+    '''
+    Build the option set used by any2txt.
+
+    Starts from the common EPUB conversion options (named 'txt'), drops the
+    'profile' option and the option groups listed below, then merges in the
+    options defined by the TXT writer.
+    '''
+    merged = common_config(defaults=defaults, name='txt')
+    merged.remove_opt('profile')
+    for unwanted in ('metadata', 'traversal', 'structure detection', 'toc',
+                     'page layout'):
+        del merged.option_set.groups[unwanted]
+    merged.update(txt_config(defaults=defaults))
+    return merged
+
+def option_parser(usage=USAGE):
+    '''
+    Return the command line parser for any2txt.
+
+    usage is a template with two %s placeholders; they are filled with
+    'TXT' and the result of formats().
+    '''
+    usage_text = usage % ('TXT', formats())
+    return config().option_parser(usage=usage_text)
+
+def any2txt(opts, path, notification=None):
+    '''
+    Convert the ebook at path to a TXT file.
+
+    The input is first exploded to an OEB directory inside a temporary
+    directory with any2epub (no EPUB file is created), and the resulting
+    OPF is then handed to oeb2txt.
+
+    opts is the parsed options object. If opts.output is None the output
+    name is derived from the input file name. opts.output is left set to
+    the absolute path of the TXT file and opts.base_font_size2 is put back
+    to its original value, even if the intermediate conversion fails.
+    notification is accepted but not used here.
+
+    Raises ValueError if path has no extension or if the intermediate
+    conversion produced no OPF file.
+    '''
+    if not os.path.splitext(path)[1]:
+        raise ValueError('Unknown file type: '+path)
+
+    if opts.output is None:
+        opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt'
+
+    opts.output = os.path.abspath(opts.output)
+    orig_output = opts.output
+
+    with TemporaryDirectory('_any2txt') as tdir:
+        oebdir = os.path.join(tdir, 'oeb')
+        os.mkdir(oebdir)
+        orig_bfs = opts.base_font_size2
+        # any2epub is only used to produce the exploded OEB in oebdir, so
+        # it gets a throwaway output name and neutral layout settings.
+        opts.output = os.path.join(tdir, 'dummy.epub')
+        opts.profile = 'None'
+        opts.dont_split_on_page_breaks = True
+        opts.base_font_size2 = 0
+        try:
+            any2epub(opts, path, create_epub=False, oeb_cover=False, extract_to=oebdir)
+        finally:
+            # Do not leave the caller's options pointing at the temporary
+            # directory if the conversion raised.
+            opts.base_font_size2 = orig_bfs
+            opts.output = orig_output
+        opfs = glob.glob(os.path.join(oebdir, '*.opf'))
+        if not opfs:
+            raise ValueError('No OPF file was produced for: '+path)
+        logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...'))
+        oeb2txt(opts, opfs[0])
+
+def main(args=sys.argv):
+    '''
+    Command line entry point for any2txt.
+
+    args is the full argument vector, program name included. Returns 1
+    after printing the help text when no input file is given; otherwise
+    converts args[1] and returns None.
+    '''
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    if len(args) >= 2:
+        any2txt(opts, args[1])
+        return
+    parser.print_help()
+    print 'No input file specified.'
+    return 1
+    
+if __name__ == '__main__':
+    sys.exit(main())
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@ -0,0 +1,237 @@
+# -*- coding: utf-8 -*-
+'''
+Write content to TXT.
+'''
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+
+import os, logging, re, sys
+
+from BeautifulSoup import BeautifulSoup
+
+from calibre import LoggingInterface
+from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
+from calibre.ebooks.epub.iterator import SpineItem
+from calibre.ebooks.metadata import authors_to_string
+from calibre.ebooks.metadata.meta import metadata_from_formats
+from calibre.ebooks.metadata.opf2 import OPF
+from calibre.customize.ui import run_plugins_on_postprocess
+from calibre.utils.config import Config, StringConfig
+
+class TXTWriter(object):
+    '''
+    Writes the spine of an OEB book, located through its OPF file, as a
+    single UTF-8 encoded plain text file.
+
+    newline is the line terminator string used in the output.
+    '''
+
+    # Markers put around heading text by strip_html. cleanup_text collapses
+    # runs of newlines, so the extra blank lines around headings can only
+    # be inserted after that step, in place of these markers.
+    HEADING_START = '-vzxedxy-'
+    HEADING_END = '-vlgzxey-'
+
+    # Entities that are not listed in HTML_SYMBOLS. The ampersand has to be
+    # decoded last so that '&amp;lt;' ends up as the text '&lt;'.
+    BASIC_ENTITIES = (
+        (u'"', ('&#34;', '&quot;')),
+        (u"'", ('&#39;', '&apos;')),
+        (u'\xa0', ('&#160;', '&nbsp;')),
+        (u'&', ('&#38;', '&amp;')),
+    )
+
+    def __init__(self, newline):
+        self.newline = newline
+
+    def dump(self, oebpath, path, metadata):
+        '''
+        Convert every spine item of the OPF at oebpath to text and write
+        the result to path.
+
+        metadata provides author and title attributes; each one that is
+        set is written in upper case at the top of the file, title first.
+        An existing file at path is removed before writing.
+        '''
+        opf = OPF(oebpath, os.path.dirname(oebpath))
+        spine = [SpineItem(i.path) for i in opf.spine]
+
+        parts = []
+        for item in spine:
+            # Read bytes: the content is decoded explicitly below.
+            with open(item, 'rb') as itemf:
+                content = itemf.read().decode(item.encoding)
+            # Convert newlines to unix style \n for processing. These
+            # will be changed to the specified type later in the process.
+            content = self.unix_newlines(content)
+            content = self.strip_html(content)
+            content = self.replace_html_symbols(content)
+            content = self.cleanup_text(content)
+            content = self.specified_newlines(content)
+            parts.append(content)
+        tmpout = u''.join(parts)
+
+        # Prepend metadata. The title is added after the author so that it
+        # ends up first in the file.
+        separator = self.newline * 3
+        if metadata.author:
+            tmpout = metadata.author.upper() + separator + tmpout
+        if metadata.title:
+            tmpout = metadata.title.upper() + separator + tmpout
+
+        # Put two blank lines at end of file, whether or not any metadata
+        # was prepended.
+        end = tmpout[-3 * len(self.newline):]
+        tmpout = tmpout + self.newline * (3 - end.count(self.newline))
+
+        # The output may not exist yet, so only remove what is there.
+        if os.path.exists(path):
+            os.remove(path)
+        with open(path, 'w+b') as out:
+            out.write(tmpout.encode('utf-8'))
+
+    def strip_html(self, html):
+        '''
+        Return the text of every body element in html with the markup
+        removed. Block level elements are separated by blank lines and
+        headings are wrapped in the heading markers.
+        '''
+        stripped = []
+
+        for dom_tree in BeautifulSoup(html).findAll('body'):
+            text = unicode(dom_tree)
+
+            # Remove unnecessary tags together with their content. The
+            # match is non-greedy and spans lines so that each element is
+            # removed on its own, without the text between two of them.
+            for tag in ['script', 'style']:
+                text = re.sub('(?imsu)<[ ]*%s[ ]*.*?>.*?</[ ]*%s[ ]*>' % (tag, tag), '', text)
+            text = re.sub('(?s)<!--.*?-->', '', text)
+
+            # Headings usually indicate Chapters.
+            # We are going to use a marker to insert the proper number of
+            # newline characters at the end of cleanup_text because
+            # cleanup_text removes excessive (more than 2) newlines.
+            for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+                text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, self.HEADING_START, text)
+                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, self.HEADING_END, text)
+
+            # Separate content with space.
+            for tag in ['td']:
+                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, ' ', text)
+
+            # Separate content with empty line.
+            for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
+                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
+
+            for tag in ['hr', 'br']:
+                text = re.sub('(?imu)<[ ]*%s[ ]*/*?>' % tag, '\n\n', text)
+
+            # Remove any tags that do not need special processing.
+            text = re.sub('<.*?>', '', text)
+
+            stripped.append(text)
+
+        return u''.join(stripped)
+
+    def replace_html_symbols(self, content):
+        '''
+        Replace the numeric and named HTML entities in content with the
+        characters they stand for.
+        '''
+        for symbol in HTML_SYMBOLS:
+            for code in HTML_SYMBOLS[symbol]:
+                content = content.replace(code, symbol)
+        for symbol, codes in self.BASIC_ENTITIES:
+            for code in codes:
+                content = content.replace(code, symbol)
+        return content
+
+    def cleanup_text(self, text):
+        '''
+        Normalise the whitespace of text, which must use \\n newlines.
+
+        Single newlines inside a paragraph become spaces, runs of spaces
+        and of blank lines are collapsed, and the heading markers are
+        replaced with the blank lines that surround a heading.
+        '''
+        # A no-break space whose UTF-8 bytes were decoded one by one shows
+        # up as u'\xc2\xa0'. Only that pair is treated as bad input: a lone
+        # u'\xc2' is the letter A with circumflex and is kept.
+        text = text.replace(u'\xc2\xa0', ' ')
+        text = text.replace(u'\xa0', ' ')
+
+        # Replace tabs, vertical tabs and form feeds with single space.
+        text = re.sub('[\t\v\f]+', ' ', text)
+
+        # Single line paragraph: a newline with text on both sides is a
+        # soft line break.
+        text = re.sub('(?<=[^\n])\n(?=[^\n])', ' ', text)
+
+        # Remove multiple spaces.
+        text = re.sub(' {2,}', ' ', text)
+        text = re.sub('(?imu)^[ ]+', '', text)
+        text = re.sub('(?imu)[ ]+$', '', text)
+
+        # Remove excessive newlines.
+        text = re.sub('\n[ ]+\n', '\n\n', text)
+        text = re.sub('\n{3,}', '\n\n', text)
+
+        # Replace markers with the proper characters.
+        text = text.replace(self.HEADING_START, '\n\n\n\n\n')
+        text = text.replace(self.HEADING_END, '\n\n\n')
+
+        return text
+
+    def unix_newlines(self, text):
+        '''
+        Return text with \\r\\n and \\r line endings converted to \\n.
+        '''
+        text = text.replace('\r\n', '\n')
+        text = text.replace('\r', '\n')
+
+        return text
+
+    def specified_newlines(self, text):
+        '''
+        Return text with every \\n replaced by the configured newline.
+        '''
+        if self.newline == '\n':
+            return text
+
+        return text.replace('\n', self.newline)
+        
+class TxtMetadata(object):
+    '''
+    Plain container for the metadata handed to the TXT writer. All fields
+    start out as None.
+    '''
+
+    def __init__(self):
+        self.author = self.title = self.series = None
+        
+
+class TxtNewlines(object):
+    '''
+    Resolves a newline type name to the newline string it stands for.
+
+    The name is matched case-insensitively against NEWLINE_TYPES. A name
+    that is not listed there resolves to os.linesep.
+    '''
+
+    NEWLINE_TYPES = {
+        'system': os.linesep,
+        'unix': '\n',
+        'old_mac': '\r',
+        'windows': '\r\n',
+    }
+
+    def __init__(self, newline_type):
+        key = newline_type.lower()
+        if key in self.NEWLINE_TYPES:
+            self.newline = self.NEWLINE_TYPES[key]
+        else:
+            self.newline = os.linesep
+
+
+def config(defaults=None):
+    '''
+    Return the configuration object holding the TXT output options.
+
+    With defaults set to None a Config named 'txt' is created, otherwise a
+    StringConfig is built from defaults. The options newline,
+    prepend_author and prepend_title are added in a 'TXT' group.
+    '''
+    desc = _('Options to control the conversion to TXT')
+    if defaults is None:
+        c = Config('txt', desc)
+    else:
+        c = StringConfig(defaults, desc)
+
+    txt = c.add_group('TXT', _('TXT options.'))
+
+    # The list of newline types is substituted after the translation
+    # lookup, so that _() is called with the untranslated template.
+    txt('newline', ['--newline'], default='system',
+        help=_('Type of newline to use. Options are %s. Default is \'system\'. '
+            'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
+            'For Mac OS X use \'unix\'. \'system\' will default to the newline '
+            'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys()))
+    txt('prepend_author', ['--prepend-author'], default='true',
+        help=_('Write the author to the beginning of the file. '
+            'Default is \'true\'. Use \'false\' to disable.'))
+    txt('prepend_title', ['--prepend-title'], default='true',
+        help=_('Write the title to the beginning of the file. '
+            'Default is \'true\'. Use \'false\' to disable.'))
+
+    return c
+
+def option_parser():
+    '''
+    Return the command line parser for oeb2txt: the TXT options plus
+    -o/--output and -v/--verbose.
+    '''
+    usage_text = '%s%s%s' % ('%prog ', _('[options]'), ' file.opf')
+    parser = config().option_parser(usage=usage_text)
+    output_help = _('Output file. Default is derived from input filename.')
+    parser.add_option('-o', '--output', default=None, help=output_help)
+    verbose_help = _('Useful for debugging.')
+    parser.add_option('-v', '--verbose', default=0, action='count',
+                      help=verbose_help)
+    return parser
+
+def oeb2txt(opts, inpath):
+    '''
+    Write the book described by the OPF file at inpath as a TXT file.
+
+    opts is the parsed options object. opts.output names the output file;
+    when it is None the name is derived from inpath. The author and the
+    title are prepended when opts.prepend_author / opts.prepend_title are
+    'true'. Values given in opts.authors and opts.title take precedence
+    over the metadata read from inpath; both options are optional, since
+    the parser defined in this module does not provide them.
+    '''
+    logger = LoggingInterface(logging.getLogger('oeb2txt'))
+    logger.setup_cli_handler(opts.verbose)
+
+    outpath = opts.output
+    if outpath is None:
+        outpath = os.path.splitext(os.path.basename(inpath))[0] + '.txt'
+
+    mi = metadata_from_formats([inpath])
+    metadata = TxtMetadata()
+    if opts.prepend_author.lower() == 'true':
+        metadata.author = getattr(opts, 'authors', None) or authors_to_string(mi.authors)
+    if opts.prepend_title.lower() == 'true':
+        metadata.title = getattr(opts, 'title', None) or mi.title
+
+    newline = TxtNewlines(opts.newline)
+
+    writer = TXTWriter(newline.newline)
+    writer.dump(inpath, outpath, metadata)
+    run_plugins_on_postprocess(outpath, 'txt')
+    logger.log_info(_('Output written to ') + outpath)
+    
+def main(argv=sys.argv):
+    '''
+    Command line entry point for oeb2txt.
+
+    argv is the full argument vector, program name included. Exactly one
+    positional argument, the OPF file, is expected; otherwise the help
+    text is printed and 1 is returned. On success the return value of
+    oeb2txt is passed through.
+    '''
+    parser = option_parser()
+    opts, args = parser.parse_args(argv[1:])
+    if len(args) == 1:
+        return oeb2txt(opts, args[0])
+    parser.print_help()
+    return 1
+
+if __name__ == '__main__':
+    sys.exit(main())
+