txt output now uses new conversion pipeline

This commit is contained in:
John Schember 2009-03-31 18:41:49 -04:00
parent d3801fb00c
commit 90362ab56a
6 changed files with 90 additions and 185 deletions

View File

@ -160,7 +160,7 @@ class ODTMetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.odt import get_metadata from calibre.ebooks.metadata.odt import get_metadata
return get_metadata(stream) return get_metadata(stream)
class TXTMetadataReader(MetaReaderPlugin): class TXTMetadataReader(MetadataReaderPlugin):
name = 'Read TXT metadata' name = 'Read TXT metadata'
file_types = set(['txt']) file_types = set(['txt'])
@ -266,9 +266,10 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.customize.profiles import input_profiles, output_profiles from calibre.customize.profiles import input_profiles, output_profiles
plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput] plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')] x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View File

@ -195,7 +195,7 @@ OptionRecommendation(name='language',
self.input_fmt = input_fmt self.input_fmt = input_fmt
self.output_fmt = output_fmt self.output_fmt = output_fmt
# Build set of all possible options. Two options are equal iff their # Build set of all possible options. Two options are equal if their
# names are the same. # names are the same.
self.input_options = self.input_plugin.options.union( self.input_options = self.input_plugin.options.union(
self.input_plugin.common_options) self.input_plugin.common_options)

View File

@ -22,7 +22,7 @@ def get_metadata(stream, extract_cover=True):
else: else:
mdata += line mdata += line
mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*\n\n\n[ ]*(?P<author>.+)[ ]*\n$', mdata) mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata)
if mo != None: if mo != None:
mi.title = mo.group('title') mi.title = mo.group('title')
mi.authors = mo.group('author').split(',') mi.authors = mo.group('author').split(',')

View File

@ -1,74 +0,0 @@
'''
Convert any ebook format to TXT.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \
'and Marshall T. Vandegrift <llasram@gmail.com>' \
'and John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import sys, os, glob, logging
from calibre.ebooks.epub.from_any import any2epub, formats, USAGE
from calibre.ebooks.epub import config as common_config
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.txt.writer import oeb2txt, config as txt_config
def config(defaults=None):
    '''
    Build the option set used for any-format to TXT conversion.

    Starts from the common EPUB conversion configuration (registered under
    the name ``txt``), removes the ``profile`` option and the option groups
    listed below, then merges in the TXT writer options.

    :param defaults: Passed through unchanged to both ``common_config`` and
                     ``txt_config``.
    :return: The combined configuration object.
    '''
    conf = common_config(defaults=defaults, name='txt')
    conf.remove_opt('profile')

    # Option groups dropped from the common configuration.
    unwanted_groups = ('metadata', 'traversal', 'structure detection',
                       'toc', 'page layout')
    for group_name in unwanted_groups:
        del conf.option_set.groups[group_name]

    conf.update(txt_config(defaults=defaults))
    return conf
def option_parser(usage=USAGE):
    '''
    Create the command line option parser for the any2txt tool.

    :param usage: Usage template with two ``%s`` placeholders; they are
                  filled with ``'TXT'`` and the result of ``formats()``.
    :return: The parser produced by ``config().option_parser``.
    '''
    txt_usage = usage % ('TXT', formats())
    return config().option_parser(usage=txt_usage)
def any2txt(opts, path, notification=None):
    '''
    Convert the ebook at `path` to a TXT file.

    The input is first converted to an exploded OEB directory inside a
    temporary directory (via ``any2epub`` with ``create_epub=False``), and
    the resulting OPF is then handed to ``oeb2txt``.

    :param opts: Parsed options object. ``opts.output`` is replaced by its
                 absolute path (derived from the input file name when it is
                 None). ``opts.profile`` is set to ``'None'`` and
                 ``opts.dont_split_on_page_breaks`` to True, and both are left
                 that way. ``opts.base_font_size2`` is forced to 0 only while
                 ``any2epub`` runs.
    :param path: Path to the input file; it must have a file extension.
    :param notification: Unused by this function.
    :raises ValueError: If `path` has no extension, or if the intermediate
                        conversion produced no OPF file.
    '''
    ext = os.path.splitext(path)[1]
    if not ext:
        raise ValueError('Unknown file type: '+path)
    ext = ext.lower()[1:]

    if opts.output is None:
        opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt'
    opts.output = os.path.abspath(opts.output)
    orig_output = opts.output

    with TemporaryDirectory('_any2txt') as tdir:
        oebdir = os.path.join(tdir, 'oeb')
        os.mkdir(oebdir)

        orig_bfs = opts.base_font_size2
        opts.output = os.path.join(tdir, 'dummy.epub')
        opts.profile = 'None'
        opts.dont_split_on_page_breaks = True
        opts.base_font_size2 = 0
        try:
            any2epub(opts, path, create_epub=False, oeb_cover=False,
                     extract_to=oebdir)
        finally:
            # Restore the caller's values even when the intermediate
            # conversion fails, so a reused opts object is not left pointing
            # at the (soon to be deleted) temporary directory.
            opts.base_font_size2 = orig_bfs
            opts.output = orig_output

        opf_files = glob.glob(os.path.join(oebdir, '*.opf'))
        if not opf_files:
            # Give a meaningful error instead of a bare IndexError.
            raise ValueError('No OPF file produced for: '+path)

        logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...'))
        oeb2txt(opts, opf_files[0])
def main(args=sys.argv):
    '''
    Command line entry point for the any2txt tool.

    :param args: Full argument vector; after option parsing, ``args[1]`` is
                 taken as the input file.
    :return: 1 when no input file was given (after printing the help text),
             0 after a completed conversion.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
        # Parenthesised form prints the same text under Python 2 and is also
        # valid syntax under Python 3.
        print('No input file specified.')
        return 1
    any2txt(opts, args[1])
    # Explicit success status; sys.exit() treats it the same as None.
    return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata
from calibre.ebooks.metadata import authors_to_string
class TXTOutput(OutputFormatPlugin):
    '''
    Output plugin for the conversion pipeline that writes a book as TXT.

    The text is produced by ``TxtWriter.dump`` from the book's spine and is
    written out encoded as UTF-8.
    '''

    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'

    options = set([
        OptionRecommendation(name='newline', recommended_value='system',
            level=OptionRecommendation.LOW, long_switch='newline',
            short_switch='n', choices=TxtNewlines.NEWLINE_TYPES.keys(),
            # Interpolate after translating: the catalogue lookup must see
            # the constant template, not the already formatted text.
            help=_('Type of newline to use. Options are %s. Default is \'system\'. '
                'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
                'For Mac OS X use \'unix\'. \'system\' will default to the newline '
                'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys())),
        OptionRecommendation(name='prepend_author', recommended_value='true',
            level=OptionRecommendation.LOW, long_switch='prepend_author',
            choices=['true', 'false'],
            help=_('Write the author to the beginning of the file. '
                'Default is \'true\'. Use \'false\' to disable.')),
        OptionRecommendation(name='prepend_title', recommended_value='true',
            choices=['true', 'false'],
            level=OptionRecommendation.LOW, long_switch='prepend_title',
            help=_('Write the title to the beginning of the file. '
                'Default is \'true\'. Use \'false\' to disable.'))
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write `oeb_book` as text to `output_path`.

        :param oeb_book: Book to convert; its ``spine`` and ``metadata`` are
                         read.
        :param output_path: Either a filesystem path, or an object with a
                            ``write`` method which is used as the output
                            stream directly (and is not closed here).
        :param input_plugin: Unused by this plugin.
        :param opts: Conversion options; reads ``newline``,
                     ``prepend_author``, ``prepend_title``, ``authors`` and
                     ``title``.
        :param log: Logger handed to the ``TxtWriter``.
        '''
        metadata = TxtMetadata()
        # User supplied author/title take precedence over the book's own.
        if opts.prepend_author.lower() == 'true':
            metadata.author = opts.authors if opts.authors else \
                    authors_to_string(oeb_book.metadata.authors)
        if opts.prepend_title.lower() == 'true':
            metadata.title = opts.title if opts.title else \
                    oeb_book.metadata.title

        writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
        txt = writer.dump(oeb_book.spine, metadata)

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            # Drop any previous content so a shorter result does not leave
            # stale bytes behind in a reused stream.
            out_stream.truncate()
            # dump() returns unicode text; encode explicitly instead of
            # relying on the implicit ASCII conversion of a binary stream,
            # which fails on any non-ASCII character.
            out_stream.write(txt.encode('utf-8'))
        finally:
            # Only close streams opened here, and do so even if writing fails.
            if close:
                out_stream.close()

View File

@ -1,34 +1,26 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import with_statement
''' '''
Write content to TXT. Write content to TXT.
''' '''
from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, logging, re, sys import os, re, sys
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
from calibre import LoggingInterface class TxtWriter(object):
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS def __init__(self, newline, log):
from calibre.ebooks.epub.iterator import SpineItem
from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.metadata.meta import metadata_from_formats
from calibre.ebooks.metadata.opf2 import OPF
from calibre.customize.ui import run_plugins_on_postprocess
from calibre.utils.config import Config, StringConfig
class TXTWriter(object):
def __init__(self, newline):
self.newline = newline self.newline = newline
self.log = log
def dump(self, oebpath, path, metadata): def dump(self, spine, metadata):
opf = OPF(oebpath, os.path.dirname(oebpath)) out = u''
spine = [SpineItem(i.path) for i in opf.spine]
tmpout = ''
for item in spine: for item in spine:
with open(item, 'r') as itemf: with open(item, 'r') as itemf:
content = itemf.read().decode(item.encoding) content = itemf.read().decode(item.encoding)
@ -39,24 +31,20 @@ class TXTWriter(object):
content = self.replace_html_symbols(content) content = self.replace_html_symbols(content)
content = self.cleanup_text(content) content = self.cleanup_text(content)
content = self.specified_newlines(content) content = self.specified_newlines(content)
tmpout = tmpout + content out += content
# Prepend metadata # Prepend metadata
if metadata.author != None and metadata.author != '': if metadata.author != None and metadata.author != '':
tmpout = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + tmpout out = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + out
if metadata.title != None and metadata.title != '': if metadata.title != None and metadata.title != '':
tmpout = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + tmpout out = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + out
# Put two blank lines at end of file # Put two blank lines at end of file
end = out[-3 * len(self.newline):]
end = tmpout[-3 * len(self.newline):]
for i in range(3 - end.count(self.newline)): for i in range(3 - end.count(self.newline)):
tmpout = tmpout + self.newline out += self.newline
if os.path.exists(path): return out
os.remove(path)
with open(path, 'w+b') as out:
out.write(tmpout.encode('utf-8'))
def strip_html(self, html): def strip_html(self, html):
stripped = u'' stripped = u''
@ -151,12 +139,6 @@ class TXTWriter(object):
return text.replace('\n', self.newline) return text.replace('\n', self.newline)
class TxtMetadata(object):
def __init__(self):
self.author = None
self.title = None
self.series = None
class TxtNewlines(object): class TxtNewlines(object):
NEWLINE_TYPES = { NEWLINE_TYPES = {
@ -170,73 +152,7 @@ class TxtNewlines(object):
self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)
def config(defaults=None): class TxtMetadata(object):
desc = _('Options to control the conversion to TXT') def __init__(self):
if defaults is None: self.title = None
c = Config('txt', desc) self.author = None
else:
c = StringConfig(defaults, desc)
txt = c.add_group('TXT', _('TXT options.'))
txt('newline', ['--newline'], default='system',
help=_('Type of newline to use. Options are %s. Default is \'system\'. '
'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
'For Mac OS X use \'unix\'. \'system\' will default to the newline '
'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys())))
txt('prepend_author', ['--prepend-author'], default='true',
help=_('Write the author to the beginning of the file. '
'Default is \'true\'. Use \'false\' to disable.'))
txt('prepend_title', ['--prepend-title'], default='true',
help=_('Write the title to the beginning of the file. '
'Default is \'true\'. Use \'false\' to disable.'))
return c
def option_parser():
c = config()
parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
parser.add_option(
'-o', '--output', default=None,
help=_('Output file. Default is derived from input filename.'))
parser.add_option(
'-v', '--verbose', default=0, action='count',
help=_('Useful for debugging.'))
return parser
def oeb2txt(opts, inpath):
logger = LoggingInterface(logging.getLogger('oeb2txt'))
logger.setup_cli_handler(opts.verbose)
outpath = opts.output
if outpath is None:
outpath = os.path.basename(inpath)
outpath = os.path.splitext(outpath)[0] + '.txt'
mi = metadata_from_formats([inpath])
metadata = TxtMetadata()
if opts.prepend_author.lower() == 'true':
metadata.author = opts.authors if opts.authors else authors_to_string(mi.authors)
if opts.prepend_title.lower() == 'true':
metadata.title = opts.title if opts.title else mi.title
newline = TxtNewlines(opts.newline)
writer = TXTWriter(newline.newline)
writer.dump(inpath, outpath, metadata)
run_plugins_on_postprocess(outpath, 'txt')
logger.log_info(_('Output written to ') + outpath)
def main(argv=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(argv[1:])
if len(args) != 1:
parser.print_help()
return 1
inpath = args[0]
retval = oeb2txt(opts, inpath)
return retval
if __name__ == '__main__':
sys.exit(main())