mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
txt output now uses new conversion pipeline
This commit is contained in:
parent
d3801fb00c
commit
90362ab56a
@ -160,7 +160,7 @@ class ODTMetadataReader(MetadataReaderPlugin):
|
|||||||
from calibre.ebooks.metadata.odt import get_metadata
|
from calibre.ebooks.metadata.odt import get_metadata
|
||||||
return get_metadata(stream)
|
return get_metadata(stream)
|
||||||
|
|
||||||
class TXTMetadataReader(MetaReaderPlugin):
|
class TXTMetadataReader(MetadataReaderPlugin):
|
||||||
|
|
||||||
name = 'Read TXT metadata'
|
name = 'Read TXT metadata'
|
||||||
file_types = set(['txt'])
|
file_types = set(['txt'])
|
||||||
@ -266,9 +266,10 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
|
|||||||
from calibre.ebooks.epub.input import EPUBInput
|
from calibre.ebooks.epub.input import EPUBInput
|
||||||
from calibre.ebooks.mobi.input import MOBIInput
|
from calibre.ebooks.mobi.input import MOBIInput
|
||||||
from calibre.ebooks.oeb.output import OEBOutput
|
from calibre.ebooks.oeb.output import OEBOutput
|
||||||
|
from calibre.ebooks.txt.output import TXTOutput
|
||||||
from calibre.customize.profiles import input_profiles, output_profiles
|
from calibre.customize.profiles import input_profiles, output_profiles
|
||||||
|
|
||||||
plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput]
|
plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput]
|
||||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||||
x.__name__.endswith('MetadataReader')]
|
x.__name__.endswith('MetadataReader')]
|
||||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||||
|
@ -195,7 +195,7 @@ OptionRecommendation(name='language',
|
|||||||
self.input_fmt = input_fmt
|
self.input_fmt = input_fmt
|
||||||
self.output_fmt = output_fmt
|
self.output_fmt = output_fmt
|
||||||
|
|
||||||
# Build set of all possible options. Two options are equal iff their
|
# Build set of all possible options. Two options are equal if and only if their
|
||||||
# names are the same.
|
# names are the same.
|
||||||
self.input_options = self.input_plugin.options.union(
|
self.input_options = self.input_plugin.options.union(
|
||||||
self.input_plugin.common_options)
|
self.input_plugin.common_options)
|
||||||
|
@ -22,7 +22,7 @@ def get_metadata(stream, extract_cover=True):
|
|||||||
else:
|
else:
|
||||||
mdata += line
|
mdata += line
|
||||||
|
|
||||||
mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*\n\n\n[ ]*(?P<author>.+)[ ]*\n$', mdata)
|
mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata)
|
||||||
if mo != None:
|
if mo != None:
|
||||||
mi.title = mo.group('title')
|
mi.title = mo.group('title')
|
||||||
mi.authors = mo.group('author').split(',')
|
mi.authors = mo.group('author').split(',')
|
||||||
|
@ -1,74 +0,0 @@
|
|||||||
'''
|
|
||||||
Convert any ebook format to TXT.
|
|
||||||
'''
|
|
||||||
|
|
||||||
from __future__ import with_statement
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \
|
|
||||||
'and Marshall T. Vandegrift <llasram@gmail.com>' \
|
|
||||||
'and John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
import sys, os, glob, logging
|
|
||||||
|
|
||||||
from calibre.ebooks.epub.from_any import any2epub, formats, USAGE
|
|
||||||
from calibre.ebooks.epub import config as common_config
|
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
|
||||||
from calibre.ebooks.txt.writer import oeb2txt, config as txt_config
|
|
||||||
|
|
||||||
def config(defaults=None):
|
|
||||||
c = common_config(defaults=defaults, name='txt')
|
|
||||||
c.remove_opt('profile')
|
|
||||||
del c.option_set.groups['metadata']
|
|
||||||
del c.option_set.groups['traversal']
|
|
||||||
del c.option_set.groups['structure detection']
|
|
||||||
del c.option_set.groups['toc']
|
|
||||||
del c.option_set.groups['page layout']
|
|
||||||
txtc = txt_config(defaults=defaults)
|
|
||||||
c.update(txtc)
|
|
||||||
return c
|
|
||||||
|
|
||||||
def option_parser(usage=USAGE):
|
|
||||||
usage = usage % ('TXT', formats())
|
|
||||||
parser = config().option_parser(usage=usage)
|
|
||||||
return parser
|
|
||||||
|
|
||||||
def any2txt(opts, path, notification=None):
|
|
||||||
ext = os.path.splitext(path)[1]
|
|
||||||
if not ext:
|
|
||||||
raise ValueError('Unknown file type: '+path)
|
|
||||||
ext = ext.lower()[1:]
|
|
||||||
|
|
||||||
if opts.output is None:
|
|
||||||
opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt'
|
|
||||||
|
|
||||||
opts.output = os.path.abspath(opts.output)
|
|
||||||
orig_output = opts.output
|
|
||||||
|
|
||||||
with TemporaryDirectory('_any2txt') as tdir:
|
|
||||||
oebdir = os.path.join(tdir, 'oeb')
|
|
||||||
os.mkdir(oebdir)
|
|
||||||
opts.output = os.path.join(tdir, 'dummy.epub')
|
|
||||||
opts.profile = 'None'
|
|
||||||
opts.dont_split_on_page_breaks = True
|
|
||||||
orig_bfs = opts.base_font_size2
|
|
||||||
opts.base_font_size2 = 0
|
|
||||||
any2epub(opts, path, create_epub=False, oeb_cover=False, extract_to=oebdir)
|
|
||||||
opts.base_font_size2 = orig_bfs
|
|
||||||
opf = glob.glob(os.path.join(oebdir, '*.opf'))[0]
|
|
||||||
opts.output = orig_output
|
|
||||||
logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...'))
|
|
||||||
oeb2txt(opts, opf)
|
|
||||||
|
|
||||||
def main(args=sys.argv):
|
|
||||||
parser = option_parser()
|
|
||||||
opts, args = parser.parse_args(args)
|
|
||||||
if len(args) < 2:
|
|
||||||
parser.print_help()
|
|
||||||
print 'No input file specified.'
|
|
||||||
return 1
|
|
||||||
any2txt(opts, args[1])
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
62
src/calibre/ebooks/txt/output.py
Normal file
62
src/calibre/ebooks/txt/output.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||||
|
OptionRecommendation
|
||||||
|
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata
|
||||||
|
from calibre.ebooks.metadata import authors_to_string
|
||||||
|
|
||||||
|
class TXTOutput(OutputFormatPlugin):
    '''
    Output plugin that writes the converted book as a plain text (TXT) file.

    The text itself is produced by TxtWriter from the book's spine. This
    class declares the TXT specific options and takes care of writing the
    result either to a filesystem path or to an already open stream.
    '''

    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'

    options = set([
        OptionRecommendation(name='newline', recommended_value='system',
            level=OptionRecommendation.LOW, long_switch='newline',
            short_switch='n', choices=TxtNewlines.NEWLINE_TYPES.keys(),
            # The list of newline types is interpolated AFTER the message has
            # been passed through _(). Formatting first would hand gettext an
            # already expanded string that can never match a catalog entry.
            help=_('Type of newline to use. Options are %s. Default is \'system\'. '
                'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
                'For Mac OS X use \'unix\'. \'system\' will default to the newline '
                'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys())),
        OptionRecommendation(name='prepend_author', recommended_value='true',
            level=OptionRecommendation.LOW, long_switch='prepend_author',
            choices=['true', 'false'],
            help=_('Write the author to the beginning of the file. '
                'Default is \'true\'. Use \'false\' to disable.')),
        OptionRecommendation(name='prepend_title', recommended_value='true',
            choices=['true', 'false'],
            level=OptionRecommendation.LOW, long_switch='prepend_title',
            help=_('Write the title to the beginning of the file. '
                'Default is \'true\'. Use \'false\' to disable.'))
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` as text and write it to ``output_path``.

        :param oeb_book: Book to convert. Its ``spine`` is handed to
            TxtWriter and its ``metadata`` supplies the default author and
            title.
        :param output_path: Either a filesystem path or an object with a
            ``write`` method. A path is opened (and closed) here, creating
            missing parent directories first. A stream is rewound, emptied
            and left open for the caller.
        :param input_plugin: Not used by this plugin.
        :param opts: Conversion options. ``prepend_author``, ``prepend_title``,
            ``authors``, ``title`` and ``newline`` are read.
        :param log: Logger passed on to TxtWriter.
        '''
        metadata = TxtMetadata()
        # Values given explicitly in the options win over the book's metadata.
        if opts.prepend_author.lower() == 'true':
            metadata.author = opts.authors if opts.authors else \
                authors_to_string(oeb_book.metadata.authors)
        if opts.prepend_title.lower() == 'true':
            metadata.title = opts.title if opts.title else \
                oeb_book.metadata.title

        writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
        txt = writer.dump(oeb_book.spine, metadata)

        # The writer returns unicode text but the output is a binary stream.
        # Encode explicitly so non-ASCII characters do not trigger an
        # implicit ASCII encode (and a UnicodeEncodeError) on write.
        if not isinstance(txt, bytes):
            txt = txt.encode('utf-8')

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            # Drop any previous content so a shorter result does not leave
            # stale bytes behind in a reused stream.
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            # Only close what was opened here; a caller supplied stream
            # stays open.
            if close:
                out_stream.close()
|
@ -1,34 +1,26 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import with_statement
|
||||||
'''
|
'''
|
||||||
Write content to TXT.
|
Write content to TXT.
|
||||||
'''
|
'''
|
||||||
from __future__ import with_statement
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, logging, re, sys
|
import os, re, sys
|
||||||
|
|
||||||
|
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
||||||
|
|
||||||
from BeautifulSoup import BeautifulSoup
|
from BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
from calibre import LoggingInterface
|
class TxtWriter(object):
|
||||||
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
def __init__(self, newline, log):
|
||||||
from calibre.ebooks.epub.iterator import SpineItem
|
|
||||||
from calibre.ebooks.metadata import authors_to_string
|
|
||||||
from calibre.ebooks.metadata.meta import metadata_from_formats
|
|
||||||
from calibre.ebooks.metadata.opf2 import OPF
|
|
||||||
from calibre.customize.ui import run_plugins_on_postprocess
|
|
||||||
from calibre.utils.config import Config, StringConfig
|
|
||||||
|
|
||||||
class TXTWriter(object):
|
|
||||||
def __init__(self, newline):
|
|
||||||
self.newline = newline
|
self.newline = newline
|
||||||
|
self.log = log
|
||||||
|
|
||||||
def dump(self, oebpath, path, metadata):
|
def dump(self, spine, metadata):
|
||||||
opf = OPF(oebpath, os.path.dirname(oebpath))
|
out = u''
|
||||||
spine = [SpineItem(i.path) for i in opf.spine]
|
|
||||||
|
|
||||||
tmpout = ''
|
|
||||||
for item in spine:
|
for item in spine:
|
||||||
with open(item, 'r') as itemf:
|
with open(item, 'r') as itemf:
|
||||||
content = itemf.read().decode(item.encoding)
|
content = itemf.read().decode(item.encoding)
|
||||||
@ -39,24 +31,20 @@ class TXTWriter(object):
|
|||||||
content = self.replace_html_symbols(content)
|
content = self.replace_html_symbols(content)
|
||||||
content = self.cleanup_text(content)
|
content = self.cleanup_text(content)
|
||||||
content = self.specified_newlines(content)
|
content = self.specified_newlines(content)
|
||||||
tmpout = tmpout + content
|
out += content
|
||||||
|
|
||||||
# Prepend metadata
|
# Prepend metadata
|
||||||
if metadata.author != None and metadata.author != '':
|
if metadata.author != None and metadata.author != '':
|
||||||
tmpout = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + tmpout
|
out = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + out
|
||||||
if metadata.title != None and metadata.title != '':
|
if metadata.title != None and metadata.title != '':
|
||||||
tmpout = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + tmpout
|
out = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + out
|
||||||
|
|
||||||
# Put two blank lines at end of file
|
# Put two blank lines at end of file
|
||||||
|
end = out[-3 * len(self.newline):]
|
||||||
end = tmpout[-3 * len(self.newline):]
|
|
||||||
for i in range(3 - end.count(self.newline)):
|
for i in range(3 - end.count(self.newline)):
|
||||||
tmpout = tmpout + self.newline
|
out += self.newline
|
||||||
|
|
||||||
if os.path.exists(path):
|
return out
|
||||||
os.remove(path)
|
|
||||||
with open(path, 'w+b') as out:
|
|
||||||
out.write(tmpout.encode('utf-8'))
|
|
||||||
|
|
||||||
def strip_html(self, html):
|
def strip_html(self, html):
|
||||||
stripped = u''
|
stripped = u''
|
||||||
@ -151,12 +139,6 @@ class TXTWriter(object):
|
|||||||
|
|
||||||
return text.replace('\n', self.newline)
|
return text.replace('\n', self.newline)
|
||||||
|
|
||||||
class TxtMetadata(object):
|
|
||||||
def __init__(self):
|
|
||||||
self.author = None
|
|
||||||
self.title = None
|
|
||||||
self.series = None
|
|
||||||
|
|
||||||
|
|
||||||
class TxtNewlines(object):
|
class TxtNewlines(object):
|
||||||
NEWLINE_TYPES = {
|
NEWLINE_TYPES = {
|
||||||
@ -170,73 +152,7 @@ class TxtNewlines(object):
|
|||||||
self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)
|
self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)
|
||||||
|
|
||||||
|
|
||||||
def config(defaults=None):
|
class TxtMetadata(object):
|
||||||
desc = _('Options to control the conversion to TXT')
|
def __init__(self):
|
||||||
if defaults is None:
|
self.title = None
|
||||||
c = Config('txt', desc)
|
self.author = None
|
||||||
else:
|
|
||||||
c = StringConfig(defaults, desc)
|
|
||||||
|
|
||||||
txt = c.add_group('TXT', _('TXT options.'))
|
|
||||||
|
|
||||||
txt('newline', ['--newline'], default='system',
|
|
||||||
help=_('Type of newline to use. Options are %s. Default is \'system\'. '
|
|
||||||
'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
|
|
||||||
'For Mac OS X use \'unix\'. \'system\' will default to the newline '
|
|
||||||
'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys())))
|
|
||||||
txt('prepend_author', ['--prepend-author'], default='true',
|
|
||||||
help=_('Write the author to the beginning of the file. '
|
|
||||||
'Default is \'true\'. Use \'false\' to disable.'))
|
|
||||||
txt('prepend_title', ['--prepend-title'], default='true',
|
|
||||||
help=_('Write the title to the beginning of the file. '
|
|
||||||
'Default is \'true\'. Use \'false\' to disable.'))
|
|
||||||
|
|
||||||
return c
|
|
||||||
|
|
||||||
def option_parser():
|
|
||||||
c = config()
|
|
||||||
parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
|
|
||||||
parser.add_option(
|
|
||||||
'-o', '--output', default=None,
|
|
||||||
help=_('Output file. Default is derived from input filename.'))
|
|
||||||
parser.add_option(
|
|
||||||
'-v', '--verbose', default=0, action='count',
|
|
||||||
help=_('Useful for debugging.'))
|
|
||||||
return parser
|
|
||||||
|
|
||||||
def oeb2txt(opts, inpath):
|
|
||||||
logger = LoggingInterface(logging.getLogger('oeb2txt'))
|
|
||||||
logger.setup_cli_handler(opts.verbose)
|
|
||||||
|
|
||||||
outpath = opts.output
|
|
||||||
if outpath is None:
|
|
||||||
outpath = os.path.basename(inpath)
|
|
||||||
outpath = os.path.splitext(outpath)[0] + '.txt'
|
|
||||||
|
|
||||||
mi = metadata_from_formats([inpath])
|
|
||||||
metadata = TxtMetadata()
|
|
||||||
if opts.prepend_author.lower() == 'true':
|
|
||||||
metadata.author = opts.authors if opts.authors else authors_to_string(mi.authors)
|
|
||||||
if opts.prepend_title.lower() == 'true':
|
|
||||||
metadata.title = opts.title if opts.title else mi.title
|
|
||||||
|
|
||||||
newline = TxtNewlines(opts.newline)
|
|
||||||
|
|
||||||
writer = TXTWriter(newline.newline)
|
|
||||||
writer.dump(inpath, outpath, metadata)
|
|
||||||
run_plugins_on_postprocess(outpath, 'txt')
|
|
||||||
logger.log_info(_('Output written to ') + outpath)
|
|
||||||
|
|
||||||
def main(argv=sys.argv):
|
|
||||||
parser = option_parser()
|
|
||||||
opts, args = parser.parse_args(argv[1:])
|
|
||||||
if len(args) != 1:
|
|
||||||
parser.print_help()
|
|
||||||
return 1
|
|
||||||
inpath = args[0]
|
|
||||||
retval = oeb2txt(opts, inpath)
|
|
||||||
return retval
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user