TXT Input: Attempt to detect the input encoding when not specified. TCR, PDB Input: Use the TXT Input conversion plugin for conversion, which adds encoding detection and allows all of the TXT Input options to be used (eReader PDB ignores options that do not apply to it).

This commit is contained in:
John Schember 2011-01-01 23:03:58 -05:00
parent 38a82b049d
commit 47aeaf10b6
5 changed files with 67 additions and 79 deletions

View File

@ -22,13 +22,23 @@ class PDBInput(InputFormatPlugin):
OptionRecommendation(name='single_line_paras', recommended_value=False, OptionRecommendation(name='single_line_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. ' help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line represents ' 'With this option it will assume that every line represents '
'a paragraph instead.')), 'a paragraph instead. This option is ignored by eReader format.')),
OptionRecommendation(name='print_formatted_paras', recommended_value=False, OptionRecommendation(name='print_formatted_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. ' help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line starting with ' 'With this option it will assume that every line starting with '
'an indent (either a tab or 2+ spaces) represents a paragraph. ' 'an indent (either a tab or 2+ spaces) represents a paragraph. '
'Paragraphs end when the next line that starts with an indent ' 'Paragraphs end when the next line that starts with an indent '
'is reached.')), 'is reached. This option is ignored by eReader format.')),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed. This option '
'is ignored by eReader format.')),
OptionRecommendation(name='markdown', recommended_value=False,
help=_('Run the text input through the markdown pre-processor. To '
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text. '
'This option is ignored by eReader format.')),
]) ])
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,

View File

@ -11,9 +11,9 @@ __docformat__ = 'restructuredtext en'
import os import os
import struct import struct
from cStringIO import StringIO
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted
class HeaderRecord(object): class HeaderRecord(object):
''' '''
@ -33,9 +33,7 @@ class Reader(FormatReader):
def __init__(self, header, stream, log, options): def __init__(self, header, stream, log, options):
self.stream = stream self.stream = stream
self.log = log self.log = log
self.encoding = options.input_encoding self.options = options
self.single_line_paras = options.single_line_paras
self.print_formatted_paras = options.print_formatted_paras
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
@ -48,34 +46,23 @@ class Reader(FormatReader):
def decompress_text(self, number): def decompress_text(self, number):
if self.header_record.compression == 1: if self.header_record.compression == 1:
return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding) return self.section_data(number)
if self.header_record.compression == 2 or self.header_record.compression == 258: if self.header_record.compression == 2 or self.header_record.compression == 258:
from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.compression.palmdoc import decompress_doc
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') return decompress_doc(self.section_data(number))
return '' return ''
def extract_content(self, output_dir): def extract_content(self, output_dir):
txt = '' raw_txt = ''
self.log.info('Decompressing text...') self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1): for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i) self.log.debug('\tDecompressing text section %i' % i)
txt += self.decompress_text(i) raw_txt += self.decompress_text(i)
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
if self.single_line_paras: stream = StringIO(raw_txt)
txt = separate_paragraphs_single_line(txt) from calibre.customize.ui import plugin_for_input_format
if self.print_formatted_paras: stream.seek(0)
txt = separate_paragraphs_print_formatted(txt) return plugin_for_input_format('txt').convert(stream, self.options,
html = convert_basic(txt) 'txt', self.log, {})
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(self.stream, 'pdb')
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf')

View File

@ -8,12 +8,13 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, struct, zlib import struct
import zlib
from cStringIO import StringIO
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ztxt import zTXTError from calibre.ebooks.pdb.ztxt import zTXTError
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted
SUPPORTED_VERSION = (1, 40) SUPPORTED_VERSION = (1, 40)
@ -38,9 +39,7 @@ class Reader(FormatReader):
def __init__(self, header, stream, log, options): def __init__(self, header, stream, log, options):
self.stream = stream self.stream = stream
self.log = log self.log = log
self.encoding = options.input_encoding self.options = options
self.single_line_paras = options.single_line_paras
self.print_formatted_paras = options.print_formatted_paras
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
@ -68,30 +67,19 @@ class Reader(FormatReader):
def decompress_text(self, number): def decompress_text(self, number):
if number == 1: if number == 1:
self.uncompressor = zlib.decompressobj() self.uncompressor = zlib.decompressobj()
return self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') return self.uncompressor.decompress(self.section_data(number))
def extract_content(self, output_dir): def extract_content(self, output_dir):
txt = '' raw_txt = ''
self.log.info('Decompressing text...') self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1): for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i) self.log.debug('\tDecompressing text section %i' % i)
txt += self.decompress_text(i) raw_txt += self.decompress_text(i)
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
if self.single_line_paras: stream = StringIO(raw_txt)
txt = separate_paragraphs_single_line(txt) from calibre.customize.ui import plugin_for_input_format
if self.print_formatted_paras: stream.seek(0)
txt = separate_paragraphs_print_formatted(txt) return plugin_for_input_format('txt').convert(stream, self.options,
html = convert_basic(txt) 'txt', self.log, {})
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(self.stream, 'pdb')
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf')

View File

@ -4,11 +4,9 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os from cStringIO import StringIO
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted
from calibre.ebooks.compression.tcr import decompress from calibre.ebooks.compression.tcr import decompress
class TCRInput(InputFormatPlugin): class TCRInput(InputFormatPlugin):
@ -29,26 +27,23 @@ class TCRInput(InputFormatPlugin):
'an indent (either a tab or 2+ spaces) represents a paragraph. ' 'an indent (either a tab or 2+ spaces) represents a paragraph. '
'Paragraphs end when the next line that starts with an indent ' 'Paragraphs end when the next line that starts with an indent '
'is reached.')), 'is reached.')),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
OptionRecommendation(name='markdown', recommended_value=False,
help=_('Run the text input through the markdown pre-processor. To '
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')),
]) ])
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
log.info('Decompressing text...') log.info('Decompressing text...')
ienc = options.input_encoding if options.input_encoding else 'utf-8' raw_txt = decompress(stream)
txt = decompress(stream).decode(ienc, 'replace')
log.info('Converting text to OEB...') log.info('Converting text to OEB...')
if options.single_line_paras: stream = StringIO(raw_txt)
txt = separate_paragraphs_single_line(txt) from calibre.customize.ui import plugin_for_input_format
if options.print_formatted_paras: stream.seek(0)
txt = separate_paragraphs_print_formatted(txt) return plugin_for_input_format('txt').convert(stream, options,
html = convert_basic(txt) 'txt', log, accelerators)
with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(stream, 'tcr')
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi)
return os.path.join(os.getcwd(), 'metadata.opf')

View File

@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces preserve_spaces
@ -42,11 +43,19 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
ienc = stream.encoding if stream.encoding else 'utf-8' log.debug('Reading text from file...')
txt = stream.read()
if options.input_encoding: if options.input_encoding:
ienc = options.input_encoding ienc = options.input_encoding
log.debug('Reading text from file...') log.debug('Using user specified input encoding of %s' % ienc)
txt = stream.read().decode(ienc, 'replace') else:
ienc = detect(txt)['encoding']
log.debug('Detected input encoding as %s' % ienc)
if not ienc:
ienc = 'utf-8'
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
txt = txt.decode(ienc, 'replace')
# Adjust paragraph formatting as requested # Adjust paragraph formatting as requested
if options.single_line_paras: if options.single_line_paras:
@ -85,11 +94,10 @@ class TXTInput(InputFormatPlugin):
htmlfile = open(fname, 'wb') htmlfile = open(fname, 'wb')
with htmlfile: with htmlfile:
htmlfile.write(html.encode('utf-8')) htmlfile.write(html.encode('utf-8'))
cwd = os.getcwdu()
odi = options.debug_pipeline odi = options.debug_pipeline
options.debug_pipeline = None options.debug_pipeline = None
oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log, oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
{}, cwd) {})
options.debug_pipeline = odi options.debug_pipeline = odi
os.remove(htmlfile.name) os.remove(htmlfile.name)
return oeb return oeb