mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Attempt to detect the input encoding when not specified. TCR, PDB Input: Use the TXT Input conversion plugin for conversion, which adds encoding detection and allows all of the TXT Input options to be used (eReader PDB ignores options that do not apply to it).
This commit is contained in:
parent
38a82b049d
commit
47aeaf10b6
@ -22,13 +22,23 @@ class PDBInput(InputFormatPlugin):
|
||||
OptionRecommendation(name='single_line_paras', recommended_value=False,
|
||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
||||
'With this option it will assume that every line represents '
|
||||
'a paragraph instead.')),
|
||||
'a paragraph instead. This option is ignored by eReader format.')),
|
||||
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
|
||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
||||
'With this option it will assume that every line starting with '
|
||||
'an indent (either a tab or 2+ spaces) represents a paragraph. '
|
||||
'Paragraphs end when the next line that starts with an indent '
|
||||
'is reached.')),
|
||||
'is reached. This option is ignored by eReader format.')),
|
||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||
help=_('Normally extra spaces are condensed into a single space. '
|
||||
'With this option all spaces will be displayed. This option '
|
||||
'is ignored by eReader format.')),
|
||||
OptionRecommendation(name='markdown', recommended_value=False,
|
||||
help=_('Run the text input through the markdown pre-processor. To '
|
||||
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
||||
help=_('Do not insert a Table of Contents into the output text. '
|
||||
'This option is ignored by eReader format.')),
|
||||
])
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
|
@ -11,9 +11,9 @@ __docformat__ = 'restructuredtext en'
|
||||
import os
|
||||
import struct
|
||||
|
||||
from cStringIO import StringIO
|
||||
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted
|
||||
|
||||
class HeaderRecord(object):
|
||||
'''
|
||||
@ -33,9 +33,7 @@ class Reader(FormatReader):
|
||||
def __init__(self, header, stream, log, options):
|
||||
self.stream = stream
|
||||
self.log = log
|
||||
self.encoding = options.input_encoding
|
||||
self.single_line_paras = options.single_line_paras
|
||||
self.print_formatted_paras = options.print_formatted_paras
|
||||
self.options = options
|
||||
|
||||
self.sections = []
|
||||
for i in range(header.num_sections):
|
||||
@ -48,34 +46,23 @@ class Reader(FormatReader):
|
||||
|
||||
def decompress_text(self, number):
|
||||
if self.header_record.compression == 1:
|
||||
return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding)
|
||||
return self.section_data(number)
|
||||
if self.header_record.compression == 2 or self.header_record.compression == 258:
|
||||
from calibre.ebooks.compression.palmdoc import decompress_doc
|
||||
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
|
||||
return decompress_doc(self.section_data(number))
|
||||
return ''
|
||||
|
||||
def extract_content(self, output_dir):
|
||||
txt = ''
|
||||
raw_txt = ''
|
||||
|
||||
self.log.info('Decompressing text...')
|
||||
for i in range(1, self.header_record.num_records + 1):
|
||||
self.log.debug('\tDecompressing text section %i' % i)
|
||||
txt += self.decompress_text(i)
|
||||
raw_txt += self.decompress_text(i)
|
||||
|
||||
self.log.info('Converting text to OEB...')
|
||||
if self.single_line_paras:
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
if self.print_formatted_paras:
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
html = convert_basic(txt)
|
||||
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
||||
index.write(html.encode('utf-8'))
|
||||
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
mi = get_metadata(self.stream, 'pdb')
|
||||
manifest = [('index.html', None)]
|
||||
spine = ['index.html']
|
||||
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
|
||||
|
||||
return os.path.join(output_dir, 'metadata.opf')
|
||||
|
||||
stream = StringIO(raw_txt)
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
stream.seek(0)
|
||||
return plugin_for_input_format('txt').convert(stream, self.options,
|
||||
'txt', self.log, {})
|
||||
|
@ -8,12 +8,13 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, struct, zlib
|
||||
import struct
|
||||
import zlib
|
||||
|
||||
from cStringIO import StringIO
|
||||
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.pdb.ztxt import zTXTError
|
||||
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted
|
||||
|
||||
SUPPORTED_VERSION = (1, 40)
|
||||
|
||||
@ -38,9 +39,7 @@ class Reader(FormatReader):
|
||||
def __init__(self, header, stream, log, options):
|
||||
self.stream = stream
|
||||
self.log = log
|
||||
self.encoding = options.input_encoding
|
||||
self.single_line_paras = options.single_line_paras
|
||||
self.print_formatted_paras = options.print_formatted_paras
|
||||
self.options = options
|
||||
|
||||
self.sections = []
|
||||
for i in range(header.num_sections):
|
||||
@ -68,30 +67,19 @@ class Reader(FormatReader):
|
||||
def decompress_text(self, number):
|
||||
if number == 1:
|
||||
self.uncompressor = zlib.decompressobj()
|
||||
return self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
|
||||
return self.uncompressor.decompress(self.section_data(number))
|
||||
|
||||
def extract_content(self, output_dir):
|
||||
txt = ''
|
||||
raw_txt = ''
|
||||
|
||||
self.log.info('Decompressing text...')
|
||||
for i in range(1, self.header_record.num_records + 1):
|
||||
self.log.debug('\tDecompressing text section %i' % i)
|
||||
txt += self.decompress_text(i)
|
||||
|
||||
raw_txt += self.decompress_text(i)
|
||||
|
||||
self.log.info('Converting text to OEB...')
|
||||
if self.single_line_paras:
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
if self.print_formatted_paras:
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
html = convert_basic(txt)
|
||||
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
||||
index.write(html.encode('utf-8'))
|
||||
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
mi = get_metadata(self.stream, 'pdb')
|
||||
manifest = [('index.html', None)]
|
||||
spine = ['index.html']
|
||||
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
|
||||
|
||||
return os.path.join(output_dir, 'metadata.opf')
|
||||
|
||||
stream = StringIO(raw_txt)
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
stream.seek(0)
|
||||
return plugin_for_input_format('txt').convert(stream, self.options,
|
||||
'txt', self.log, {})
|
||||
|
@ -4,11 +4,9 @@ __license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
from cStringIO import StringIO
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted
|
||||
from calibre.ebooks.compression.tcr import decompress
|
||||
|
||||
class TCRInput(InputFormatPlugin):
|
||||
@ -29,26 +27,23 @@ class TCRInput(InputFormatPlugin):
|
||||
'an indent (either a tab or 2+ spaces) represents a paragraph. '
|
||||
'Paragraphs end when the next line that starts with an indent '
|
||||
'is reached.')),
|
||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||
help=_('Normally extra spaces are condensed into a single space. '
|
||||
'With this option all spaces will be displayed.')),
|
||||
OptionRecommendation(name='markdown', recommended_value=False,
|
||||
help=_('Run the text input through the markdown pre-processor. To '
|
||||
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
||||
help=_('Do not insert a Table of Contents into the output text.')),
|
||||
])
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
log.info('Decompressing text...')
|
||||
ienc = options.input_encoding if options.input_encoding else 'utf-8'
|
||||
txt = decompress(stream).decode(ienc, 'replace')
|
||||
raw_txt = decompress(stream)
|
||||
|
||||
log.info('Converting text to OEB...')
|
||||
if options.single_line_paras:
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
if options.print_formatted_paras:
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
html = convert_basic(txt)
|
||||
with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index:
|
||||
index.write(html.encode('utf-8'))
|
||||
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
mi = get_metadata(stream, 'tcr')
|
||||
manifest = [('index.html', None)]
|
||||
spine = ['index.html']
|
||||
opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi)
|
||||
|
||||
return os.path.join(os.getcwd(), 'metadata.opf')
|
||||
stream = StringIO(raw_txt)
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
stream.seek(0)
|
||||
return plugin_for_input_format('txt').convert(stream, options,
|
||||
'txt', log, accelerators)
|
||||
|
@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.ebooks.chardet import detect
|
||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||
preserve_spaces
|
||||
@ -42,11 +43,19 @@ class TXTInput(InputFormatPlugin):
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
ienc = stream.encoding if stream.encoding else 'utf-8'
|
||||
log.debug('Reading text from file...')
|
||||
|
||||
txt = stream.read()
|
||||
if options.input_encoding:
|
||||
ienc = options.input_encoding
|
||||
log.debug('Reading text from file...')
|
||||
txt = stream.read().decode(ienc, 'replace')
|
||||
log.debug('Using user specified input encoding of %s' % ienc)
|
||||
else:
|
||||
ienc = detect(txt)['encoding']
|
||||
log.debug('Detected input encoding as %s' % ienc)
|
||||
if not ienc:
|
||||
ienc = 'utf-8'
|
||||
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
|
||||
txt = txt.decode(ienc, 'replace')
|
||||
|
||||
# Adjust paragraph formatting as requested
|
||||
if options.single_line_paras:
|
||||
@ -85,11 +94,10 @@ class TXTInput(InputFormatPlugin):
|
||||
htmlfile = open(fname, 'wb')
|
||||
with htmlfile:
|
||||
htmlfile.write(html.encode('utf-8'))
|
||||
cwd = os.getcwdu()
|
||||
odi = options.debug_pipeline
|
||||
options.debug_pipeline = None
|
||||
oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log,
|
||||
{}, cwd)
|
||||
oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
|
||||
{})
|
||||
options.debug_pipeline = odi
|
||||
os.remove(htmlfile.name)
|
||||
return oeb
|
||||
|
Loading…
x
Reference in New Issue
Block a user