Implement #3418 (Break long lines in txt files)

This commit is contained in:
Kovid Goyal 2009-09-09 19:43:09 -06:00
parent 02e6f7fbb4
commit 63a37da4ea
9 changed files with 62 additions and 23 deletions

View File

@ -22,6 +22,12 @@ class PDBInput(InputFormatPlugin):
help=_('Normally calibre treats blank lines as paragraph markers. ' help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line represents ' 'With this option it will assume that every line represents '
'a paragraph instead.')), 'a paragraph instead.')),
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line starting with '
'an indent (either a tab or 2+ spaces) represents a paragraph.'
'Paragraphs end when the next line that starts with an indent '
'is reached.')),
]) ])
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,

View File

@ -13,8 +13,8 @@ import struct
from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \ from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
opf_writer separate_paragraphs_single_line, separate_paragraphs_print_formatted
class HeaderRecord(object): class HeaderRecord(object):
''' '''
@ -36,6 +36,7 @@ class Reader(FormatReader):
self.log = log self.log = log
self.encoding = options.input_encoding self.encoding = options.input_encoding
self.single_line_paras = options.single_line_paras self.single_line_paras = options.single_line_paras
self.print_formatted_paras = options.print_formatted_paras
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
@ -63,7 +64,9 @@ class Reader(FormatReader):
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
if self.single_line_paras: if self.single_line_paras:
txt = separate_paragraphs(txt) txt = separate_paragraphs_single_line(txt)
if self.print_formatted_paras:
txt = separate_paragraphs_print_formatted(txt)
html = convert_basic(txt) html = convert_basic(txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index: with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8')) index.write(html.encode('utf-8'))

View File

@ -12,8 +12,8 @@ import os, struct, zlib
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ztxt import zTXTError from calibre.ebooks.pdb.ztxt import zTXTError
from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \ from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
opf_writer separate_paragraphs_single_line, separate_paragraphs_print_formatted
SUPPORTED_VERSION = (1, 40) SUPPORTED_VERSION = (1, 40)
@ -31,22 +31,23 @@ class HeaderRecord(object):
self.size, = struct.unpack('>L', raw[4:8]) self.size, = struct.unpack('>L', raw[4:8])
self.record_size, = struct.unpack('>H', raw[8:10]) self.record_size, = struct.unpack('>H', raw[8:10])
self.flags, = struct.unpack('>B', raw[18:19]) self.flags, = struct.unpack('>B', raw[18:19])
class Reader(FormatReader): class Reader(FormatReader):
def __init__(self, header, stream, log, options): def __init__(self, header, stream, log, options):
self.stream = stream self.stream = stream
self.log = log self.log = log
self.encoding = options.input_encoding self.encoding = options.input_encoding
self.single_line_paras = options.single_line_paras self.single_line_paras = options.single_line_paras
self.print_formatted_paras = options.print_formatted_paras
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
self.sections.append(header.section_data(i)) self.sections.append(header.section_data(i))
self.header_record = HeaderRecord(self.section_data(0)) self.header_record = HeaderRecord(self.section_data(0))
vmajor = (self.header_record.version & 0x0000FF00) >> 8 vmajor = (self.header_record.version & 0x0000FF00) >> 8
vminor = self.header_record.version & 0x000000FF vminor = self.header_record.version & 0x000000FF
if vmajor < 1 or (vmajor == 1 and vminor < 40): if vmajor < 1 or (vmajor == 1 and vminor < 40):
@ -71,7 +72,7 @@ class Reader(FormatReader):
def extract_content(self, output_dir): def extract_content(self, output_dir):
txt = '' txt = ''
self.log.info('Decompressing text...') self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1): for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i) self.log.debug('\tDecompressing text section %i' % i)
@ -79,16 +80,18 @@ class Reader(FormatReader):
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
if self.single_line_paras: if self.single_line_paras:
txt = separate_paragraphs(txt) txt = separate_paragraphs_single_line(txt)
if self.print_formatted_paras:
txt = separate_paragraphs_print_formatted(txt)
html = convert_basic(txt) html = convert_basic(txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index: with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8')) index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(self.stream, 'pdb') mi = get_metadata(self.stream, 'pdb')
manifest = [('index.html', None)] manifest = [('index.html', None)]
spine = ['index.html'] spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf') return os.path.join(output_dir, 'metadata.opf')

View File

@ -8,7 +8,7 @@ import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs separate_paragraphs_single_line, separate_paragraphs_print_formatted
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -22,6 +22,12 @@ class TXTInput(InputFormatPlugin):
help=_('Normally calibre treats blank lines as paragraph markers. ' help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line represents ' 'With this option it will assume that every line represents '
'a paragraph instead.')), 'a paragraph instead.')),
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line starting with '
'an indent (either a tab or 2+ spaces) represents a paragraph.'
'Paragraphs end when the next line that starts with an indent '
'is reached.')),
OptionRecommendation(name='markdown', recommended_value=False, OptionRecommendation(name='markdown', recommended_value=False,
help=_('Run the text input through the markdown pre-processor. To ' help=_('Run the text input through the markdown pre-processor. To '
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
@ -35,8 +41,11 @@ class TXTInput(InputFormatPlugin):
log.debug('Reading text from file...') log.debug('Reading text from file...')
txt = stream.read().decode(ienc, 'replace') txt = stream.read().decode(ienc, 'replace')
# Adjust paragraph formatting as requested
if options.single_line_paras: if options.single_line_paras:
txt = separate_paragraphs(txt) txt = separate_paragraphs_single_line(txt)
if options.print_formatted_paras:
txt = separate_paragraphs_print_formatted(txt)
if options.markdown: if options.markdown:
log.debug('Running text though markdown conversion...') log.debug('Running text though markdown conversion...')

View File

@ -45,12 +45,16 @@ def convert_markdown(txt, title=''):
safe_mode=False,) safe_mode=False,)
return HTML_TEMPLATE % (title, md.convert(txt)) return HTML_TEMPLATE % (title, md.convert(txt))
def separate_paragraphs(txt): def separate_paragraphs_single_line(txt):
txt = txt.replace('\r\n', '\n') txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n') txt = txt.replace('\r', '\n')
txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt) txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt)
return txt return txt
def separate_paragraphs_print_formatted(txt):
    '''
    Mark paragraph starts in text that uses print-style indentation.

    A line that begins with an indent (one or more tabs, or two or more
    spaces) and has at least one more character after that indent is
    treated as the first line of a paragraph. Its indent is replaced by a
    line break followed by a single tab, so the line ends up preceded by an
    extra line break. Lines that do not start with such an indent are left
    unchanged.

    Line endings are normalised to LF before matching, so text using CRLF
    or bare CR line endings is handled the same way as LF text.

    :param txt: The text to process.
    :return: The text with a line break and tab inserted in place of each
             paragraph-starting indent.
    '''
    # Normalise line endings first: in multiline mode ``^`` only matches
    # after LF, so indented lines in CR-only text would never be detected,
    # and with CRLF the lookahead would accept the stray CR as line content.
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    # (?=.) requires something after the indent on the same line, so lines
    # consisting of exactly the minimal indent are not turned into
    # paragraph starts.
    return re.sub(r'(?mu)^(?:\t+|[ ]{2,})(?=.)', '\n\t', txt)
def opf_writer(path, opf_name, manifest, spine, mi): def opf_writer(path, opf_name, manifest, spine, mi):
opf = OPFCreator(path, mi) opf = OPFCreator(path, mi)
opf.create_manifest(manifest) opf.create_manifest(manifest)

View File

@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, 'pdb_input', Widget.__init__(self, parent, 'pdb_input',
['single_line_paras']) ['single_line_paras', 'print_formatted_paras'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -14,7 +14,7 @@
<string>Form</string> <string>Form</string>
</property> </property>
<layout class="QGridLayout" name="gridLayout"> <layout class="QGridLayout" name="gridLayout">
<item row="1" column="0"> <item row="2" column="0">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -34,6 +34,13 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="1" column="0">
<widget class="QCheckBox" name="opt_print_formatted_paras">
<property name="text">
<string>Assume print formatting</string>
</property>
</widget>
</item>
</layout> </layout>
</widget> </widget>
<resources/> <resources/>

View File

@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, 'txt_input', Widget.__init__(self, parent, 'txt_input',
['single_line_paras', 'markdown']) ['single_line_paras', 'print_formatted_paras', 'markdown'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -14,7 +14,7 @@
<string>Form</string> <string>Form</string>
</property> </property>
<layout class="QGridLayout" name="gridLayout"> <layout class="QGridLayout" name="gridLayout">
<item row="3" column="0"> <item row="4" column="0">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -34,14 +34,14 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="1" column="0"> <item row="2" column="0">
<widget class="QCheckBox" name="opt_markdown"> <widget class="QCheckBox" name="opt_markdown">
<property name="text"> <property name="text">
<string>Process using markdown</string> <string>Process using markdown</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="2" column="0"> <item row="3" column="0">
<widget class="QLabel" name="label"> <widget class="QLabel" name="label">
<property name="text"> <property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string> <string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
@ -51,6 +51,13 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="1" column="0">
<widget class="QCheckBox" name="opt_print_formatted_paras">
<property name="text">
<string>Assume print formatting</string>
</property>
</widget>
</item>
</layout> </layout>
</widget> </widget>
<resources/> <resources/>