TXT Input: Split paragraph and formatting into two different options.

This commit is contained in:
John Schember 2011-01-05 20:03:49 -05:00
parent 3bb40c9911
commit dea9ae6832
8 changed files with 126 additions and 81 deletions

View File

@ -19,15 +19,20 @@ class PDBInput(InputFormatPlugin):
file_types = set(['pdb'])
options = set([
OptionRecommendation(name='paragraph_format', recommended_value='auto',
choices=['auto', 'block', 'single', 'print', 'markdown'],
help=_('How calibre splits text into paragraphs.\n'
OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=['auto', 'block', 'single', 'print'],
help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'* auto: Try to auto detect paragraph format.\n'
'* auto: Try to auto detect paragraph type.\n'
'* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.\n'
'starts a paragraph.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,

View File

@ -17,15 +17,20 @@ class TCRInput(InputFormatPlugin):
file_types = set(['tcr'])
options = set([
OptionRecommendation(name='paragraph_format', recommended_value='auto',
choices=['auto', 'block', 'single', 'print', 'markdown'],
help=_('How calibre splits text into paragraphs.\n'
OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=['auto', 'block', 'single', 'print'],
help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'* auto: Try to auto detect paragraph format.\n'
'* auto: Try to auto detect paragraph type.\n'
'* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.\n'
'starts a paragraph.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,

View File

@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_formatting
preserve_spaces, detect_paragraph_type, detect_formatting_type
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@ -21,15 +21,20 @@ class TXTInput(InputFormatPlugin):
file_types = set(['txt'])
options = set([
OptionRecommendation(name='paragraph_format', recommended_value='auto',
choices=['auto', 'block', 'single', 'print', 'markdown'],
help=_('How calibre splits text into paragraphs.\n'
OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=['auto', 'block', 'single', 'print'],
help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'* auto: Try to auto detect paragraph format.\n'
'* auto: Try to auto detect paragraph type.\n'
'* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.\n'
'starts a paragraph.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
@ -57,29 +62,16 @@ class TXTInput(InputFormatPlugin):
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
txt = txt.decode(ienc, 'replace')
# Determine the formatting of the document.
if options.paragraph_format == 'auto':
options.paragraph_format = detect_paragraph_formatting(txt)
if options.paragraph_format == 'unknown':
log.debug('Could not reliably determine paragraph format using block format')
options.paragraph_format = 'block'
else:
log.debug('Auto detected paragraph format as %s' % options.paragraph_format)
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
if options.paragraph_format == 'single':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_format == 'print':
txt = separate_paragraphs_print_formatted(txt)
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
# Preserve spaces will replace multiple spaces to a space
# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
if options.paragraph_format == 'markdown':
if options.formatting_type == 'markdown':
log.debug('Running text though markdown conversion...')
try:
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
@ -87,6 +79,22 @@ class TXTInput(InputFormatPlugin):
raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
else:
# Determine the paragraph type of the document.
if options.paragraph_type == 'auto':
options.paragraph_type = detect_paragraph_type(txt)
if options.paragraph_type == 'unknown':
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# We don't check for block because the processor assumes block.
# single and print are transformed to block for processing.
if options.paragraph_type == 'single':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size)

View File

@ -93,7 +93,7 @@ def split_string_separator(txt, size) :
xrange(0, len(txt), size)])
return txt
def detect_paragraph_formatting(txt):
def detect_paragraph_type(txt):
'''
Tries to determine the formatting of the document.
@ -109,6 +109,20 @@ def detect_paragraph_formatting(txt):
txt = txt.replace('\r', '\n')
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
# Check for print
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
if tab_line_count / float(txt_line_count) >= .25:
return 'print'
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .25:
return 'block'
# Nothing else matched, so assume single.
return 'single'
def detect_formatting_type(txt):
# Check for markdown
# Headings
if len(re.findall('(?mu)^#+', txt)) >= 5:
@ -129,16 +143,4 @@ def detect_paragraph_formatting(txt):
if txt.count('\\'+c) > 10:
return 'markdown'
# Check for print
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
if tab_line_count / float(txt_line_count) >= .25:
return 'print'
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .25:
return 'block'
# Nothing else matched to assume single.
return 'single'
return 'none'

View File

@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id
for x in get_option('paragraph_format').option.choices:
self.opt_paragraph_format.addItem(x)
for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_type.addItem(x)
for x in get_option('formatting_type').option.choices:
self.opt_formatting_type.addItem(x)
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id
for x in get_option('paragraph_format').option.choices:
self.opt_paragraph_format.addItem(x)
for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_type.addItem(x)
for x in get_option('formatting_type').option.choices:
self.opt_formatting_type.addItem(x)
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id
for x in get_option('paragraph_format').option.choices:
self.opt_paragraph_format.addItem(x)
for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_type.addItem(x)
for x in get_option('formatting_type').option.choices:
self.opt_formatting_type.addItem(x)
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -6,7 +6,7 @@
<rect>
<x>0</x>
<y>0</y>
<width>488</width>
<width>518</width>
<height>300</height>
</rect>
</property>
@ -17,41 +17,21 @@
<item row="0" column="0">
<widget class="QLabel" name="label_2">
<property name="text">
<string>Document structure detection</string>
<string>Paragraph style:</string>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QComboBox" name="opt_paragraph_format"/>
<widget class="QComboBox" name="opt_paragraph_type"/>
</item>
<item row="1" column="0" colspan="2">
<widget class="QLabel" name="label">
<property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
<item row="2" column="0" colspan="2">
<widget class="QCheckBox" name="opt_markdown_disable_toc">
<property name="text">
<string>Do not insert Table of Contents into output text when using markdown</string>
</property>
</widget>
</item>
<item row="3" column="0" colspan="2">
<item row="5" column="0" colspan="2">
<widget class="QCheckBox" name="opt_preserve_spaces">
<property name="text">
<string>Preserve &amp;spaces</string>
</property>
</widget>
</item>
<item row="4" column="0" colspan="2">
<item row="6" column="0" colspan="2">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
@ -64,6 +44,45 @@
</property>
</spacer>
</item>
<item row="1" column="1">
<widget class="QComboBox" name="opt_formatting_type"/>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_3">
<property name="text">
<string>Formatting style:</string>
</property>
</widget>
</item>
<item row="2" column="0" rowspan="2" colspan="2">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Markdown Options</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QLabel" name="label">
<property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_markdown_disable_toc">
<property name="text">
<string>Do not insert Table of Contents into output text when using markdown</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>
</layout>
</widget>
<resources/>