TXT Input: Split paragraph and formatting into two different options.

This commit is contained in:
John Schember 2011-01-05 20:03:49 -05:00
parent 3bb40c9911
commit dea9ae6832
8 changed files with 126 additions and 81 deletions

View File

@ -19,15 +19,20 @@ class PDBInput(InputFormatPlugin):
file_types = set(['pdb']) file_types = set(['pdb'])
options = set([ options = set([
OptionRecommendation(name='paragraph_format', recommended_value='auto', OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=['auto', 'block', 'single', 'print', 'markdown'], choices=['auto', 'block', 'single', 'print'],
help=_('How calibre splits text into paragraphs.\n' help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'* auto: Try to auto detect paragraph format.\n' '* auto: Try to auto detect paragraph type.\n'
'* block: Treat a blank line as a paragraph break.\n' '* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n' '* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab ' '* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.\n' 'starts a paragraph.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. ' '* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False, OptionRecommendation(name='preserve_spaces', recommended_value=False,

View File

@ -17,15 +17,20 @@ class TCRInput(InputFormatPlugin):
file_types = set(['tcr']) file_types = set(['tcr'])
options = set([ options = set([
OptionRecommendation(name='paragraph_format', recommended_value='auto', OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=['auto', 'block', 'single', 'print', 'markdown'], choices=['auto', 'block', 'single', 'print'],
help=_('How calibre splits text into paragraphs.\n' help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'* auto: Try to auto detect paragraph format.\n' '* auto: Try to auto detect paragraph type.\n'
'* block: Treat a blank line as a paragraph break.\n' '* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n' '* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab ' '* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.\n' 'starts a paragraph.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. ' '* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False, OptionRecommendation(name='preserve_spaces', recommended_value=False,

View File

@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.chardet import detect from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_formatting preserve_spaces, detect_paragraph_type, detect_formatting_type
from calibre import _ent_pat, xml_entity_to_unicode from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -21,15 +21,20 @@ class TXTInput(InputFormatPlugin):
file_types = set(['txt']) file_types = set(['txt'])
options = set([ options = set([
OptionRecommendation(name='paragraph_format', recommended_value='auto', OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=['auto', 'block', 'single', 'print', 'markdown'], choices=['auto', 'block', 'single', 'print'],
help=_('How calibre splits text into paragraphs.\n' help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'* auto: Try to auto detect paragraph format.\n' '* auto: Try to auto detect paragraph type.\n'
'* block: Treat a blank line as a paragraph break.\n' '* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n' '* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab ' '* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.\n' 'starts a paragraph.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. ' '* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False, OptionRecommendation(name='preserve_spaces', recommended_value=False,
@ -57,29 +62,16 @@ class TXTInput(InputFormatPlugin):
log.debug('No input encoding specified and could not auto detect using %s' % ienc) log.debug('No input encoding specified and could not auto detect using %s' % ienc)
txt = txt.decode(ienc, 'replace') txt = txt.decode(ienc, 'replace')
# Determine the formatting of the document.
if options.paragraph_format == 'auto':
options.paragraph_format = detect_paragraph_formatting(txt)
if options.paragraph_format == 'unknown':
log.debug('Could not reliably determine paragraph format using block format')
options.paragraph_format = 'block'
else:
log.debug('Auto detected paragraph format as %s' % options.paragraph_format)
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
if options.paragraph_format == 'single':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_format == 'print':
txt = separate_paragraphs_print_formatted(txt)
txt = _ent_pat.sub(xml_entity_to_unicode, txt) txt = _ent_pat.sub(xml_entity_to_unicode, txt)
# Preserve spaces will replace multiple spaces to a space # Preserve spaces will replace multiple spaces to a space
# followed by the   entity. # followed by the   entity.
if options.preserve_spaces: if options.preserve_spaces:
txt = preserve_spaces(txt) txt = preserve_spaces(txt)
if options.paragraph_format == 'markdown': if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
if options.formatting_type == 'markdown':
log.debug('Running text though markdown conversion...') log.debug('Running text though markdown conversion...')
try: try:
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
@ -87,6 +79,22 @@ class TXTInput(InputFormatPlugin):
raise ValueError('This txt file has malformed markup, it cannot be' raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
else: else:
# Determine the paragraph type of the document.
if options.paragraph_type == 'auto':
options.paragraph_type = detect_paragraph_type(txt)
if options.paragraph_type == 'unknown':
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
if options.paragraph_type == 'single':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
flow_size = getattr(options, 'flow_size', 0) flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size) html = convert_basic(txt, epub_split_size_kb=flow_size)

View File

@ -93,7 +93,7 @@ def split_string_separator(txt, size) :
xrange(0, len(txt), size)]) xrange(0, len(txt), size)])
return txt return txt
def detect_paragraph_formatting(txt): def detect_paragraph_type(txt):
''' '''
Tries to determine the formatting of the document. Tries to determine the formatting of the document.
@ -109,6 +109,20 @@ def detect_paragraph_formatting(txt):
txt = txt.replace('\r', '\n') txt = txt.replace('\r', '\n')
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
# Check for print
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
if tab_line_count / float(txt_line_count) >= .25:
return 'print'
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .25:
return 'block'
# Nothing else matched to assume single.
return 'single'
def detect_formatting_type(txt):
# Check for markdown # Check for markdown
# Headings # Headings
if len(re.findall('(?mu)^#+', txt)) >= 5: if len(re.findall('(?mu)^#+', txt)) >= 5:
@ -129,16 +143,4 @@ def detect_paragraph_formatting(txt):
if txt.count('\\'+c) > 10: if txt.count('\\'+c) > 10:
return 'markdown' return 'markdown'
# Check for print return 'none'
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
if tab_line_count / float(txt_line_count) >= .25:
return 'print'
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .25:
return 'block'
# Nothing else matched to assume single.
return 'single'

View File

@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, Widget.__init__(self, parent,
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in get_option('paragraph_format').option.choices: for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_format.addItem(x) self.opt_paragraph_type.addItem(x)
for x in get_option('formatting_type').option.choices:
self.opt_formatting_type.addItem(x)
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, Widget.__init__(self, parent,
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in get_option('paragraph_format').option.choices: for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_format.addItem(x) self.opt_paragraph_type.addItem(x)
for x in get_option('formatting_type').option.choices:
self.opt_formatting_type.addItem(x)
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, Widget.__init__(self, parent,
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in get_option('paragraph_format').option.choices: for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_format.addItem(x) self.opt_paragraph_type.addItem(x)
for x in get_option('formatting_type').option.choices:
self.opt_formatting_type.addItem(x)
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -6,7 +6,7 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>488</width> <width>518</width>
<height>300</height> <height>300</height>
</rect> </rect>
</property> </property>
@ -17,41 +17,21 @@
<item row="0" column="0"> <item row="0" column="0">
<widget class="QLabel" name="label_2"> <widget class="QLabel" name="label_2">
<property name="text"> <property name="text">
<string>Document structure detection</string> <string>Paragraph style:</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="0" column="1"> <item row="0" column="1">
<widget class="QComboBox" name="opt_paragraph_format"/> <widget class="QComboBox" name="opt_paragraph_type"/>
</item> </item>
<item row="1" column="0" colspan="2"> <item row="5" column="0" colspan="2">
<widget class="QLabel" name="label">
<property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
<item row="2" column="0" colspan="2">
<widget class="QCheckBox" name="opt_markdown_disable_toc">
<property name="text">
<string>Do not insert Table of Contents into output text when using markdown</string>
</property>
</widget>
</item>
<item row="3" column="0" colspan="2">
<widget class="QCheckBox" name="opt_preserve_spaces"> <widget class="QCheckBox" name="opt_preserve_spaces">
<property name="text"> <property name="text">
<string>Preserve &amp;spaces</string> <string>Preserve &amp;spaces</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="4" column="0" colspan="2"> <item row="6" column="0" colspan="2">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -64,6 +44,45 @@
</property> </property>
</spacer> </spacer>
</item> </item>
<item row="1" column="1">
<widget class="QComboBox" name="opt_formatting_type"/>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_3">
<property name="text">
<string>Formatting style:</string>
</property>
</widget>
</item>
<item row="2" column="0" rowspan="2" colspan="2">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Markdown Options</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QLabel" name="label">
<property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_markdown_disable_toc">
<property name="text">
<string>Do not insert Table of Contents into output text when using markdown</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>
</layout> </layout>
</widget> </widget>
<resources/> <resources/>