mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-07 09:01:38 -04:00
TXT Input: Split paragraph and formatting into two different options.
This commit is contained in:
parent
3bb40c9911
commit
dea9ae6832
@ -19,15 +19,20 @@ class PDBInput(InputFormatPlugin):
|
||||
file_types = set(['pdb'])
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='paragraph_format', recommended_value='auto',
|
||||
choices=['auto', 'block', 'single', 'print', 'markdown'],
|
||||
help=_('How calibre splits text into paragraphs.\n'
|
||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||
choices=['auto', 'block', 'single', 'print'],
|
||||
help=_('Paragraph structure.\n'
|
||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||
'* auto: Try to auto detect paragraph format.\n'
|
||||
'* auto: Try to auto detect paragraph type.\n'
|
||||
'* block: Treat a blank line as a paragraph break.\n'
|
||||
'* single: Assume every line is a paragraph.\n'
|
||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||
'starts a paragraph.\n'
|
||||
'starts a paragraph.')),
|
||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||
choices=['auto', 'none', 'markdown'],
|
||||
help=_('Formatting used within the document.'
|
||||
'* auto: Try to auto detect the document formatting.\n'
|
||||
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
||||
'* markdown: Run the input though the markdown pre-processor. '
|
||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||
|
@ -17,15 +17,20 @@ class TCRInput(InputFormatPlugin):
|
||||
file_types = set(['tcr'])
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='paragraph_format', recommended_value='auto',
|
||||
choices=['auto', 'block', 'single', 'print', 'markdown'],
|
||||
help=_('How calibre splits text into paragraphs.\n'
|
||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||
choices=['auto', 'block', 'single', 'print'],
|
||||
help=_('Paragraph structure.\n'
|
||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||
'* auto: Try to auto detect paragraph format.\n'
|
||||
'* auto: Try to auto detect paragraph type.\n'
|
||||
'* block: Treat a blank line as a paragraph break.\n'
|
||||
'* single: Assume every line is a paragraph.\n'
|
||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||
'starts a paragraph.\n'
|
||||
'starts a paragraph.')),
|
||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||
choices=['auto', 'none', 'markdown'],
|
||||
help=_('Formatting used within the document.'
|
||||
'* auto: Try to auto detect the document formatting.\n'
|
||||
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
||||
'* markdown: Run the input though the markdown pre-processor. '
|
||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||
|
@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.ebooks.chardet import detect
|
||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||
preserve_spaces, detect_paragraph_formatting
|
||||
preserve_spaces, detect_paragraph_type, detect_formatting_type
|
||||
from calibre import _ent_pat, xml_entity_to_unicode
|
||||
|
||||
class TXTInput(InputFormatPlugin):
|
||||
@ -21,15 +21,20 @@ class TXTInput(InputFormatPlugin):
|
||||
file_types = set(['txt'])
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='paragraph_format', recommended_value='auto',
|
||||
choices=['auto', 'block', 'single', 'print', 'markdown'],
|
||||
help=_('How calibre splits text into paragraphs.\n'
|
||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||
choices=['auto', 'block', 'single', 'print'],
|
||||
help=_('Paragraph structure.\n'
|
||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||
'* auto: Try to auto detect paragraph format.\n'
|
||||
'* auto: Try to auto detect paragraph type.\n'
|
||||
'* block: Treat a blank line as a paragraph break.\n'
|
||||
'* single: Assume every line is a paragraph.\n'
|
||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||
'starts a paragraph.\n'
|
||||
'starts a paragraph.')),
|
||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||
choices=['auto', 'none', 'markdown'],
|
||||
help=_('Formatting used within the document.'
|
||||
'* auto: Try to auto detect the document formatting.\n'
|
||||
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
||||
'* markdown: Run the input though the markdown pre-processor. '
|
||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||
@ -57,29 +62,16 @@ class TXTInput(InputFormatPlugin):
|
||||
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
|
||||
txt = txt.decode(ienc, 'replace')
|
||||
|
||||
# Determine the formatting of the document.
|
||||
if options.paragraph_format == 'auto':
|
||||
options.paragraph_format = detect_paragraph_formatting(txt)
|
||||
if options.paragraph_format == 'unknown':
|
||||
log.debug('Could not reliably determine paragraph format using block format')
|
||||
options.paragraph_format = 'block'
|
||||
else:
|
||||
log.debug('Auto detected paragraph format as %s' % options.paragraph_format)
|
||||
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print at transformed to block for processing.
|
||||
if options.paragraph_format == 'single':
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
elif options.paragraph_format == 'print':
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
|
||||
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
||||
# Preserve spaces will replace multiple spaces to a space
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
|
||||
if options.paragraph_format == 'markdown':
|
||||
if options.formatting_type == 'markdown':
|
||||
log.debug('Running text though markdown conversion...')
|
||||
try:
|
||||
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
|
||||
@ -87,6 +79,22 @@ class TXTInput(InputFormatPlugin):
|
||||
raise ValueError('This txt file has malformed markup, it cannot be'
|
||||
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
|
||||
else:
|
||||
# Determine the paragraph type of the document.
|
||||
if options.paragraph_type == 'auto':
|
||||
options.paragraph_type = detect_paragraph_type(txt)
|
||||
if options.paragraph_type == 'unknown':
|
||||
log.debug('Could not reliably determine paragraph type using block')
|
||||
options.paragraph_type = 'block'
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print are transformed to block for processing.
|
||||
if options.paragraph_type == 'single':
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
elif options.paragraph_type == 'print':
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
|
||||
flow_size = getattr(options, 'flow_size', 0)
|
||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||
|
||||
|
@ -93,7 +93,7 @@ def split_string_separator(txt, size) :
|
||||
xrange(0, len(txt), size)])
|
||||
return txt
|
||||
|
||||
def detect_paragraph_formatting(txt):
|
||||
def detect_paragraph_type(txt):
|
||||
'''
|
||||
Tries to determine the formatting of the document.
|
||||
|
||||
@ -109,6 +109,20 @@ def detect_paragraph_formatting(txt):
|
||||
txt = txt.replace('\r', '\n')
|
||||
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
|
||||
|
||||
# Check for print
|
||||
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
||||
if tab_line_count / float(txt_line_count) >= .25:
|
||||
return 'print'
|
||||
|
||||
# Check for block
|
||||
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
||||
if empty_line_count / float(txt_line_count) >= .25:
|
||||
return 'block'
|
||||
|
||||
# Nothing else matched, so assume single.
|
||||
return 'single'
|
||||
|
||||
def detect_formatting_type(txt):
|
||||
# Check for markdown
|
||||
# Headings
|
||||
if len(re.findall('(?mu)^#+', txt)) >= 5:
|
||||
@ -129,16 +143,4 @@ def detect_paragraph_formatting(txt):
|
||||
if txt.count('\\'+c) > 10:
|
||||
return 'markdown'
|
||||
|
||||
# Check for print
|
||||
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
||||
if tab_line_count / float(txt_line_count) >= .25:
|
||||
return 'print'
|
||||
|
||||
# Check for block
|
||||
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
||||
if empty_line_count / float(txt_line_count) >= .25:
|
||||
return 'block'
|
||||
|
||||
# Nothing else matched to assume single.
|
||||
return 'single'
|
||||
|
||||
return 'none'
|
||||
|
@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
|
||||
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent,
|
||||
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
|
||||
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
||||
self.db, self.book_id = db, book_id
|
||||
for x in get_option('paragraph_format').option.choices:
|
||||
self.opt_paragraph_format.addItem(x)
|
||||
for x in get_option('paragraph_type').option.choices:
|
||||
self.opt_paragraph_type.addItem(x)
|
||||
for x in get_option('formatting_type').option.choices:
|
||||
self.opt_formatting_type.addItem(x)
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
|
||||
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent,
|
||||
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
|
||||
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
||||
self.db, self.book_id = db, book_id
|
||||
for x in get_option('paragraph_format').option.choices:
|
||||
self.opt_paragraph_format.addItem(x)
|
||||
for x in get_option('paragraph_type').option.choices:
|
||||
self.opt_paragraph_type.addItem(x)
|
||||
for x in get_option('formatting_type').option.choices:
|
||||
self.opt_formatting_type.addItem(x)
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
|
||||
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent,
|
||||
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
|
||||
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
||||
self.db, self.book_id = db, book_id
|
||||
for x in get_option('paragraph_format').option.choices:
|
||||
self.opt_paragraph_format.addItem(x)
|
||||
for x in get_option('paragraph_type').option.choices:
|
||||
self.opt_paragraph_type.addItem(x)
|
||||
for x in get_option('formatting_type').option.choices:
|
||||
self.opt_formatting_type.addItem(x)
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
@ -6,7 +6,7 @@
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>488</width>
|
||||
<width>518</width>
|
||||
<height>300</height>
|
||||
</rect>
|
||||
</property>
|
||||
@ -17,41 +17,21 @@
|
||||
<item row="0" column="0">
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="text">
|
||||
<string>Document structure detection</string>
|
||||
<string>Paragraph style:</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="1">
|
||||
<widget class="QComboBox" name="opt_paragraph_format"/>
|
||||
<widget class="QComboBox" name="opt_paragraph_type"/>
|
||||
</item>
|
||||
<item row="1" column="0" colspan="2">
|
||||
<widget class="QLabel" name="label">
|
||||
<property name="text">
|
||||
<string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string>
|
||||
</property>
|
||||
<property name="wordWrap">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
<property name="openExternalLinks">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_markdown_disable_toc">
|
||||
<property name="text">
|
||||
<string>Do not insert Table of Contents into output text when using markdown</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="0" colspan="2">
|
||||
<item row="5" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_preserve_spaces">
|
||||
<property name="text">
|
||||
<string>Preserve &spaces</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="0" colspan="2">
|
||||
<item row="6" column="0" colspan="2">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
@ -64,6 +44,45 @@
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="1" column="1">
|
||||
<widget class="QComboBox" name="opt_formatting_type"/>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<widget class="QLabel" name="label_3">
|
||||
<property name="text">
|
||||
<string>Formatting style:</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0" rowspan="2" colspan="2">
|
||||
<widget class="QGroupBox" name="groupBox">
|
||||
<property name="title">
|
||||
<string>Markdown Options</string>
|
||||
</property>
|
||||
<layout class="QVBoxLayout" name="verticalLayout">
|
||||
<item>
|
||||
<widget class="QLabel" name="label">
|
||||
<property name="text">
|
||||
<string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string>
|
||||
</property>
|
||||
<property name="wordWrap">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
<property name="openExternalLinks">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QCheckBox" name="opt_markdown_disable_toc">
|
||||
<property name="text">
|
||||
<string>Do not insert Table of Contents into output text when using markdown</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<resources/>
|
||||
|
Loading…
x
Reference in New Issue
Block a user