mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-11 09:13:57 -04:00
TXT Input: Split paragraph and formatting into two different options.
This commit is contained in:
parent
3bb40c9911
commit
dea9ae6832
@ -19,15 +19,20 @@ class PDBInput(InputFormatPlugin):
|
|||||||
file_types = set(['pdb'])
|
file_types = set(['pdb'])
|
||||||
|
|
||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='paragraph_format', recommended_value='auto',
|
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||||
choices=['auto', 'block', 'single', 'print', 'markdown'],
|
choices=['auto', 'block', 'single', 'print'],
|
||||||
help=_('How calibre splits text into paragraphs.\n'
|
help=_('Paragraph structure.\n'
|
||||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||||
'* auto: Try to auto detect paragraph format.\n'
|
'* auto: Try to auto detect paragraph type.\n'
|
||||||
'* block: Treat a blank line as a paragraph break.\n'
|
'* block: Treat a blank line as a paragraph break.\n'
|
||||||
'* single: Assume every line is a paragraph.\n'
|
'* single: Assume every line is a paragraph.\n'
|
||||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||||
'starts a paragraph.\n'
|
'starts a paragraph.')),
|
||||||
|
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||||
|
choices=['auto', 'none', 'markdown'],
|
||||||
|
help=_('Formatting used within the document.'
|
||||||
|
'* auto: Try to auto detect the document formatting.\n'
|
||||||
|
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
||||||
'* markdown: Run the input though the markdown pre-processor. '
|
'* markdown: Run the input though the markdown pre-processor. '
|
||||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||||
|
@ -17,15 +17,20 @@ class TCRInput(InputFormatPlugin):
|
|||||||
file_types = set(['tcr'])
|
file_types = set(['tcr'])
|
||||||
|
|
||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='paragraph_format', recommended_value='auto',
|
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||||
choices=['auto', 'block', 'single', 'print', 'markdown'],
|
choices=['auto', 'block', 'single', 'print'],
|
||||||
help=_('How calibre splits text into paragraphs.\n'
|
help=_('Paragraph structure.\n'
|
||||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||||
'* auto: Try to auto detect paragraph format.\n'
|
'* auto: Try to auto detect paragraph type.\n'
|
||||||
'* block: Treat a blank line as a paragraph break.\n'
|
'* block: Treat a blank line as a paragraph break.\n'
|
||||||
'* single: Assume every line is a paragraph.\n'
|
'* single: Assume every line is a paragraph.\n'
|
||||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||||
'starts a paragraph.\n'
|
'starts a paragraph.')),
|
||||||
|
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||||
|
choices=['auto', 'none', 'markdown'],
|
||||||
|
help=_('Formatting used within the document.'
|
||||||
|
'* auto: Try to auto detect the document formatting.\n'
|
||||||
|
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
||||||
'* markdown: Run the input though the markdown pre-processor. '
|
'* markdown: Run the input though the markdown pre-processor. '
|
||||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||||
|
@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
|||||||
from calibre.ebooks.chardet import detect
|
from calibre.ebooks.chardet import detect
|
||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_formatting
|
preserve_spaces, detect_paragraph_type, detect_formatting_type
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -21,15 +21,20 @@ class TXTInput(InputFormatPlugin):
|
|||||||
file_types = set(['txt'])
|
file_types = set(['txt'])
|
||||||
|
|
||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='paragraph_format', recommended_value='auto',
|
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||||
choices=['auto', 'block', 'single', 'print', 'markdown'],
|
choices=['auto', 'block', 'single', 'print'],
|
||||||
help=_('How calibre splits text into paragraphs.\n'
|
help=_('Paragraph structure.\n'
|
||||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||||
'* auto: Try to auto detect paragraph format.\n'
|
'* auto: Try to auto detect paragraph type.\n'
|
||||||
'* block: Treat a blank line as a paragraph break.\n'
|
'* block: Treat a blank line as a paragraph break.\n'
|
||||||
'* single: Assume every line is a paragraph.\n'
|
'* single: Assume every line is a paragraph.\n'
|
||||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||||
'starts a paragraph.\n'
|
'starts a paragraph.')),
|
||||||
|
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||||
|
choices=['auto', 'none', 'markdown'],
|
||||||
|
help=_('Formatting used within the document.'
|
||||||
|
'* auto: Try to auto detect the document formatting.\n'
|
||||||
|
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
||||||
'* markdown: Run the input though the markdown pre-processor. '
|
'* markdown: Run the input though the markdown pre-processor. '
|
||||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||||
@ -57,29 +62,16 @@ class TXTInput(InputFormatPlugin):
|
|||||||
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
|
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
|
||||||
txt = txt.decode(ienc, 'replace')
|
txt = txt.decode(ienc, 'replace')
|
||||||
|
|
||||||
# Determine the formatting of the document.
|
|
||||||
if options.paragraph_format == 'auto':
|
|
||||||
options.paragraph_format = detect_paragraph_formatting(txt)
|
|
||||||
if options.paragraph_format == 'unknown':
|
|
||||||
log.debug('Could not reliably determine paragraph format using block format')
|
|
||||||
options.paragraph_format = 'block'
|
|
||||||
else:
|
|
||||||
log.debug('Auto detected paragraph format as %s' % options.paragraph_format)
|
|
||||||
|
|
||||||
# We don't check for block because the processor assumes block.
|
|
||||||
# single and print at transformed to block for processing.
|
|
||||||
if options.paragraph_format == 'single':
|
|
||||||
txt = separate_paragraphs_single_line(txt)
|
|
||||||
elif options.paragraph_format == 'print':
|
|
||||||
txt = separate_paragraphs_print_formatted(txt)
|
|
||||||
|
|
||||||
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
||||||
# Preserve spaces will replace multiple spaces to a space
|
# Preserve spaces will replace multiple spaces to a space
|
||||||
# followed by the entity.
|
# followed by the entity.
|
||||||
if options.preserve_spaces:
|
if options.preserve_spaces:
|
||||||
txt = preserve_spaces(txt)
|
txt = preserve_spaces(txt)
|
||||||
|
|
||||||
if options.paragraph_format == 'markdown':
|
if options.formatting_type == 'auto':
|
||||||
|
options.formatting_type = detect_formatting_type(txt)
|
||||||
|
|
||||||
|
if options.formatting_type == 'markdown':
|
||||||
log.debug('Running text though markdown conversion...')
|
log.debug('Running text though markdown conversion...')
|
||||||
try:
|
try:
|
||||||
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
|
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
|
||||||
@ -87,6 +79,22 @@ class TXTInput(InputFormatPlugin):
|
|||||||
raise ValueError('This txt file has malformed markup, it cannot be'
|
raise ValueError('This txt file has malformed markup, it cannot be'
|
||||||
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
|
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
|
||||||
else:
|
else:
|
||||||
|
# Determine the paragraph type of the document.
|
||||||
|
if options.paragraph_type == 'auto':
|
||||||
|
options.paragraph_type = detect_paragraph_type(txt)
|
||||||
|
if options.paragraph_type == 'unknown':
|
||||||
|
log.debug('Could not reliably determine paragraph type using block')
|
||||||
|
options.paragraph_type = 'block'
|
||||||
|
else:
|
||||||
|
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||||
|
|
||||||
|
# We don't check for block because the processor assumes block.
|
||||||
|
# single and print at transformed to block for processing.
|
||||||
|
if options.paragraph_type == 'single':
|
||||||
|
txt = separate_paragraphs_single_line(txt)
|
||||||
|
elif options.paragraph_type == 'print':
|
||||||
|
txt = separate_paragraphs_print_formatted(txt)
|
||||||
|
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||||
|
|
||||||
|
@ -93,7 +93,7 @@ def split_string_separator(txt, size) :
|
|||||||
xrange(0, len(txt), size)])
|
xrange(0, len(txt), size)])
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
def detect_paragraph_formatting(txt):
|
def detect_paragraph_type(txt):
|
||||||
'''
|
'''
|
||||||
Tries to determine the formatting of the document.
|
Tries to determine the formatting of the document.
|
||||||
|
|
||||||
@ -109,6 +109,20 @@ def detect_paragraph_formatting(txt):
|
|||||||
txt = txt.replace('\r', '\n')
|
txt = txt.replace('\r', '\n')
|
||||||
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
|
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
|
||||||
|
|
||||||
|
# Check for print
|
||||||
|
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
||||||
|
if tab_line_count / float(txt_line_count) >= .25:
|
||||||
|
return 'print'
|
||||||
|
|
||||||
|
# Check for block
|
||||||
|
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
||||||
|
if empty_line_count / float(txt_line_count) >= .25:
|
||||||
|
return 'block'
|
||||||
|
|
||||||
|
# Nothing else matched to assume single.
|
||||||
|
return 'single'
|
||||||
|
|
||||||
|
def detect_formatting_type(txt):
|
||||||
# Check for markdown
|
# Check for markdown
|
||||||
# Headings
|
# Headings
|
||||||
if len(re.findall('(?mu)^#+', txt)) >= 5:
|
if len(re.findall('(?mu)^#+', txt)) >= 5:
|
||||||
@ -129,16 +143,4 @@ def detect_paragraph_formatting(txt):
|
|||||||
if txt.count('\\'+c) > 10:
|
if txt.count('\\'+c) > 10:
|
||||||
return 'markdown'
|
return 'markdown'
|
||||||
|
|
||||||
# Check for print
|
return 'none'
|
||||||
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
|
||||||
if tab_line_count / float(txt_line_count) >= .25:
|
|
||||||
return 'print'
|
|
||||||
|
|
||||||
# Check for block
|
|
||||||
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
|
||||||
if empty_line_count / float(txt_line_count) >= .25:
|
|
||||||
return 'block'
|
|
||||||
|
|
||||||
# Nothing else matched to assume single.
|
|
||||||
return 'single'
|
|
||||||
|
|
||||||
|
@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
|
|
||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent,
|
Widget.__init__(self, parent,
|
||||||
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
|
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
for x in get_option('paragraph_format').option.choices:
|
for x in get_option('paragraph_type').option.choices:
|
||||||
self.opt_paragraph_format.addItem(x)
|
self.opt_paragraph_type.addItem(x)
|
||||||
|
for x in get_option('formatting_type').option.choices:
|
||||||
|
self.opt_formatting_type.addItem(x)
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
|
|
||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent,
|
Widget.__init__(self, parent,
|
||||||
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
|
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
for x in get_option('paragraph_format').option.choices:
|
for x in get_option('paragraph_type').option.choices:
|
||||||
self.opt_paragraph_format.addItem(x)
|
self.opt_paragraph_type.addItem(x)
|
||||||
|
for x in get_option('formatting_type').option.choices:
|
||||||
|
self.opt_formatting_type.addItem(x)
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
|
|
||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent,
|
Widget.__init__(self, parent,
|
||||||
['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
|
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
for x in get_option('paragraph_format').option.choices:
|
for x in get_option('paragraph_type').option.choices:
|
||||||
self.opt_paragraph_format.addItem(x)
|
self.opt_paragraph_type.addItem(x)
|
||||||
|
for x in get_option('formatting_type').option.choices:
|
||||||
|
self.opt_formatting_type.addItem(x)
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
<rect>
|
<rect>
|
||||||
<x>0</x>
|
<x>0</x>
|
||||||
<y>0</y>
|
<y>0</y>
|
||||||
<width>488</width>
|
<width>518</width>
|
||||||
<height>300</height>
|
<height>300</height>
|
||||||
</rect>
|
</rect>
|
||||||
</property>
|
</property>
|
||||||
@ -17,41 +17,21 @@
|
|||||||
<item row="0" column="0">
|
<item row="0" column="0">
|
||||||
<widget class="QLabel" name="label_2">
|
<widget class="QLabel" name="label_2">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Document structure detection</string>
|
<string>Paragraph style:</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="0" column="1">
|
<item row="0" column="1">
|
||||||
<widget class="QComboBox" name="opt_paragraph_format"/>
|
<widget class="QComboBox" name="opt_paragraph_type"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="1" column="0" colspan="2">
|
<item row="5" column="0" colspan="2">
|
||||||
<widget class="QLabel" name="label">
|
|
||||||
<property name="text">
|
|
||||||
<string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string>
|
|
||||||
</property>
|
|
||||||
<property name="wordWrap">
|
|
||||||
<bool>true</bool>
|
|
||||||
</property>
|
|
||||||
<property name="openExternalLinks">
|
|
||||||
<bool>true</bool>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="2" column="0" colspan="2">
|
|
||||||
<widget class="QCheckBox" name="opt_markdown_disable_toc">
|
|
||||||
<property name="text">
|
|
||||||
<string>Do not insert Table of Contents into output text when using markdown</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="3" column="0" colspan="2">
|
|
||||||
<widget class="QCheckBox" name="opt_preserve_spaces">
|
<widget class="QCheckBox" name="opt_preserve_spaces">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Preserve &spaces</string>
|
<string>Preserve &spaces</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="4" column="0" colspan="2">
|
<item row="6" column="0" colspan="2">
|
||||||
<spacer name="verticalSpacer">
|
<spacer name="verticalSpacer">
|
||||||
<property name="orientation">
|
<property name="orientation">
|
||||||
<enum>Qt::Vertical</enum>
|
<enum>Qt::Vertical</enum>
|
||||||
@ -64,6 +44,45 @@
|
|||||||
</property>
|
</property>
|
||||||
</spacer>
|
</spacer>
|
||||||
</item>
|
</item>
|
||||||
|
<item row="1" column="1">
|
||||||
|
<widget class="QComboBox" name="opt_formatting_type"/>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QLabel" name="label_3">
|
||||||
|
<property name="text">
|
||||||
|
<string>Formatting style:</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="0" rowspan="2" colspan="2">
|
||||||
|
<widget class="QGroupBox" name="groupBox">
|
||||||
|
<property name="title">
|
||||||
|
<string>Markdown Options</string>
|
||||||
|
</property>
|
||||||
|
<layout class="QVBoxLayout" name="verticalLayout">
|
||||||
|
<item>
|
||||||
|
<widget class="QLabel" name="label">
|
||||||
|
<property name="text">
|
||||||
|
<string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string>
|
||||||
|
</property>
|
||||||
|
<property name="wordWrap">
|
||||||
|
<bool>true</bool>
|
||||||
|
</property>
|
||||||
|
<property name="openExternalLinks">
|
||||||
|
<bool>true</bool>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_markdown_disable_toc">
|
||||||
|
<property name="text">
|
||||||
|
<string>Do not insert Table of Contents into output text when using markdown</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<resources/>
|
<resources/>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user