TXT Input: Auto detect paragraph structure.

This commit is contained in:
John Schember 2011-01-02 19:05:35 -05:00
parent d9195c0632
commit 9ec9163919
4 changed files with 104 additions and 51 deletions

View File

@ -19,26 +19,22 @@ class PDBInput(InputFormatPlugin):
file_types = set(['pdb'])
options = set([
OptionRecommendation(name='single_line_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line represents '
'a paragraph instead. This option is ignored by eReader format.')),
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line starting with '
'an indent (either a tab or 2+ spaces) represents a paragraph. '
'Paragraphs end when the next line that starts with an indent '
'is reached. This option is ignored by eReader format.')),
OptionRecommendation(name='paragraph_format', recommended_value='auto',
choices=['auto', 'block', 'single', 'print', 'markdown'],
help=_('How calibre splits text into paragraphs.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'* auto: Try to auto detect paragraph format.\n'
'* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed. This option '
'is ignored by eReader format.')),
OptionRecommendation(name='markdown', recommended_value=False,
help=_('Run the text input through the markdown pre-processor. To '
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
'With this option all spaces will be displayed.')),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text. '
'This option is ignored by eReader format.')),
help=_('Do not insert a Table of Contents into the output text.')),
])
def convert(self, stream, options, file_ext, log,

View File

@ -17,22 +17,20 @@ class TCRInput(InputFormatPlugin):
file_types = set(['tcr'])
options = set([
OptionRecommendation(name='single_line_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line represents '
'a paragraph instead.')),
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line starting with '
'an indent (either a tab or 2+ spaces) represents a paragraph. '
'Paragraphs end when the next line that starts with an indent '
'is reached.')),
OptionRecommendation(name='paragraph_format', recommended_value='auto',
choices=['auto', 'block', 'single', 'print', 'markdown'],
help=_('How calibre splits text into paragraphs.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'* auto: Try to auto detect paragraph format.\n'
'* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
OptionRecommendation(name='markdown', recommended_value=False,
help=_('Run the text input through the markdown pre-processor. To '
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')),
])

View File

@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces
preserve_spaces, detect_paragraph_formatting
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@ -21,22 +21,20 @@ class TXTInput(InputFormatPlugin):
file_types = set(['txt'])
options = set([
OptionRecommendation(name='single_line_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line represents '
'a paragraph instead.')),
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line starting with '
'an indent (either a tab or 2+ spaces) represents a paragraph. '
'Paragraphs end when the next line that starts with an indent '
'is reached.')),
OptionRecommendation(name='paragraph_format', recommended_value='auto',
choices=['auto', 'block', 'single', 'print', 'markdown'],
help=_('How calibre splits text into paragraphs.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'* auto: Try to auto detect paragraph format.\n'
'* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
OptionRecommendation(name='markdown', recommended_value=False,
help=_('Run the text input through the markdown pre-processor. To '
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')),
])
@ -46,6 +44,7 @@ class TXTInput(InputFormatPlugin):
log.debug('Reading text from file...')
txt = stream.read()
# Get the encoding of the document.
if options.input_encoding:
ienc = options.input_encoding
log.debug('Using user specified input encoding of %s' % ienc)
@ -58,17 +57,29 @@ class TXTInput(InputFormatPlugin):
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
txt = txt.decode(ienc, 'replace')
# Adjust paragraph formatting as requested
if options.single_line_paras:
# Determine the formatting of the document.
if options.paragraph_format == 'auto':
options.paragraph_format = detect_paragraph_formatting(txt)
if options.paragraph_format == 'unknown':
log.debug('Could not reliably determine paragraph format using block format')
options.paragraph_format = 'block'
else:
log.debug('Auto detected paragraph format as %s' % options.paragraph_format)
# We don't check for block because the processor assumes block.
# single and print are transformed to block for processing.
if options.paragraph_format == 'single':
txt = separate_paragraphs_single_line(txt)
if options.print_formatted_paras:
elif options.paragraph_format == 'print':
txt = separate_paragraphs_print_formatted(txt)
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
# Preserve spaces will replace multiple spaces to a space
# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
if options.markdown:
if options.paragraph_format == 'markdown':
log.debug('Running text though markdown conversion...')
try:
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)

View File

@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
if isbytestring(txt):
txt = txt.decode('utf-8')
lines = []
# Split into paragraphs based on having a blank line between text.
for line in txt.split('\n\n'):
@ -94,3 +93,52 @@ def split_string_separator(txt, size) :
xrange(0, len(txt), size)])
return txt
def detect_paragraph_formatting(txt):
    '''
    Tries to determine the paragraph formatting of the document.

    The checks run in the order listed and the first one that matches wins:

    markdown: Markdown formatting is in the document (5 or more headings,
              images or links, or more than 10 occurrences of a single
              backslash-escaped markdown character).
    print: At least 25% of the text lines start with 2+ spaces or a tab.
    block: Paragraphs are separated by a blank line (the number of blank
           lines is at least 25% of the number of text lines).
    single: Each line is a paragraph. Used when nothing else matched.

    :param txt: The text of the document.
    :return: One of 'markdown', 'print', 'block' or 'single'. A document
             without any text lines returns 'block'.
    '''
    # Normalize line endings so every regex below only has to deal with \n.
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))

    # Nothing but newlines (or nothing at all): the ratios below would
    # divide by zero, so report the plain blank-line-separated format.
    if not txt_line_count:
        return 'block'

    # Check for markdown
    # Headings: "#" prefixed lines and "===" / "---" underline rows.
    heading_patterns = (r'(?mu)^#+', r'(?mu)^=+$', r'(?mu)^-+$')
    for pattern in heading_patterns:
        if len(re.findall(pattern, txt)) >= 5:
            return 'markdown'
    # Images
    if len(re.findall(r'(?u)!\[.*?\]\(.+?\)', txt)) >= 5:
        return 'markdown'
    # Links (a leading "!" would make it an image, which is counted above)
    if len(re.findall(r'(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
        return 'markdown'
    # Escaped characters
    md_escaped_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(',
                             ')', '#', '+', '-', '.', '!']
    for c in md_escaped_characters:
        if txt.count('\\' + c) > 10:
            return 'markdown'

    # Check for print
    # The indent must not span line breaks, otherwise two or more blank
    # lines in front of a text line would be counted as an indent.
    tab_line_count = len(re.findall(r'(?mu)^(\t|[^\S\n]{2,}).+$', txt))
    if tab_line_count / float(txt_line_count) >= .25:
        return 'print'

    # Check for block
    empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
    if empty_line_count / float(txt_line_count) >= .25:
        return 'block'

    # Nothing else matched so assume single.
    return 'single'