mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Auto detect paragraph structure.
This commit is contained in:
parent
d9195c0632
commit
9ec9163919
@ -19,26 +19,22 @@ class PDBInput(InputFormatPlugin):
|
|||||||
file_types = set(['pdb'])
|
file_types = set(['pdb'])
|
||||||
|
|
||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='single_line_paras', recommended_value=False,
|
OptionRecommendation(name='paragraph_format', recommended_value='auto',
|
||||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
choices=['auto', 'block', 'single', 'print', 'markdown'],
|
||||||
'With this option it will assume that every line represents '
|
help=_('How calibre splits text into paragraphs.\n'
|
||||||
'a paragraph instead. This option is ignored by eReader format.')),
|
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||||
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
|
'* auto: Try to auto detect paragraph format.\n'
|
||||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
'* block: Treat a blank line as a paragraph break.\n'
|
||||||
'With this option it will assume that every line starting with '
|
'* single: Assume every line is a paragraph.\n'
|
||||||
'an indent (either a tab or 2+ spaces) represents a paragraph. '
|
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||||
'Paragraphs end when the next line that starts with an indent '
|
'starts a paragraph.\n'
|
||||||
'is reached. This option is ignored by eReader format.')),
|
'* markdown: Run the input though the markdown pre-processor. '
|
||||||
|
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||||
help=_('Normally extra spaces are condensed into a single space. '
|
help=_('Normally extra spaces are condensed into a single space. '
|
||||||
'With this option all spaces will be displayed. This option '
|
'With this option all spaces will be displayed.')),
|
||||||
'is ignored by eReader format.')),
|
|
||||||
OptionRecommendation(name='markdown', recommended_value=False,
|
|
||||||
help=_('Run the text input through the markdown pre-processor. To '
|
|
||||||
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
|
||||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
||||||
help=_('Do not insert a Table of Contents into the output text. '
|
help=_('Do not insert a Table of Contents into the output text.')),
|
||||||
'This option is ignored by eReader format.')),
|
|
||||||
])
|
])
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
|
@ -17,22 +17,20 @@ class TCRInput(InputFormatPlugin):
|
|||||||
file_types = set(['tcr'])
|
file_types = set(['tcr'])
|
||||||
|
|
||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='single_line_paras', recommended_value=False,
|
OptionRecommendation(name='paragraph_format', recommended_value='auto',
|
||||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
choices=['auto', 'block', 'single', 'print', 'markdown'],
|
||||||
'With this option it will assume that every line represents '
|
help=_('How calibre splits text into paragraphs.\n'
|
||||||
'a paragraph instead.')),
|
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||||
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
|
'* auto: Try to auto detect paragraph format.\n'
|
||||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
'* block: Treat a blank line as a paragraph break.\n'
|
||||||
'With this option it will assume that every line starting with '
|
'* single: Assume every line is a paragraph.\n'
|
||||||
'an indent (either a tab or 2+ spaces) represents a paragraph. '
|
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||||
'Paragraphs end when the next line that starts with an indent '
|
'starts a paragraph.\n'
|
||||||
'is reached.')),
|
'* markdown: Run the input though the markdown pre-processor. '
|
||||||
|
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||||
help=_('Normally extra spaces are condensed into a single space. '
|
help=_('Normally extra spaces are condensed into a single space. '
|
||||||
'With this option all spaces will be displayed.')),
|
'With this option all spaces will be displayed.')),
|
||||||
OptionRecommendation(name='markdown', recommended_value=False,
|
|
||||||
help=_('Run the text input through the markdown pre-processor. To '
|
|
||||||
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
|
||||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
||||||
help=_('Do not insert a Table of Contents into the output text.')),
|
help=_('Do not insert a Table of Contents into the output text.')),
|
||||||
])
|
])
|
||||||
|
@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
|||||||
from calibre.ebooks.chardet import detect
|
from calibre.ebooks.chardet import detect
|
||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces
|
preserve_spaces, detect_paragraph_formatting
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -21,22 +21,20 @@ class TXTInput(InputFormatPlugin):
|
|||||||
file_types = set(['txt'])
|
file_types = set(['txt'])
|
||||||
|
|
||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='single_line_paras', recommended_value=False,
|
OptionRecommendation(name='paragraph_format', recommended_value='auto',
|
||||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
choices=['auto', 'block', 'single', 'print', 'markdown'],
|
||||||
'With this option it will assume that every line represents '
|
help=_('How calibre splits text into paragraphs.\n'
|
||||||
'a paragraph instead.')),
|
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||||
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
|
'* auto: Try to auto detect paragraph format.\n'
|
||||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
'* block: Treat a blank line as a paragraph break.\n'
|
||||||
'With this option it will assume that every line starting with '
|
'* single: Assume every line is a paragraph.\n'
|
||||||
'an indent (either a tab or 2+ spaces) represents a paragraph. '
|
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||||
'Paragraphs end when the next line that starts with an indent '
|
'starts a paragraph.\n'
|
||||||
'is reached.')),
|
'* markdown: Run the input though the markdown pre-processor. '
|
||||||
|
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||||
help=_('Normally extra spaces are condensed into a single space. '
|
help=_('Normally extra spaces are condensed into a single space. '
|
||||||
'With this option all spaces will be displayed.')),
|
'With this option all spaces will be displayed.')),
|
||||||
OptionRecommendation(name='markdown', recommended_value=False,
|
|
||||||
help=_('Run the text input through the markdown pre-processor. To '
|
|
||||||
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
|
||||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
||||||
help=_('Do not insert a Table of Contents into the output text.')),
|
help=_('Do not insert a Table of Contents into the output text.')),
|
||||||
])
|
])
|
||||||
@ -46,6 +44,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
log.debug('Reading text from file...')
|
log.debug('Reading text from file...')
|
||||||
|
|
||||||
txt = stream.read()
|
txt = stream.read()
|
||||||
|
# Get the encoding of the document.
|
||||||
if options.input_encoding:
|
if options.input_encoding:
|
||||||
ienc = options.input_encoding
|
ienc = options.input_encoding
|
||||||
log.debug('Using user specified input encoding of %s' % ienc)
|
log.debug('Using user specified input encoding of %s' % ienc)
|
||||||
@ -58,17 +57,29 @@ class TXTInput(InputFormatPlugin):
|
|||||||
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
|
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
|
||||||
txt = txt.decode(ienc, 'replace')
|
txt = txt.decode(ienc, 'replace')
|
||||||
|
|
||||||
# Adjust paragraph formatting as requested
|
# Determine the formatting of the document.
|
||||||
if options.single_line_paras:
|
if options.paragraph_format == 'auto':
|
||||||
|
options.paragraph_format = detect_paragraph_formatting(txt)
|
||||||
|
if options.paragraph_format == 'unknown':
|
||||||
|
log.debug('Could not reliably determine paragraph format using block format')
|
||||||
|
options.paragraph_format = 'block'
|
||||||
|
else:
|
||||||
|
log.debug('Auto detected paragraph format as %s' % options.paragraph_format)
|
||||||
|
|
||||||
|
# We don't check for block because the processor assumes block.
|
||||||
|
# single and print are transformed to block for processing.
|
||||||
|
if options.paragraph_format == 'single':
|
||||||
txt = separate_paragraphs_single_line(txt)
|
txt = separate_paragraphs_single_line(txt)
|
||||||
if options.print_formatted_paras:
|
elif options.paragraph_format == 'print':
|
||||||
txt = separate_paragraphs_print_formatted(txt)
|
txt = separate_paragraphs_print_formatted(txt)
|
||||||
|
|
||||||
|
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
||||||
|
# Preserve spaces will replace multiple spaces to a space
|
||||||
|
# followed by the entity.
|
||||||
if options.preserve_spaces:
|
if options.preserve_spaces:
|
||||||
txt = preserve_spaces(txt)
|
txt = preserve_spaces(txt)
|
||||||
|
|
||||||
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
if options.paragraph_format == 'markdown':
|
||||||
|
|
||||||
if options.markdown:
|
|
||||||
log.debug('Running text though markdown conversion...')
|
log.debug('Running text though markdown conversion...')
|
||||||
try:
|
try:
|
||||||
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
|
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
|
||||||
|
@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
if isbytestring(txt):
|
if isbytestring(txt):
|
||||||
txt = txt.decode('utf-8')
|
txt = txt.decode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
lines = []
|
lines = []
|
||||||
# Split into paragraphs based on having a blank line between text.
|
# Split into paragraphs based on having a blank line between text.
|
||||||
for line in txt.split('\n\n'):
|
for line in txt.split('\n\n'):
|
||||||
@ -94,3 +93,52 @@ def split_string_separator(txt, size) :
|
|||||||
xrange(0, len(txt), size)])
|
xrange(0, len(txt), size)])
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
def detect_paragraph_formatting(txt):
    '''
    Tries to determine the paragraph formatting of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with 2+ spaces or a tab
           and ends when a new paragraph is reached.
    markdown: Markdown formatting is in the document.

    txt: The (already decoded) text of the document.

    returns one of 'block', 'single', 'print', 'markdown'. Text that has
    no non-blank lines at all is reported as 'block', so the ratio checks
    below never divide by zero.
    '''
    # Normalise line endings so the line-anchored regexes see '\n' only.
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))

    # Empty (or blank-only) input has no lines to measure against; without
    # this guard the ratio calculations below raise ZeroDivisionError.
    if not txt_line_count:
        return 'block'

    # Check for markdown: each pattern must occur at least 5 times.
    markdown_patterns = (
        # Headings (ATX '#' style, then setext '=' and '-' underlines).
        r'(?mu)^#+',
        r'(?mu)^=+$',
        r'(?mu)^-+$',
        # Images
        r'(?u)!\[.*?\]\(.+?\)',
        # Links
        r'(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)',
    )
    for pattern in markdown_patterns:
        if len(re.findall(pattern, txt)) >= 5:
            return 'markdown'

    # Escaped characters: more than 10 occurrences of any single escape.
    md_escaped_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
    for c in md_escaped_characters:
        if txt.count('\\' + c) > 10:
            return 'markdown'

    # float() keeps the ratios correct under integer division semantics.
    total_lines = float(txt_line_count)

    # Check for print: at least a quarter of the lines start with an indent.
    # NOTE(review): \s also matches newlines, so a run of blank lines before
    # a text line satisfies \s{2,} as well -- confirm this is intended.
    tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
    if tab_line_count / total_lines >= .25:
        return 'print'

    # Check for block: blank lines make up at least a quarter of the text lines.
    empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
    if empty_line_count / total_lines >= .25:
        return 'block'

    # Nothing else matched, so assume single.
    return 'single'
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user