Sync to ldolse heuristics branch

This commit is contained in:
John Schember 2011-01-17 12:43:02 -05:00
commit 8e29e7046c
5 changed files with 143 additions and 54 deletions

View File

@ -492,7 +492,9 @@ OptionRecommendation(name='enable_heuristics',
OptionRecommendation(name='markup_chapter_headings', OptionRecommendation(name='markup_chapter_headings',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,
help=_('Detect unformatted chapter headings and sub headings. Change ' help=_('Detect unformatted chapter headings and sub headings. Change '
'them to h2 and h3 tags.')), 'them to h2 and h3 tags. This setting will not create a TOC, '
'but can be used in conjunction with structure detection to create '
'one.')),
OptionRecommendation(name='italicize_common_cases', OptionRecommendation(name='italicize_common_cases',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,
@ -501,7 +503,7 @@ OptionRecommendation(name='italicize_common_cases',
OptionRecommendation(name='fix_indents', OptionRecommendation(name='fix_indents',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,
help=_('Turn indentation created from multiple   entities ' help=_('Turn indentation created from multiple non-breaking space entities '
'into CSS indents.')), 'into CSS indents.')),
OptionRecommendation(name='html_unwrap_factor', OptionRecommendation(name='html_unwrap_factor',

View File

@ -473,12 +473,6 @@ class HTMLPreProcessor(object):
# unwrap/delete soft hyphens with formatting # unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
# Make the more aggressive chapter marking regex optional with the preprocess option to
# reduce false positives and move after header/footer removal
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
length = -1 length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
docanalysis = DocAnalysis('pdf', html) docanalysis = DocAnalysis('pdf', html)
@ -525,11 +519,10 @@ class HTMLPreProcessor(object):
html = dehyphenator(html,'html', length) html = dehyphenator(html,'html', length)
if is_pdftohtml: if is_pdftohtml:
from calibre.ebooks.conversion.utils import PreProcessor from calibre.ebooks.conversion.utils import HeuristicProcessor
pdf_markup = PreProcessor(self.extra_opts, None) pdf_markup = HeuristicProcessor(self.extra_opts, None)
totalwords = 0 totalwords = 0
totalwords = pdf_markup.get_word_count(html) if pdf_markup.get_word_count(html) > 7000:
if totalwords > 7000:
html = pdf_markup.markup_chapters(html, totalwords, True) html = pdf_markup.markup_chapters(html, totalwords, True)
#dump(html, 'post-preprocess') #dump(html, 'post-preprocess')

View File

@ -21,8 +21,10 @@ class HeuristicProcessor(object):
self.deleted_nbsps = False self.deleted_nbsps = False
self.totalwords = 0 self.totalwords = 0
self.min_chapters = 1 self.min_chapters = 1
self.max_chapters = 150
self.chapters_no_title = 0 self.chapters_no_title = 0
self.chapters_with_title = 0 self.chapters_with_title = 0
self.blanks_deleted = False
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
@ -131,7 +133,7 @@ class HeuristicProcessor(object):
def markup_italicis(self, html): def markup_italicis(self, html):
ITALICIZE_WORDS = [ ITALICIZE_WORDS = [
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.', 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.', 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
'Mlle.', 'Mons.', 'PS.', 'PPS.', 'Mlle.', 'Mons.', 'PS.', 'PPS.',
] ]
@ -165,10 +167,12 @@ class HeuristicProcessor(object):
with minimum false positives. Exits after finding a successful pattern with minimum false positives. Exits after finding a successful pattern
''' '''
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum of chapters to search for # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
# or pdf page numbers from being treated as TOC markers
if wordcount > 7000: if wordcount > 7000:
self.min_chapters = int(ceil(wordcount / 7000.)) self.min_chapters = int(ceil(wordcount / 15000.))
#print "minimum chapters required are: "+str(self.min_chapters) self.max_chapters = int(ceil(wordcount / 1200.))
print "minimum chapters required are: "+str(self.min_chapters)
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html)) self.html_preprocess_sections = len(heading.findall(html))
self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@ -201,44 +205,85 @@ class HeuristicProcessor(object):
n_lookahead_open = "\s+(?!" n_lookahead_open = "\s+(?!"
n_lookahead_close = ")" n_lookahead_close = ")"
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
analysis_result = []
chapter_types = [ chapter_types = [
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'], [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
[r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
] ]
def recurse_patterns(html, analyze): def recurse_patterns(html, analyze):
# Start with most typical chapter headings, get more aggressive until one works # Start with most typical chapter headings, get more aggressive until one works
for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types: for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
n_lookahead = ''
hits = 0
self.chapters_no_title = 0
self.chapters_with_title = 0
if n_lookahead_req:
lp_n_lookahead_open = n_lookahead_open
lp_n_lookahead_close = n_lookahead_close
else:
lp_n_lookahead_open = ''
lp_n_lookahead_close = ''
if strict_title:
lp_title = default_title
else:
lp_title = simple_title
if ignorecase:
arg_ignorecase = r'(?i)'
else:
arg_ignorecase = ''
if title_req:
lp_opt_title_open = ''
lp_opt_title_close = ''
else:
lp_opt_title_open = opt_title_open
lp_opt_title_close = opt_title_close
if self.html_preprocess_sections >= self.min_chapters: if self.html_preprocess_sections >= self.min_chapters:
break break
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) if n_lookahead_req:
self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
if lookahead_ignorecase: if not analyze:
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
else: chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close chapdetect = re.compile(r'%s' % chapter_marker)
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
if analyze: if analyze:
hits = len(chapdetect.findall(html)) hits = len(chapdetect.findall(html))
print unicode(type_name)+" had "+unicode(hits)+" hits" if hits:
chapdetect.sub(self.analyze_title_matches, html) chapdetect.sub(self.analyze_title_matches, html)
print unicode(self.chapters_no_title)+" chapters with no title" if float(self.chapters_with_title) / float(hits) > .5:
print unicode(self.chapters_with_title)+" chapters with titles" title_req = True
strict_title = False
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
print "max chapters is "+str(self.max_chapters)
if type_name == 'common':
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
elif self.min_chapters <= hits < self.max_chapters:
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
break
else: else:
html = chapdetect.sub(self.chapter_head, html) html = chapdetect.sub(self.chapter_head, html)
return html return html
recurse_patterns(html, True) recurse_patterns(html, True)
chapter_types = analysis_result
html = recurse_patterns(html, False) html = recurse_patterns(html, False)
words_per_chptr = wordcount words_per_chptr = wordcount
@ -292,7 +337,7 @@ class HeuristicProcessor(object):
pre = re.compile(r'<pre>', re.IGNORECASE) pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) >= 1: if len(pre.findall(html)) >= 1:
self.log.debug("Running Text Processing") self.log.debug("Running Text Processing")
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL) outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
html = outerhtml.sub(self.txt_process, html) html = outerhtml.sub(self.txt_process, html)
else: else:
# Add markup naively # Add markup naively
@ -422,6 +467,7 @@ class HeuristicProcessor(object):
# blank paragraphs then delete blank lines to clean up spacing # blank paragraphs then delete blank lines to clean up spacing
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
self.log.debug("deleting blank lines") self.log.debug("deleting blank lines")
self.blanks_deleted = True
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html) html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
html = self.blankreg.sub('', html) html = self.blankreg.sub('', html)
@ -479,6 +525,9 @@ class HeuristicProcessor(object):
if getattr(self.extra_opts, 'format_scene_breaks', False): if getattr(self.extra_opts, 'format_scene_breaks', False):
# Center separator lines # Center separator lines
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html) html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
if not self.blanks_deleted:
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
if self.deleted_nbsps: if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs to preserve original formatting # put back non-breaking spaces in empty paragraphs to preserve original formatting

View File

@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin):
from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
self.log = log self.log = log
return create_oebbook(log, stream, options, self, reader=LitReader) return create_oebbook(log, stream, options, reader=LitReader)
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
@ -39,10 +39,13 @@ class LITInput(InputFormatPlugin):
body = body[0] body = body[0]
if len(body) == 1 and body[0].tag == XHTML('pre'): if len(body) == 1 and body[0].tag == XHTML('pre'):
pre = body[0] pre = body[0]
from calibre.ebooks.txt.processor import convert_basic from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
separate_paragraphs_single_line
from lxml import etree from lxml import etree
import copy import copy
html = convert_basic(pre.text).replace('<html>', html = separate_paragraphs_single_line(pre.text)
html = preserve_spaces(html)
html = convert_basic(html).replace('<html>',
'<html xmlns="%s">'%XHTML_NS) '<html xmlns="%s">'%XHTML_NS)
root = etree.fromstring(html) root = etree.fromstring(html)
body = XPath('//h:body')(root) body = XPath('//h:body')(root)

View File

@ -255,6 +255,46 @@ you are producing are meant for a particular device type, choose the correspondi
The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device. The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device.
.. _heuristic-processing:
Heuristic Processing
---------------------
:guilabel:`Preprocess input`
This option activates various algorithms that try to detect and correct common cases of
badly formatted input documents, such as hard line breaks or large blocks of text with no formatting.
Turn this option on if your input document suffers from bad formatting. But be aware that in
some cases, this option can lead to worse results, so use with care.
:guilabel:`Line-unwrap factor`
This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
than the length of 40% of all lines in the document.
:guilabel:`Unwrap lines`
Lorem ipsum
:guilabel:`Detect and markup unformatted chapter headings and sub headings`
Lorem ipsum
:guilabel:`Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting`
Lorem ipsum
:guilabel:`Delete blank lines between paragraphs`
Lorem ipsum
:guilabel:`Ensure scene breaks are consistently formatted`
Lorem ipsum
:guilabel:`Remove unnecessary hyphens`
Lorem ipsum
:guilabel:`Italicize common words and patterns`
Lorem ipsum
:guilabel:`Replace entity indents with CSS indents`
Lorem ipsum
.. _structure-detection: .. _structure-detection:
Structure Detection Structure Detection
@ -330,16 +370,6 @@ There are a few more options in this section.
two covers. This option will simply remove the first image from the source document, thereby two covers. This option will simply remove the first image from the source document, thereby
ensuring that the converted book has only one cover, the one specified in |app|. ensuring that the converted book has only one cover, the one specified in |app|.
:guilabel:`Preprocess input`
This option activates various algorithms that try to detect and correct common cases of
badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
Turn this option on if your input document suffers from bad formatting. But be aware that in
some cases, this option can lead to worse results, so use with care.
:guilabel:`Line-unwrap factor`
This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
than the length of 40% of all lines in the document.
Table of Contents Table of Contents
------------------ ------------------
@ -500,14 +530,18 @@ more blank lines are a paragraph boundary::
TXT input supports a number of options to differentiate how paragraphs are detected. TXT input supports a number of options to differentiate how paragraphs are detected.
:guilabel:`Treat each line as a paragraph` :guilabel:`Paragraph Style: Auto`
Analyzes the text file and attempts to automatically determine how paragraphs are defined. This
option will generally work fine; if you get undesirable results, try one of the manual options.
:guilabel:`Paragraph Style: Single`
Assumes that every line is a paragraph:: Assumes that every line is a paragraph::
This is the first. This is the first.
This is the second. This is the second.
This is the third. This is the third.
:guilabel:`Assume print formatting` :guilabel:`Paragraph Style: Print`
Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
the next line that starts with an indent is reached:: the next line that starts with an indent is reached::
@ -518,6 +552,14 @@ TXT input supports a number of options to differentiate how paragraphs are detec
This is the This is the
third. third.
:guilabel:`Paragraph Style: Unformatted`
Assumes that the document has no formatting, but does use hard line breaks. Punctuation
and median line length are used to attempt to re-create paragraphs.
:guilabel:`Formatting Style: Auto`
:guilabel:`Formatting Style: Heuristic`
:guilabel:`Process using markdown` :guilabel:`Process using markdown`
|app| also supports running TXT input through a transformation preprocessor known as markdown. Markdown |app| also supports running TXT input through a transformation preprocessor known as markdown. Markdown
allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables, allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,