This commit is contained in:
Kovid Goyal 2011-01-19 20:24:26 -07:00
commit 1bd86990fb
3 changed files with 44 additions and 39 deletions

View File

@ -201,7 +201,7 @@ class Dehyphenator(object):
lookupword = self.removesuffixes.sub('', dehyphenated)
else:
lookupword = dehyphenated
if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
if self.verbose > 2:
self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))

View File

@ -71,21 +71,41 @@ class TXTInput(InputFormatPlugin):
txt = txt.decode(ienc, 'replace')
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
# Normalize line endings
txt = normalize_line_endings(txt)
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
if options.formatting_type == 'heuristic':
setattr(options, 'enable_heuristics', True)
setattr(options, 'markup_chapter_headings', True)
setattr(options, 'italicize_common_cases', True)
setattr(options, 'fix_indents', True)
setattr(options, 'preserve_spaces', True)
setattr(options, 'delete_blank_paragraphs', True)
setattr(options, 'format_scene_breaks', True)
setattr(options, 'dehyphenate', True)
# Determine the paragraph type of the document.
if options.paragraph_type == 'auto':
options.paragraph_type = detect_paragraph_type(txt)
if options.paragraph_type == 'unknown':
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# Preserve spaces will replace multiple spaces with a space
# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
# Normalize line endings
txt = normalize_line_endings(txt)
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
if options.formatting_type == 'markdown':
log.debug('Running text though markdown conversion...')
try:
@ -96,16 +116,8 @@ class TXTInput(InputFormatPlugin):
elif options.formatting_type == 'textile':
log.debug('Running text though textile conversion...')
html = convert_textile(txt)
else:
# Determine the paragraph type of the document.
if options.paragraph_type == 'auto':
options.paragraph_type = detect_paragraph_type(txt)
if options.paragraph_type == 'unknown':
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
else:
# Dehyphenate
dehyphenator = Dehyphenator(options.verbose, log=self.log)
txt = dehyphenator(txt,'txt', length)
@ -129,15 +141,6 @@ class TXTInput(InputFormatPlugin):
flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size)
if options.formatting_type == 'heuristic':
setattr(options, 'enable_heuristics', True)
setattr(options, 'markup_chapter_headings', True)
setattr(options, 'italicize_common_cases', True)
setattr(options, 'fix_indents', True)
setattr(options, 'delete_blank_paragraphs', True)
setattr(options, 'format_scene_breaks', True)
setattr(options, 'dehyphenate', True)
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
for opt in html_input.options:

View File

@ -260,11 +260,11 @@ The Output profile also controls the screen size. This will cause, for example,
Heuristic Processing
---------------------
Heuristic Processing provides a variety of functions which can be used that try to detect and correct
Heuristic Processing provides a variety of functions which can be used to try to detect and correct
common problems in poorly formatted input documents. Use these functions if your input document suffers
from bad formatting. Because these functions rely on common patterns, be aware that in some cases an
from poor formatting. Because these functions rely on common patterns, be aware that in some cases an
option may lead to worse results, so use with care. As an example, several of these options will
remove all non-breaking-space entities.
remove all non-breaking-space entities, or may include false positive matches relating to the function.
:guilabel:`Enable heuristic processing`
This option activates |app|'s Heuristic Processing stage of the conversion pipeline.
@ -283,7 +283,7 @@ remove all non-breaking-space entities.
correction, then this value should be reduced to somewhere between 0.1 and 0.2.
:guilabel:`Detect and markup unformatted chapter headings and sub headings`
If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
If your document does not have chapter headings and titles formatted differently from the rest of the text,
|app| can use this option to attempt to detect them and surround them with heading tags. <h2> tags are used
for chapter headings; <h3> tags are used for any titles that are detected.
@ -331,21 +331,23 @@ remove all non-breaking-space entities.
Some documents use a convention of defining text indents using non-breaking space entities. When this option is enabled |app| will
attempt to detect this sort of formatting and convert it to a 3% text indent using CSS.
.. search-replace:
.. _search-replace:
Search & Replace
---------------------
These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
behind page headers and footers in the text. These options use regular expressions to try and detect
the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
your document. These options can also be used for generic search and replace of any content by additionally
specifying a replacement expression.
These options are useful primarily for conversion of PDF documents or OCR conversions, though they can
also be used to fix many document-specific problems. As an example, some conversions can leave behind page
headers and footers in the text. These options use regular expressions to try and detect headers, footers,
or other arbitrary text and remove or replace them. Remember that they operate on the intermediate XHTML produced
by the conversion pipeline. There is a wizard to help you customize the regular expressions for
your document. Click the magic wand beside the expression box, and click the 'Test' button after composing
your search expression. Successful matches will be highlighted in yellow.
The search works by using a python regular expression. All matched text is simply removed from
the document or replaced using the replacement pattern. You can learn more about regular expressions and
their syntax at :ref:`regexptutorial`.
The search works by using a python regular expression. All matched text is simply removed from
the document or replaced using the replacement pattern. The replacement pattern is optional; if left blank
then text matching the search pattern will be deleted from the document. You can learn more about regular expressions
and their syntax at :ref:`regexptutorial`.
.. _structure-detection: