mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
commit
1bd86990fb
@ -201,7 +201,7 @@ class Dehyphenator(object):
|
||||
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||
else:
|
||||
lookupword = dehyphenated
|
||||
if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
|
||||
if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
|
||||
lookupword = self.removeprefix.sub('', lookupword)
|
||||
if self.verbose > 2:
|
||||
self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
|
||||
|
@ -71,21 +71,41 @@ class TXTInput(InputFormatPlugin):
|
||||
txt = txt.decode(ienc, 'replace')
|
||||
|
||||
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
||||
|
||||
# Normalize line endings
|
||||
txt = normalize_line_endings(txt)
|
||||
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
|
||||
if options.formatting_type == 'heuristic':
|
||||
setattr(options, 'enable_heuristics', True)
|
||||
setattr(options, 'markup_chapter_headings', True)
|
||||
setattr(options, 'italicize_common_cases', True)
|
||||
setattr(options, 'fix_indents', True)
|
||||
setattr(options, 'preserve_spaces', True)
|
||||
setattr(options, 'delete_blank_paragraphs', True)
|
||||
setattr(options, 'format_scene_breaks', True)
|
||||
setattr(options, 'dehyphenate', True)
|
||||
|
||||
# Determine the paragraph type of the document.
|
||||
if options.paragraph_type == 'auto':
|
||||
options.paragraph_type = detect_paragraph_type(txt)
|
||||
if options.paragraph_type == 'unknown':
|
||||
log.debug('Could not reliably determine paragraph type using block')
|
||||
options.paragraph_type = 'block'
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
# Preserve spaces will replace multiple spaces to a space
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
# Normalize line endings
|
||||
txt = normalize_line_endings(txt)
|
||||
|
||||
# Get length for hyphen removal and punctuation unwrap
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
|
||||
if options.formatting_type == 'markdown':
|
||||
log.debug('Running text though markdown conversion...')
|
||||
try:
|
||||
@ -96,16 +116,8 @@ class TXTInput(InputFormatPlugin):
|
||||
elif options.formatting_type == 'textile':
|
||||
log.debug('Running text though textile conversion...')
|
||||
html = convert_textile(txt)
|
||||
else:
|
||||
# Determine the paragraph type of the document.
|
||||
if options.paragraph_type == 'auto':
|
||||
options.paragraph_type = detect_paragraph_type(txt)
|
||||
if options.paragraph_type == 'unknown':
|
||||
log.debug('Could not reliably determine paragraph type using block')
|
||||
options.paragraph_type = 'block'
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
else:
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
@ -129,15 +141,6 @@ class TXTInput(InputFormatPlugin):
|
||||
flow_size = getattr(options, 'flow_size', 0)
|
||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||
|
||||
if options.formatting_type == 'heuristic':
|
||||
setattr(options, 'enable_heuristics', True)
|
||||
setattr(options, 'markup_chapter_headings', True)
|
||||
setattr(options, 'italicize_common_cases', True)
|
||||
setattr(options, 'fix_indents', True)
|
||||
setattr(options, 'delete_blank_paragraphs', True)
|
||||
setattr(options, 'format_scene_breaks', True)
|
||||
setattr(options, 'dehyphenate', True)
|
||||
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
html_input = plugin_for_input_format('html')
|
||||
for opt in html_input.options:
|
||||
|
@ -260,11 +260,11 @@ The Output profile also controls the screen size. This will cause, for example,
|
||||
Heuristic Processing
|
||||
---------------------
|
||||
|
||||
Heuristic Processing provides a variety of functions which can be used that try to detect and correct
|
||||
Heuristic Processing provides a variety of functions which can be used to try to detect and correct
|
||||
common problems in poorly formatted input documents. Use these functions if your input document suffers
|
||||
from bad formatting. Because these functions rely on common patterns, be aware that in some cases an
|
||||
from poor formatting. Because these functions rely on common patterns, be aware that in some cases an
|
||||
option may lead to worse results, so use with care. As an example, several of these options will
|
||||
remove all non-breaking-space entities.
|
||||
remove all non-breaking-space entities, or may include false positive matches relating to the function.
|
||||
|
||||
:guilabel:`Enable heuristic processing`
|
||||
This option activates |app|'s Heuristic Processing stage of the conversion pipeline.
|
||||
@ -283,7 +283,7 @@ remove all non-breaking-space entities.
|
||||
correction, then this value should be reduced to somewhere between 0.1 and 0.2.
|
||||
|
||||
:guilabel:`Detect and markup unformatted chapter headings and sub headings`
|
||||
If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
|
||||
If your document does not have chapter headings and titles formatted differently from the rest of the text,
|
||||
|app| can use this option to attempt to detect them and surround them with heading tags. <h2> tags are used
|
||||
for chapter headings; <h3> tags are used for any titles that are detected.
|
||||
|
||||
@ -331,21 +331,23 @@ remove all non-breaking-space entities.
|
||||
Some documents use a convention of defining text indents using non-breaking space entities. When this option is enabled |app| will
|
||||
attempt to detect this sort of formatting and convert it to a 3% text indent using CSS.
|
||||
|
||||
.. search-replace:
|
||||
.. _search-replace:
|
||||
|
||||
Search & Replace
|
||||
---------------------
|
||||
|
||||
These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
|
||||
behind page headers and footers in the text. These options use regular expressions to try and detect
|
||||
the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
|
||||
by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
|
||||
your document. These options can also be used for generic search and replace of any content by additionally
|
||||
specifying a replacement expression.
|
||||
These options are useful primarily for conversion of PDF documents or OCR conversions, though they can
|
||||
also be used to fix many document-specific problems. As an example, some conversions can leave behind page
|
||||
headers and footers in the text. These options use regular expressions to try and detect headers, footers,
|
||||
or other arbitrary text and remove or replace them. Remember that they operate on the intermediate XHTML produced
|
||||
by the conversion pipeline. There is a wizard to help you customize the regular expressions for
|
||||
your document. Click the magic wand beside the expression box, and click the 'Test' button after composing
|
||||
your search expression. Successful matches will be highlighted in yellow.
|
||||
|
||||
The search works by using a python regular expression. All matched text is simply removed from
|
||||
the document or replaced using the replacement pattern. You can learn more about regular expressions and
|
||||
their syntax at :ref:`regexptutorial`.
|
||||
The search works by using a python regular expression. All matched text is simply removed from
|
||||
the document or replaced using the replacement pattern. The replacement pattern is optional; if left blank
|
||||
then text matching the search pattern will be deleted from the document. You can learn more about regular expressions
|
||||
and their syntax at :ref:`regexptutorial`.
|
||||
|
||||
.. _structure-detection:
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user