mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
TXT Output: Fix inline toc not showing all items. TXT Input: Restructure to run dehyphenator when auto and heuristic formatting options are used. This causes textile and markdown to be dehyphenated. Heuristics: Fix issue with invalid markup from italicize patterns. TXT Input: Add option to remove indents. TXT Input: Fix bug where spaces were not retained properly.
This commit is contained in:
commit
347c276aa9
@ -149,17 +149,17 @@ class HeuristicProcessor(object):
|
|||||||
]
|
]
|
||||||
|
|
||||||
ITALICIZE_STYLE_PATS = [
|
ITALICIZE_STYLE_PATS = [
|
||||||
r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])_(?P<words>[^_]+)?_',
|
||||||
r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])/(?P<words>[^/]+)?/',
|
||||||
r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])~~(?P<words>[^~]+)?~~',
|
||||||
r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])\*(?P<words>[^\*]+)?\*',
|
||||||
r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])~(?P<words>[^~]+)?~',
|
||||||
r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])_/(?P<words>[^/_]+)?/_',
|
||||||
r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])_\*(?P<words>[^\*_]+)?\*_',
|
||||||
r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])\*/(?P<words>[^/\*]+)?/\*',
|
||||||
r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])_\*/(?P<words>[^\*_]+)?/\*_',
|
||||||
r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])/:(?P<words>[^:/]+)?:/',
|
||||||
r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=[\s\.,\!\?])',
|
r'(?msu)(?<=[\s>])\|:(?P<words>[^:\|]+)?:\|',
|
||||||
]
|
]
|
||||||
|
|
||||||
for word in ITALICIZE_WORDS:
|
for word in ITALICIZE_WORDS:
|
||||||
|
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
|
|||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
normalize_line_endings, convert_textile
|
normalize_line_endings, convert_textile, remove_indents
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -47,6 +47,9 @@ class TXTInput(InputFormatPlugin):
|
|||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||||
help=_('Normally extra spaces are condensed into a single space. '
|
help=_('Normally extra spaces are condensed into a single space. '
|
||||||
'With this option all spaces will be displayed.')),
|
'With this option all spaces will be displayed.')),
|
||||||
|
OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
|
||||||
|
help=_('Normally extra space at the beginning of lines is retained. '
|
||||||
|
'With this option they will be removed.')),
|
||||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
||||||
help=_('Do not insert a Table of Contents into the output text.')),
|
help=_('Do not insert a Table of Contents into the output text.')),
|
||||||
])
|
])
|
||||||
@ -77,20 +80,6 @@ class TXTInput(InputFormatPlugin):
|
|||||||
# Normalize line endings
|
# Normalize line endings
|
||||||
txt = normalize_line_endings(txt)
|
txt = normalize_line_endings(txt)
|
||||||
|
|
||||||
# Detect formatting
|
|
||||||
if options.formatting_type == 'auto':
|
|
||||||
options.formatting_type = detect_formatting_type(txt)
|
|
||||||
log.debug('Auto detected formatting as %s' % options.formatting_type)
|
|
||||||
|
|
||||||
if options.formatting_type == 'heuristic':
|
|
||||||
setattr(options, 'enable_heuristics', True)
|
|
||||||
setattr(options, 'markup_chapter_headings', True)
|
|
||||||
setattr(options, 'italicize_common_cases', True)
|
|
||||||
setattr(options, 'fix_indents', True)
|
|
||||||
setattr(options, 'delete_blank_paragraphs', True)
|
|
||||||
setattr(options, 'format_scene_breaks', True)
|
|
||||||
setattr(options, 'dehyphenate', True)
|
|
||||||
|
|
||||||
# Determine the paragraph type of the document.
|
# Determine the paragraph type of the document.
|
||||||
if options.paragraph_type == 'auto':
|
if options.paragraph_type == 'auto':
|
||||||
options.paragraph_type = detect_paragraph_type(txt)
|
options.paragraph_type = detect_paragraph_type(txt)
|
||||||
@ -99,16 +88,30 @@ class TXTInput(InputFormatPlugin):
|
|||||||
options.paragraph_type = 'block'
|
options.paragraph_type = 'block'
|
||||||
else:
|
else:
|
||||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||||
|
|
||||||
|
dehyphenate = False
|
||||||
|
if options.formatting_type in ('auto', 'heuristic'):
|
||||||
|
# Set this here because we want it to run over all
|
||||||
|
# formatting types if auto is used.
|
||||||
|
dehyphenate = True
|
||||||
|
|
||||||
|
# Detect formatting
|
||||||
|
if options.formatting_type == 'auto':
|
||||||
|
options.formatting_type = detect_formatting_type(txt)
|
||||||
|
log.debug('Auto detected formatting as %s' % options.formatting_type)
|
||||||
|
|
||||||
|
if options.formatting_type == 'heuristic':
|
||||||
|
setattr(options, 'enable_heuristics', True)
|
||||||
|
setattr(options, 'unwrap_lines', False)
|
||||||
|
|
||||||
|
if options.txt_in_remove_indents:
|
||||||
|
txt = remove_indents(txt)
|
||||||
|
|
||||||
# Preserve spaces will replace multiple spaces to a space
|
# Preserve spaces will replace multiple spaces to a space
|
||||||
# followed by the entity.
|
# followed by the entity.
|
||||||
if options.preserve_spaces:
|
if options.preserve_spaces:
|
||||||
txt = preserve_spaces(txt)
|
txt = preserve_spaces(txt)
|
||||||
|
|
||||||
# Get length for hyphen removal and punctuation unwrap
|
|
||||||
docanalysis = DocAnalysis('txt', txt)
|
|
||||||
length = docanalysis.line_length(.5)
|
|
||||||
|
|
||||||
# Reformat paragraphs to block formatting based on the detected type.
|
# Reformat paragraphs to block formatting based on the detected type.
|
||||||
# We don't check for block because the processor assumes block.
|
# We don't check for block because the processor assumes block.
|
||||||
# single and print at transformed to block for processing.
|
# single and print at transformed to block for processing.
|
||||||
@ -119,9 +122,17 @@ class TXTInput(InputFormatPlugin):
|
|||||||
elif options.paragraph_type == 'unformatted':
|
elif options.paragraph_type == 'unformatted':
|
||||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||||
# unwrap lines based on punctuation
|
# unwrap lines based on punctuation
|
||||||
|
docanalysis = DocAnalysis('txt', txt)
|
||||||
|
length = docanalysis.line_length(.5)
|
||||||
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
||||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
|
|
||||||
|
if dehyphenate:
|
||||||
|
docanalysis = DocAnalysis('txt', txt)
|
||||||
|
length = docanalysis.line_length(.5)
|
||||||
|
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||||
|
txt = dehyphenator(txt,'txt', length)
|
||||||
|
|
||||||
# Process the text using the appropriate text processor.
|
# Process the text using the appropriate text processor.
|
||||||
html = ''
|
html = ''
|
||||||
if options.formatting_type == 'markdown':
|
if options.formatting_type == 'markdown':
|
||||||
@ -134,14 +145,8 @@ class TXTInput(InputFormatPlugin):
|
|||||||
elif options.formatting_type == 'textile':
|
elif options.formatting_type == 'textile':
|
||||||
log.debug('Running text through textile conversion...')
|
log.debug('Running text through textile conversion...')
|
||||||
html = convert_textile(txt)
|
html = convert_textile(txt)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
log.debug('Running text through basic conversion...')
|
log.debug('Running text through basic conversion...')
|
||||||
if options.formatting_type == 'heuristic':
|
|
||||||
# Dehyphenate
|
|
||||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
|
||||||
txt = dehyphenator(txt,'txt', length)
|
|
||||||
|
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||||
|
|
||||||
|
@ -24,14 +24,14 @@ def clean_txt(txt):
|
|||||||
# all line breaks with \n.
|
# all line breaks with \n.
|
||||||
txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
|
txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
|
||||||
|
|
||||||
# Replace whitespace at the beginning of the list with
|
# Replace whitespace at the beginning of the line with
|
||||||
txt = re.sub('(?m)(?P<space>[ ]+)', lambda mo: ' ' * mo.groups('space').count(' '), txt)
|
txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: ' ' * mo.groups('space').count(' '), txt)
|
||||||
txt = re.sub('(?m)(?P<space>[\t]+)', lambda mo: ' ' * 4 * mo.groups('space').count('\t'), txt)
|
txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: ' ' * 4 * mo.groups('space').count('\t'), txt)
|
||||||
|
|
||||||
# Condense redundant spaces
|
# Condense redundant spaces
|
||||||
txt = re.sub('[ ]{2,}', ' ', txt)
|
txt = re.sub('[ ]{2,}', ' ', txt)
|
||||||
|
|
||||||
# Remove blank lines from the beginning and end of the document.
|
# Remove blank space from the beginning and end of the document.
|
||||||
txt = re.sub('^\s+(?=.)', '', txt)
|
txt = re.sub('^\s+(?=.)', '', txt)
|
||||||
txt = re.sub('(?<=.)\s+$', '', txt)
|
txt = re.sub('(?<=.)\s+$', '', txt)
|
||||||
# Remove excessive line breaks.
|
# Remove excessive line breaks.
|
||||||
@ -107,6 +107,10 @@ def preserve_spaces(txt):
|
|||||||
txt = txt.replace('\t', ' ')
|
txt = txt.replace('\t', ' ')
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
def remove_indents(txt):
|
||||||
|
txt = re.sub('(?miu)^\s+', '', txt)
|
||||||
|
return txt
|
||||||
|
|
||||||
def opf_writer(path, opf_name, manifest, spine, mi):
|
def opf_writer(path, opf_name, manifest, spine, mi):
|
||||||
opf = OPFCreator(path, mi)
|
opf = OPFCreator(path, mi)
|
||||||
opf.create_manifest(manifest)
|
opf.create_manifest(manifest)
|
||||||
|
@ -55,6 +55,7 @@ class TXTMLizer(object):
|
|||||||
self.log.info('Converting XHTML to TXT...')
|
self.log.info('Converting XHTML to TXT...')
|
||||||
self.oeb_book = oeb_book
|
self.oeb_book = oeb_book
|
||||||
self.opts = opts
|
self.opts = opts
|
||||||
|
self.toc_titles = []
|
||||||
self.toc_ids = []
|
self.toc_ids = []
|
||||||
self.last_was_heading = False
|
self.last_was_heading = False
|
||||||
|
|
||||||
@ -94,8 +95,8 @@ class TXTMLizer(object):
|
|||||||
if getattr(self.opts, 'inline_toc', None):
|
if getattr(self.opts, 'inline_toc', None):
|
||||||
self.log.debug('Generating table of contents...')
|
self.log.debug('Generating table of contents...')
|
||||||
toc.append(u'%s\n\n' % _(u'Table of Contents:'))
|
toc.append(u'%s\n\n' % _(u'Table of Contents:'))
|
||||||
for item in self.oeb_book.toc:
|
for item in self.toc_titles:
|
||||||
toc.append(u'* %s\n\n' % item.title)
|
toc.append(u'* %s\n\n' % item)
|
||||||
return ''.join(toc)
|
return ''.join(toc)
|
||||||
|
|
||||||
def create_flat_toc(self, nodes):
|
def create_flat_toc(self, nodes):
|
||||||
@ -103,6 +104,7 @@ class TXTMLizer(object):
|
|||||||
Turns a hierarchical list of TOC href's into a flat list.
|
Turns a hierarchical list of TOC href's into a flat list.
|
||||||
'''
|
'''
|
||||||
for item in nodes:
|
for item in nodes:
|
||||||
|
self.toc_titles.append(item.title)
|
||||||
self.toc_ids.append(item.href)
|
self.toc_ids.append(item.href)
|
||||||
self.create_flat_toc(item.nodes)
|
self.create_flat_toc(item.nodes)
|
||||||
|
|
||||||
|
@ -16,7 +16,8 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
|
|
||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent,
|
Widget.__init__(self, parent,
|
||||||
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
['paragraph_type', 'formatting_type', 'markdown_disable_toc',
|
||||||
|
'preserve_spaces', 'txt_in_remove_indents'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
for x in get_option('paragraph_type').option.choices:
|
for x in get_option('paragraph_type').option.choices:
|
||||||
self.opt_paragraph_type.addItem(x)
|
self.opt_paragraph_type.addItem(x)
|
||||||
|
@ -7,57 +7,95 @@
|
|||||||
<x>0</x>
|
<x>0</x>
|
||||||
<y>0</y>
|
<y>0</y>
|
||||||
<width>518</width>
|
<width>518</width>
|
||||||
<height>300</height>
|
<height>353</height>
|
||||||
</rect>
|
</rect>
|
||||||
</property>
|
</property>
|
||||||
<property name="windowTitle">
|
<property name="windowTitle">
|
||||||
<string>Form</string>
|
<string>Form</string>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QGridLayout" name="gridLayout">
|
<layout class="QVBoxLayout" name="verticalLayout_3">
|
||||||
<item row="0" column="0">
|
<item>
|
||||||
<widget class="QLabel" name="label_2">
|
<widget class="QGroupBox" name="groupBox_3">
|
||||||
<property name="text">
|
<property name="title">
|
||||||
<string>Paragraph style:</string>
|
<string>Structure</string>
|
||||||
</property>
|
</property>
|
||||||
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
|
<item row="0" column="0">
|
||||||
|
<widget class="QLabel" name="label_2">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
<property name="text">
|
||||||
|
<string>Paragraph style:</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="0" column="1">
|
||||||
|
<widget class="QComboBox" name="opt_paragraph_type">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Expanding" vsizetype="Fixed">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QLabel" name="label_3">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
<property name="text">
|
||||||
|
<string>Formatting style:</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="1">
|
||||||
|
<widget class="QComboBox" name="opt_formatting_type">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Expanding" vsizetype="Fixed">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="0" column="1">
|
<item>
|
||||||
<widget class="QComboBox" name="opt_paragraph_type"/>
|
<widget class="QGroupBox" name="groupBox_2">
|
||||||
</item>
|
<property name="title">
|
||||||
<item row="5" column="0" colspan="2">
|
<string>Common</string>
|
||||||
<widget class="QCheckBox" name="opt_preserve_spaces">
|
|
||||||
<property name="text">
|
|
||||||
<string>Preserve &spaces</string>
|
|
||||||
</property>
|
</property>
|
||||||
|
<layout class="QVBoxLayout" name="verticalLayout_2">
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_preserve_spaces">
|
||||||
|
<property name="text">
|
||||||
|
<string>Preserve &spaces</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_txt_in_remove_indents">
|
||||||
|
<property name="text">
|
||||||
|
<string>Remove indents at the beginning of lines</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="6" column="0" colspan="2">
|
<item>
|
||||||
<spacer name="verticalSpacer">
|
|
||||||
<property name="orientation">
|
|
||||||
<enum>Qt::Vertical</enum>
|
|
||||||
</property>
|
|
||||||
<property name="sizeHint" stdset="0">
|
|
||||||
<size>
|
|
||||||
<width>20</width>
|
|
||||||
<height>213</height>
|
|
||||||
</size>
|
|
||||||
</property>
|
|
||||||
</spacer>
|
|
||||||
</item>
|
|
||||||
<item row="1" column="1">
|
|
||||||
<widget class="QComboBox" name="opt_formatting_type"/>
|
|
||||||
</item>
|
|
||||||
<item row="1" column="0">
|
|
||||||
<widget class="QLabel" name="label_3">
|
|
||||||
<property name="text">
|
|
||||||
<string>Formatting style:</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="2" column="0" rowspan="2" colspan="2">
|
|
||||||
<widget class="QGroupBox" name="groupBox">
|
<widget class="QGroupBox" name="groupBox">
|
||||||
<property name="title">
|
<property name="title">
|
||||||
<string>Markdown Options</string>
|
<string>Markdown</string>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QVBoxLayout" name="verticalLayout">
|
<layout class="QVBoxLayout" name="verticalLayout">
|
||||||
<item>
|
<item>
|
||||||
@ -83,6 +121,19 @@
|
|||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
<item>
|
||||||
|
<spacer name="verticalSpacer">
|
||||||
|
<property name="orientation">
|
||||||
|
<enum>Qt::Vertical</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeHint" stdset="0">
|
||||||
|
<size>
|
||||||
|
<width>20</width>
|
||||||
|
<height>213</height>
|
||||||
|
</size>
|
||||||
|
</property>
|
||||||
|
</spacer>
|
||||||
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<resources/>
|
<resources/>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user