mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
TXT Output: Fix inline toc not showing all items. TXT Input: Restructure to run dehyphenator when auto and heuristic formatting options are used. This causes textile and markdown to be dehyphenated. Heuristics: Fix issue with invalid markup from italicize patterns. TXT Input: Add option to remove indents. TXT Input: Fix bug where spaces were not retained properly.
This commit is contained in:
commit
347c276aa9
@ -149,17 +149,17 @@ class HeuristicProcessor(object):
|
||||
]
|
||||
|
||||
ITALICIZE_STYLE_PATS = [
|
||||
r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=[\s\.,\!\?])',
|
||||
r'(?msu)(?<=[\s>])_(?P<words>[^_]+)?_',
|
||||
r'(?msu)(?<=[\s>])/(?P<words>[^/]+)?/',
|
||||
r'(?msu)(?<=[\s>])~~(?P<words>[^~]+)?~~',
|
||||
r'(?msu)(?<=[\s>])\*(?P<words>[^\*]+)?\*',
|
||||
r'(?msu)(?<=[\s>])~(?P<words>[^~]+)?~',
|
||||
r'(?msu)(?<=[\s>])_/(?P<words>[^/_]+)?/_',
|
||||
r'(?msu)(?<=[\s>])_\*(?P<words>[^\*_]+)?\*_',
|
||||
r'(?msu)(?<=[\s>])\*/(?P<words>[^/\*]+)?/\*',
|
||||
r'(?msu)(?<=[\s>])_\*/(?P<words>[^\*_]+)?/\*_',
|
||||
r'(?msu)(?<=[\s>])/:(?P<words>[^:/]+)?:/',
|
||||
r'(?msu)(?<=[\s>])\|:(?P<words>[^:\|]+)?:\|',
|
||||
]
|
||||
|
||||
for word in ITALICIZE_WORDS:
|
||||
|
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
|
||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||
normalize_line_endings, convert_textile
|
||||
normalize_line_endings, convert_textile, remove_indents
|
||||
from calibre import _ent_pat, xml_entity_to_unicode
|
||||
|
||||
class TXTInput(InputFormatPlugin):
|
||||
@ -47,6 +47,9 @@ class TXTInput(InputFormatPlugin):
|
||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||
help=_('Normally extra spaces are condensed into a single space. '
|
||||
'With this option all spaces will be displayed.')),
|
||||
OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
|
||||
help=_('Normally extra space at the beginning of lines is retained. '
|
||||
'With this option they will be removed.')),
|
||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
||||
help=_('Do not insert a Table of Contents into the output text.')),
|
||||
])
|
||||
@ -77,20 +80,6 @@ class TXTInput(InputFormatPlugin):
|
||||
# Normalize line endings
|
||||
txt = normalize_line_endings(txt)
|
||||
|
||||
# Detect formatting
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
log.debug('Auto detected formatting as %s' % options.formatting_type)
|
||||
|
||||
if options.formatting_type == 'heuristic':
|
||||
setattr(options, 'enable_heuristics', True)
|
||||
setattr(options, 'markup_chapter_headings', True)
|
||||
setattr(options, 'italicize_common_cases', True)
|
||||
setattr(options, 'fix_indents', True)
|
||||
setattr(options, 'delete_blank_paragraphs', True)
|
||||
setattr(options, 'format_scene_breaks', True)
|
||||
setattr(options, 'dehyphenate', True)
|
||||
|
||||
# Determine the paragraph type of the document.
|
||||
if options.paragraph_type == 'auto':
|
||||
options.paragraph_type = detect_paragraph_type(txt)
|
||||
@ -99,16 +88,30 @@ class TXTInput(InputFormatPlugin):
|
||||
options.paragraph_type = 'block'
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
dehyphenate = False
|
||||
if options.formatting_type in ('auto', 'heuristic'):
|
||||
# Set this here because we want it to run over all
|
||||
# formatting types if auto is used.
|
||||
dehyphenate = True
|
||||
|
||||
# Detect formatting
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
log.debug('Auto detected formatting as %s' % options.formatting_type)
|
||||
|
||||
if options.formatting_type == 'heuristic':
|
||||
setattr(options, 'enable_heuristics', True)
|
||||
setattr(options, 'unwrap_lines', False)
|
||||
|
||||
if options.txt_in_remove_indents:
|
||||
txt = remove_indents(txt)
|
||||
|
||||
# Preserve spaces will replace multiple spaces to a space
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
# Get length for hyphen removal and punctuation unwrap
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
|
||||
# Reformat paragraphs to block formatting based on the detected type.
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print are transformed to block for processing.
|
||||
@ -119,9 +122,17 @@ class TXTInput(InputFormatPlugin):
|
||||
elif options.paragraph_type == 'unformatted':
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
# unwrap lines based on punctuation
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||
|
||||
if dehyphenate:
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
|
||||
# Process the text using the appropriate text processor.
|
||||
html = ''
|
||||
if options.formatting_type == 'markdown':
|
||||
@ -134,14 +145,8 @@ class TXTInput(InputFormatPlugin):
|
||||
elif options.formatting_type == 'textile':
|
||||
log.debug('Running text through textile conversion...')
|
||||
html = convert_textile(txt)
|
||||
|
||||
else:
|
||||
log.debug('Running text through basic conversion...')
|
||||
if options.formatting_type == 'heuristic':
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
|
||||
flow_size = getattr(options, 'flow_size', 0)
|
||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||
|
||||
|
@ -24,14 +24,14 @@ def clean_txt(txt):
|
||||
# all line breaks with \n.
|
||||
txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
|
||||
|
||||
# Replace whitespace at the beginning of the list with
|
||||
txt = re.sub('(?m)(?P<space>[ ]+)', lambda mo: ' ' * mo.groups('space').count(' '), txt)
|
||||
txt = re.sub('(?m)(?P<space>[\t]+)', lambda mo: ' ' * 4 * mo.groups('space').count('\t'), txt)
|
||||
# Replace whitespace at the beginning of the line with
|
||||
txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: ' ' * mo.groups('space').count(' '), txt)
|
||||
txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: ' ' * 4 * mo.groups('space').count('\t'), txt)
|
||||
|
||||
# Condense redundant spaces
|
||||
txt = re.sub('[ ]{2,}', ' ', txt)
|
||||
|
||||
# Remove blank lines from the beginning and end of the document.
|
||||
# Remove blank space from the beginning and end of the document.
|
||||
txt = re.sub('^\s+(?=.)', '', txt)
|
||||
txt = re.sub('(?<=.)\s+$', '', txt)
|
||||
# Remove excessive line breaks.
|
||||
@ -107,6 +107,10 @@ def preserve_spaces(txt):
|
||||
txt = txt.replace('\t', ' ')
|
||||
return txt
|
||||
|
||||
def remove_indents(txt):
    '''
    Remove the whitespace found at the beginning of every line of txt.

    Only horizontal whitespace is stripped. Line-break characters are
    deliberately excluded from the match: with the multiline flag, a
    pattern of ``^\\s+`` also consumes the newlines of any blank lines that
    follow a line start, which silently merges paragraphs separated by
    blank lines.

    :param txt: The text to process.
    :return: txt with the leading indentation of each line removed and all
             line breaks (including blank lines) left intact.
    '''
    # [^\S\r\n] == "any whitespace character except CR and LF".
    return re.sub(r'(?mu)^[^\S\r\n]+', '', txt)
|
||||
|
||||
def opf_writer(path, opf_name, manifest, spine, mi):
|
||||
opf = OPFCreator(path, mi)
|
||||
opf.create_manifest(manifest)
|
||||
|
@ -55,6 +55,7 @@ class TXTMLizer(object):
|
||||
self.log.info('Converting XHTML to TXT...')
|
||||
self.oeb_book = oeb_book
|
||||
self.opts = opts
|
||||
self.toc_titles = []
|
||||
self.toc_ids = []
|
||||
self.last_was_heading = False
|
||||
|
||||
@ -94,8 +95,8 @@ class TXTMLizer(object):
|
||||
if getattr(self.opts, 'inline_toc', None):
|
||||
self.log.debug('Generating table of contents...')
|
||||
toc.append(u'%s\n\n' % _(u'Table of Contents:'))
|
||||
for item in self.oeb_book.toc:
|
||||
toc.append(u'* %s\n\n' % item.title)
|
||||
for item in self.toc_titles:
|
||||
toc.append(u'* %s\n\n' % item)
|
||||
return ''.join(toc)
|
||||
|
||||
def create_flat_toc(self, nodes):
|
||||
@ -103,6 +104,7 @@ class TXTMLizer(object):
|
||||
Turns a hierarchical list of TOC href's into a flat list.
|
||||
'''
|
||||
for item in nodes:
|
||||
self.toc_titles.append(item.title)
|
||||
self.toc_ids.append(item.href)
|
||||
self.create_flat_toc(item.nodes)
|
||||
|
||||
|
@ -16,7 +16,8 @@ class PluginWidget(Widget, Ui_Form):
|
||||
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent,
|
||||
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
||||
['paragraph_type', 'formatting_type', 'markdown_disable_toc',
|
||||
'preserve_spaces', 'txt_in_remove_indents'])
|
||||
self.db, self.book_id = db, book_id
|
||||
for x in get_option('paragraph_type').option.choices:
|
||||
self.opt_paragraph_type.addItem(x)
|
||||
|
@ -7,57 +7,95 @@
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>518</width>
|
||||
<height>300</height>
|
||||
<height>353</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="windowTitle">
|
||||
<string>Form</string>
|
||||
</property>
|
||||
<layout class="QGridLayout" name="gridLayout">
|
||||
<item row="0" column="0">
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="text">
|
||||
<string>Paragraph style:</string>
|
||||
<layout class="QVBoxLayout" name="verticalLayout_3">
|
||||
<item>
|
||||
<widget class="QGroupBox" name="groupBox_3">
|
||||
<property name="title">
|
||||
<string>Structure</string>
|
||||
</property>
|
||||
<layout class="QGridLayout" name="gridLayout">
|
||||
<item row="0" column="0">
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Paragraph style:</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="1">
|
||||
<widget class="QComboBox" name="opt_paragraph_type">
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy hsizetype="Expanding" vsizetype="Fixed">
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<widget class="QLabel" name="label_3">
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Formatting style:</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="1">
|
||||
<widget class="QComboBox" name="opt_formatting_type">
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy hsizetype="Expanding" vsizetype="Fixed">
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="1">
|
||||
<widget class="QComboBox" name="opt_paragraph_type"/>
|
||||
</item>
|
||||
<item row="5" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_preserve_spaces">
|
||||
<property name="text">
|
||||
<string>Preserve &spaces</string>
|
||||
<item>
|
||||
<widget class="QGroupBox" name="groupBox_2">
|
||||
<property name="title">
|
||||
<string>Common</string>
|
||||
</property>
|
||||
<layout class="QVBoxLayout" name="verticalLayout_2">
|
||||
<item>
|
||||
<widget class="QCheckBox" name="opt_preserve_spaces">
|
||||
<property name="text">
|
||||
<string>Preserve &spaces</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QCheckBox" name="opt_txt_in_remove_indents">
|
||||
<property name="text">
|
||||
<string>Remove indents at the beginning of lines</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="0" colspan="2">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
</property>
|
||||
<property name="sizeHint" stdset="0">
|
||||
<size>
|
||||
<width>20</width>
|
||||
<height>213</height>
|
||||
</size>
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="1" column="1">
|
||||
<widget class="QComboBox" name="opt_formatting_type"/>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<widget class="QLabel" name="label_3">
|
||||
<property name="text">
|
||||
<string>Formatting style:</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0" rowspan="2" colspan="2">
|
||||
<item>
|
||||
<widget class="QGroupBox" name="groupBox">
|
||||
<property name="title">
|
||||
<string>Markdown Options</string>
|
||||
<string>Markdown</string>
|
||||
</property>
|
||||
<layout class="QVBoxLayout" name="verticalLayout">
|
||||
<item>
|
||||
@ -83,6 +121,19 @@
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
</property>
|
||||
<property name="sizeHint" stdset="0">
|
||||
<size>
|
||||
<width>20</width>
|
||||
<height>213</height>
|
||||
</size>
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<resources/>
|
||||
|
Loading…
x
Reference in New Issue
Block a user