TXT Output: Fix inline toc not showing all items. TXT Input: Restructure to run dehyphenator when auto and heuristic formatting options are used. This causes textile and markdown to be dehyphenated. Heuristics: Fix issue with invalid markup from italicize patterns. TXT Input: Add option to remove indents. TXT Input: Fix bug where spaces were not retained properly.

This commit is contained in:
Kovid Goyal 2011-02-05 12:03:56 -07:00
commit 347c276aa9
6 changed files with 144 additions and 81 deletions

View File

@ -149,17 +149,17 @@ class HeuristicProcessor(object):
] ]
ITALICIZE_STYLE_PATS = [ ITALICIZE_STYLE_PATS = [
r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])_(?P<words>[^_]+)?_',
r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])/(?P<words>[^/]+)?/',
r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])~~(?P<words>[^~]+)?~~',
r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])\*(?P<words>[^\*]+)?\*',
r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])~(?P<words>[^~]+)?~',
r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])_/(?P<words>[^/_]+)?/_',
r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])_\*(?P<words>[^\*_]+)?\*_',
r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])\*/(?P<words>[^/\*]+)?/\*',
r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])_\*/(?P<words>[^\*_]+)?/\*_',
r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])/:(?P<words>[^:/]+)?:/',
r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=[\s\.,\!\?])', r'(?msu)(?<=[\s>])\|:(?P<words>[^:\|]+)?:\|',
] ]
for word in ITALICIZE_WORDS: for word in ITALICIZE_WORDS:

View File

@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile normalize_line_endings, convert_textile, remove_indents
from calibre import _ent_pat, xml_entity_to_unicode from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -47,6 +47,9 @@ class TXTInput(InputFormatPlugin):
OptionRecommendation(name='preserve_spaces', recommended_value=False, OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. ' help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')), 'With this option all spaces will be displayed.')),
OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
help=_('Normally extra space at the beginning of lines is retained. '
'With this option they will be removed.')),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False, OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')), help=_('Do not insert a Table of Contents into the output text.')),
]) ])
@ -77,20 +80,6 @@ class TXTInput(InputFormatPlugin):
# Normalize line endings # Normalize line endings
txt = normalize_line_endings(txt) txt = normalize_line_endings(txt)
# Detect formatting
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
log.debug('Auto detected formatting as %s' % options.formatting_type)
if options.formatting_type == 'heuristic':
setattr(options, 'enable_heuristics', True)
setattr(options, 'markup_chapter_headings', True)
setattr(options, 'italicize_common_cases', True)
setattr(options, 'fix_indents', True)
setattr(options, 'delete_blank_paragraphs', True)
setattr(options, 'format_scene_breaks', True)
setattr(options, 'dehyphenate', True)
# Determine the paragraph type of the document. # Determine the paragraph type of the document.
if options.paragraph_type == 'auto': if options.paragraph_type == 'auto':
options.paragraph_type = detect_paragraph_type(txt) options.paragraph_type = detect_paragraph_type(txt)
@ -99,16 +88,30 @@ class TXTInput(InputFormatPlugin):
options.paragraph_type = 'block' options.paragraph_type = 'block'
else: else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type) log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
dehyphenate = False
if options.formatting_type in ('auto', 'heuristic'):
# Set this here because we want it to run over all
# formatting types if auto is used.
dehyphenate = True
# Detect formatting
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
log.debug('Auto detected formatting as %s' % options.formatting_type)
if options.formatting_type == 'heuristic':
setattr(options, 'enable_heuristics', True)
setattr(options, 'unwrap_lines', False)
if options.txt_in_remove_indents:
txt = remove_indents(txt)
# Preserve spaces will replace multiple spaces with a space # Preserve spaces will replace multiple spaces with a space
# followed by the &nbsp; entity. # followed by the &nbsp; entity.
if options.preserve_spaces: if options.preserve_spaces:
txt = preserve_spaces(txt) txt = preserve_spaces(txt)
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
# Reformat paragraphs to block formatting based on the detected type. # Reformat paragraphs to block formatting based on the detected type.
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.
# single and print are transformed to block for processing. # single and print are transformed to block for processing.
@ -119,9 +122,17 @@ class TXTInput(InputFormatPlugin):
elif options.paragraph_type == 'unformatted': elif options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import HeuristicProcessor from calibre.ebooks.conversion.utils import HeuristicProcessor
# unwrap lines based on punctuation # unwrap lines based on punctuation
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
if dehyphenate:
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
dehyphenator = Dehyphenator(options.verbose, log=self.log)
txt = dehyphenator(txt,'txt', length)
# Process the text using the appropriate text processor. # Process the text using the appropriate text processor.
html = '' html = ''
if options.formatting_type == 'markdown': if options.formatting_type == 'markdown':
@ -134,14 +145,8 @@ class TXTInput(InputFormatPlugin):
elif options.formatting_type == 'textile': elif options.formatting_type == 'textile':
log.debug('Running text through textile conversion...') log.debug('Running text through textile conversion...')
html = convert_textile(txt) html = convert_textile(txt)
else: else:
log.debug('Running text through basic conversion...') log.debug('Running text through basic conversion...')
if options.formatting_type == 'heuristic':
# Dehyphenate
dehyphenator = Dehyphenator(options.verbose, log=self.log)
txt = dehyphenator(txt,'txt', length)
flow_size = getattr(options, 'flow_size', 0) flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size) html = convert_basic(txt, epub_split_size_kb=flow_size)

View File

@ -24,14 +24,14 @@ def clean_txt(txt):
# all line breaks with \n. # all line breaks with \n.
txt = '\n'.join([line.rstrip() for line in txt.splitlines()]) txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
# Replace whitespace at the beginning of the list with &nbsp; # Replace whitespace at the beginning of the line with &nbsp;
txt = re.sub('(?m)(?P<space>[ ]+)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt) txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt)
txt = re.sub('(?m)(?P<space>[\t]+)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt) txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt)
# Condense redundant spaces # Condense redundant spaces
txt = re.sub('[ ]{2,}', ' ', txt) txt = re.sub('[ ]{2,}', ' ', txt)
# Remove blank lines from the beginning and end of the document. # Remove blank space from the beginning and end of the document.
txt = re.sub('^\s+(?=.)', '', txt) txt = re.sub('^\s+(?=.)', '', txt)
txt = re.sub('(?<=.)\s+$', '', txt) txt = re.sub('(?<=.)\s+$', '', txt)
# Remove excessive line breaks. # Remove excessive line breaks.
@ -107,6 +107,10 @@ def preserve_spaces(txt):
txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;') txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
return txt return txt
def remove_indents(txt):
    '''
    Remove whitespace at the beginning of every line of txt.

    Only whitespace that is not a line terminator is removed, so blank
    lines (and therefore paragraph breaks) are left in place. Lines that
    consist solely of whitespace become empty lines.

    :param txt: The text to process.
    :return: txt with the leading indentation of each line removed.
    '''
    # [^\S\r\n] is "any whitespace except CR/LF". A plain \s together with
    # the multiline flag would also match the newlines of blank lines and
    # merge separate paragraphs into one.
    return re.sub(r'(?mu)^[^\S\r\n]+', '', txt)
def opf_writer(path, opf_name, manifest, spine, mi): def opf_writer(path, opf_name, manifest, spine, mi):
opf = OPFCreator(path, mi) opf = OPFCreator(path, mi)
opf.create_manifest(manifest) opf.create_manifest(manifest)

View File

@ -55,6 +55,7 @@ class TXTMLizer(object):
self.log.info('Converting XHTML to TXT...') self.log.info('Converting XHTML to TXT...')
self.oeb_book = oeb_book self.oeb_book = oeb_book
self.opts = opts self.opts = opts
self.toc_titles = []
self.toc_ids = [] self.toc_ids = []
self.last_was_heading = False self.last_was_heading = False
@ -94,8 +95,8 @@ class TXTMLizer(object):
if getattr(self.opts, 'inline_toc', None): if getattr(self.opts, 'inline_toc', None):
self.log.debug('Generating table of contents...') self.log.debug('Generating table of contents...')
toc.append(u'%s\n\n' % _(u'Table of Contents:')) toc.append(u'%s\n\n' % _(u'Table of Contents:'))
for item in self.oeb_book.toc: for item in self.toc_titles:
toc.append(u'* %s\n\n' % item.title) toc.append(u'* %s\n\n' % item)
return ''.join(toc) return ''.join(toc)
def create_flat_toc(self, nodes): def create_flat_toc(self, nodes):
@ -103,6 +104,7 @@ class TXTMLizer(object):
Turns a hierarchical list of TOC href's into a flat list. Turns a hierarchical list of TOC href's into a flat list.
''' '''
for item in nodes: for item in nodes:
self.toc_titles.append(item.title)
self.toc_ids.append(item.href) self.toc_ids.append(item.href)
self.create_flat_toc(item.nodes) self.create_flat_toc(item.nodes)

View File

@ -16,7 +16,8 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, Widget.__init__(self, parent,
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) ['paragraph_type', 'formatting_type', 'markdown_disable_toc',
'preserve_spaces', 'txt_in_remove_indents'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in get_option('paragraph_type').option.choices: for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_type.addItem(x) self.opt_paragraph_type.addItem(x)

View File

@ -7,57 +7,95 @@
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>518</width> <width>518</width>
<height>300</height> <height>353</height>
</rect> </rect>
</property> </property>
<property name="windowTitle"> <property name="windowTitle">
<string>Form</string> <string>Form</string>
</property> </property>
<layout class="QGridLayout" name="gridLayout"> <layout class="QVBoxLayout" name="verticalLayout_3">
<item row="0" column="0"> <item>
<widget class="QLabel" name="label_2"> <widget class="QGroupBox" name="groupBox_3">
<property name="text"> <property name="title">
<string>Paragraph style:</string> <string>Structure</string>
</property> </property>
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0">
<widget class="QLabel" name="label_2">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Paragraph style:</string>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QComboBox" name="opt_paragraph_type">
<property name="sizePolicy">
<sizepolicy hsizetype="Expanding" vsizetype="Fixed">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_3">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Formatting style:</string>
</property>
</widget>
</item>
<item row="1" column="1">
<widget class="QComboBox" name="opt_formatting_type">
<property name="sizePolicy">
<sizepolicy hsizetype="Expanding" vsizetype="Fixed">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
</widget>
</item>
</layout>
</widget> </widget>
</item> </item>
<item row="0" column="1"> <item>
<widget class="QComboBox" name="opt_paragraph_type"/> <widget class="QGroupBox" name="groupBox_2">
</item> <property name="title">
<item row="5" column="0" colspan="2"> <string>Common</string>
<widget class="QCheckBox" name="opt_preserve_spaces">
<property name="text">
<string>Preserve &amp;spaces</string>
</property> </property>
<layout class="QVBoxLayout" name="verticalLayout_2">
<item>
<widget class="QCheckBox" name="opt_preserve_spaces">
<property name="text">
<string>Preserve &amp;spaces</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_txt_in_remove_indents">
<property name="text">
<string>Remove indents at the beginning of lines</string>
</property>
</widget>
</item>
</layout>
</widget> </widget>
</item> </item>
<item row="6" column="0" colspan="2"> <item>
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>213</height>
</size>
</property>
</spacer>
</item>
<item row="1" column="1">
<widget class="QComboBox" name="opt_formatting_type"/>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_3">
<property name="text">
<string>Formatting style:</string>
</property>
</widget>
</item>
<item row="2" column="0" rowspan="2" colspan="2">
<widget class="QGroupBox" name="groupBox"> <widget class="QGroupBox" name="groupBox">
<property name="title"> <property name="title">
<string>Markdown Options</string> <string>Markdown</string>
</property> </property>
<layout class="QVBoxLayout" name="verticalLayout"> <layout class="QVBoxLayout" name="verticalLayout">
<item> <item>
@ -83,6 +121,19 @@
</layout> </layout>
</widget> </widget>
</item> </item>
<item>
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>213</height>
</size>
</property>
</spacer>
</item>
</layout> </layout>
</widget> </widget>
<resources/> <resources/>