This commit is contained in:
Kovid Goyal 2011-02-06 08:57:54 -07:00
commit d426c0f045
2 changed files with 60 additions and 12 deletions

View File

@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile, remove_indents
normalize_line_endings, convert_textile, remove_indents, block_to_single_line
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@ -99,14 +99,6 @@ class TXTInput(InputFormatPlugin):
setattr(options, 'enable_heuristics', True)
setattr(options, 'unwrap_lines', False)
if options.txt_in_remove_indents:
txt = remove_indents(txt)
# Preserve spaces will replace multiple spaces with a space
# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
# Reformat paragraphs to block formatting based on the detected type.
# We don't check for block because the processor assumes block.
# single and print are transformed to block for processing.
@ -114,6 +106,7 @@ class TXTInput(InputFormatPlugin):
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
txt = block_to_single_line(txt)
elif options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import HeuristicProcessor
# unwrap lines based on punctuation
@ -122,6 +115,8 @@ class TXTInput(InputFormatPlugin):
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
txt = separate_paragraphs_single_line(txt)
else:
txt = block_to_single_line(txt)
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
docanalysis = DocAnalysis('txt', txt)
@ -130,6 +125,15 @@ class TXTInput(InputFormatPlugin):
dehyphenator = Dehyphenator(options.verbose, log=self.log)
txt = dehyphenator(txt,'txt', length)
# User requested transformation on the text.
if options.txt_in_remove_indents:
txt = remove_indents(txt)
# Preserve spaces will replace multiple spaces with a space
# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
# Process the text using the appropriate text processor.
html = ''
if options.formatting_type == 'markdown':

View File

@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
def clean_txt(txt):
'''
Run transformations on the text to put it into
consistent state.
'''
if isbytestring(txt):
txt = txt.decode('utf-8', 'replace')
# Strip whitespace from the end of the line. Also replace
@ -42,6 +46,15 @@ def clean_txt(txt):
return txt
def split_txt(txt, epub_split_size_kb=0):
'''
Ensure there are split points for converting
to EPUB. A misdetected paragraph type can
result in the entire document being one giant
paragraph. In this case the EPUB parser will not
be able to determine where to split the file
to accommodate the EPUB file size limitation
and will fail.
'''
#Takes care if there is no point to split
if epub_split_size_kb > 0:
if isinstance(txt, unicode):
@ -59,6 +72,12 @@ def split_txt(txt, epub_split_size_kb=0):
return txt
def convert_basic(txt, title='', epub_split_size_kb=0):
'''
Converts plain text to html by putting all paragraphs in
<p> tags. It condenses and retains blank lines when necessary.
Requires paragraphs to be in single line format.
'''
txt = clean_txt(txt)
txt = split_txt(txt, epub_split_size_kb)
@ -99,15 +118,25 @@ def separate_paragraphs_single_line(txt):
return txt
def separate_paragraphs_print_formatted(txt):
    '''
    Insert a blank line before every indented line.

    A line counts as indented when it starts with one or more tabs or
    with two or more spaces and has at least one further character on
    it. The indentation itself is kept exactly as written; only a
    newline is added in front of it, so that each indented line starts
    a new block-style paragraph.
    '''
    # Only the indent-preserving substitution is applied. Running the
    # older tab-rewriting substitution first as well would replace the
    # indent with a tab and insert the separator twice.
    txt = re.sub(r'(?mu)^(?P<indent>\t+|[ ]{2,})(?=.)', r'\n\g<indent>', txt)
    return txt
def block_to_single_line(txt):
    '''
    Join the lines of each block paragraph into a single line.

    A newline that has a non-newline character on both sides is
    replaced with a space. Newlines next to another newline, or at the
    very start or end of the text, are left untouched, so blank-line
    paragraph separators are kept.
    '''
    lone_newline = re.compile(r'(?<=.)\n(?=.)')
    return lone_newline.sub(' ', txt)
def preserve_spaces(txt):
    '''
    Replace runs of multiple spaces, and tabs, with &nbsp; entities.

    Every run of two or more spaces becomes one ordinary space followed
    by one &nbsp; entity for each remaining space in the run. Every tab
    becomes four &nbsp; entities. Single spaces are left unchanged.
    '''
    def _pad(match):
        # The first space stays literal; the rest become entities.
        extra = len(match.group('space')) - 1
        return ' ' + ('&nbsp;' * extra)

    spaced = re.sub('(?P<space>[ ]{2,})', _pad, txt)
    return spaced.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
def remove_indents(txt):
    '''
    Remove whitespace at the beginning of each line.

    Only whitespace other than the newline character is stripped.
    Blank lines are therefore kept, so paragraphs separated by an empty
    line stay separated after the indents are removed.
    '''
    # [^\S\n] means "any whitespace character except a newline". A bare
    # \s would also match the line breaks themselves under MULTILINE,
    # deleting blank lines and merging neighbouring paragraphs.
    txt = re.sub(r'(?mu)^[^\S\n]+', '', txt)
    return txt
@ -118,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi):
with open(os.path.join(path, opf_name), 'wb') as opffile:
opf.render(opffile)
def split_string_separator(txt, size) :
def split_string_separator(txt, size):
'''
Splits the text by putting \n\n at the point size.
'''
if len(txt) > size:
txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
txt[i:i+size], 1) for i in
@ -127,7 +159,7 @@ def split_string_separator(txt, size) :
def detect_paragraph_type(txt):
'''
Tries to determine the formatting of the document.
Tries to determine the paragraph type of the document.
block: Paragraphs are separated by a blank line.
single: Each line is a paragraph.
@ -170,6 +202,16 @@ def detect_paragraph_type(txt):
def detect_formatting_type(txt):
'''
Tries to determine the formatting of the document.
markdown: Markdown formatting is used.
textile: Textile formatting is used.
heuristic: When none of the above formatting types are
detected heuristic is returned.
'''
# Keep a count of the number of format-specific objects
# that are found in the text.
markdown_count = 0
textile_count = 0
@ -193,6 +235,8 @@ def detect_formatting_type(txt):
# Links
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
# Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found.
if markdown_count > 5 or textile_count > 5:
if markdown_count > textile_count:
return 'markdown'