Normalize line endings up front to simplify line-length analysis and dehyphenation; this also fixes print-formatted output for input with certain line endings

This commit is contained in:
ldolse 2011-01-09 18:14:49 +08:00
parent f3a9f3f83f
commit 696d925232
3 changed files with 15 additions and 8 deletions

View File

@ -72,8 +72,8 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''):
raw = raw.replace(' ', ' ')
raw = raw.replace('\r\n', '\n')
raw = raw.replace('\r', '\n')
#raw = raw.replace('\r\n', '\n')
#raw = raw.replace('\r', '\n')
if format == 'html':
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
elif format == 'pdf':
@ -214,10 +214,10 @@ class Dehyphenator(object):
else:
if self.html.find(lookupword) != -1 or searchresult != -1:
#print "returned dehyphenated word: " + str(dehyphenated)
print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
else:
#print " returned hyphenated word: " + str(hyphenated)
print " returned hyphenated word: " + str(hyphenated)
return hyphenated
def __call__(self, html, format, length=1):
@ -228,7 +228,7 @@ class Dehyphenator(object):
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
elif format == 'txt':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words':
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'individual_words_txt':

View File

@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
convert_heuristic
convert_heuristic, normalize_line_endings
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin):
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# Normalize line endings
txt = normalize_line_endings(txt)
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
print "length is "+str(length)
# Dehyphenate
dehyphenator = Dehyphenator()
html = dehyphenator(txt,'txt', length)
txt = dehyphenator(txt,'txt', length)
# We don't check for block because the processor assumes block.
# single and print are transformed to block for processing.

View File

@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt))
def separate_paragraphs_single_line(txt):
def normalize_line_endings(txt):
    """Return *txt* with every CRLF and bare CR replaced by a single LF."""
    # '\r\n' has to be handled before '\r'; otherwise each CRLF pair would
    # turn into two newlines instead of one.
    for ending in ('\r\n', '\r'):
        txt = txt.replace(ending, '\n')
    return txt
def separate_paragraphs_single_line(txt):
    """Double each newline that sits directly between two non-newline characters.

    A newline that is already adjacent to another newline, or that is at the
    very start or end of *txt*, is left untouched because the lookbehind and
    lookahead both require a character that ``.`` can match.
    """
    return re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)