Normalized line endings to simplify line length and dehyphenation; fixes print formatted output for certain line endings

This commit is contained in:
ldolse 2011-01-09 18:14:49 +08:00
parent f3a9f3f83f
commit 696d925232
3 changed files with 15 additions and 8 deletions

View File

@ -72,8 +72,8 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''): def __init__(self, format='html', raw=''):
raw = raw.replace(' ', ' ') raw = raw.replace(' ', ' ')
raw = raw.replace('\r\n', '\n') #raw = raw.replace('\r\n', '\n')
raw = raw.replace('\r', '\n') #raw = raw.replace('\r', '\n')
if format == 'html': if format == 'html':
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL) linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
elif format == 'pdf': elif format == 'pdf':
@ -214,10 +214,10 @@ class Dehyphenator(object):
else: else:
if self.html.find(lookupword) != -1 or searchresult != -1: if self.html.find(lookupword) != -1 or searchresult != -1:
#print "returned dehyphenated word: " + str(dehyphenated) print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated return dehyphenated
else: else:
#print " returned hyphenated word: " + str(hyphenated) print " returned hyphenated word: " + str(hyphenated)
return hyphenated return hyphenated
def __call__(self, html, format, length=1): def __call__(self, html, format, length=1):
@ -228,7 +228,7 @@ class Dehyphenator(object):
elif format == 'pdf': elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length) intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
elif format == 'txt': elif format == 'txt':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length) intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words': elif format == 'individual_words':
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'individual_words_txt': elif format == 'individual_words_txt':

View File

@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
convert_heuristic convert_heuristic, normalize_line_endings
from calibre import _ent_pat, xml_entity_to_unicode from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin):
else: else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type) log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# Normalize line endings
txt = normalize_line_endings(txt)
# Get length for hyphen removal and punctuation unwrap # Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt) docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5) length = docanalysis.line_length(.5)
print "length is "+str(length)
# Dehyphenate # Dehyphenate
dehyphenator = Dehyphenator() dehyphenator = Dehyphenator()
html = dehyphenator(txt,'txt', length) txt = dehyphenator(txt,'txt', length)
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.
# single and print are transformed to block for processing. # single and print are transformed to block for processing.

View File

@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
safe_mode=False) safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt)) return HTML_TEMPLATE % (title, md.convert(txt))
def separate_paragraphs_single_line(txt): def normalize_line_endings(txt):
txt = txt.replace('\r\n', '\n') txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n') txt = txt.replace('\r', '\n')
return txt
def separate_paragraphs_single_line(txt):
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt) txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
return txt return txt