mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
normalized line endings to simplify line length and dehyphenation, fixes print formatted output for certain line endings
This commit is contained in:
parent
f3a9f3f83f
commit
696d925232
@ -72,8 +72,8 @@ class DocAnalysis(object):
|
||||
|
||||
def __init__(self, format='html', raw=''):
|
||||
raw = raw.replace(' ', ' ')
|
||||
raw = raw.replace('\r\n', '\n')
|
||||
raw = raw.replace('\r', '\n')
|
||||
#raw = raw.replace('\r\n', '\n')
|
||||
#raw = raw.replace('\r', '\n')
|
||||
if format == 'html':
|
||||
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
||||
elif format == 'pdf':
|
||||
@ -214,10 +214,10 @@ class Dehyphenator(object):
|
||||
|
||||
else:
|
||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||
#print "returned dehyphenated word: " + str(dehyphenated)
|
||||
print "returned dehyphenated word: " + str(dehyphenated)
|
||||
return dehyphenated
|
||||
else:
|
||||
#print " returned hyphenated word: " + str(hyphenated)
|
||||
print " returned hyphenated word: " + str(hyphenated)
|
||||
return hyphenated
|
||||
|
||||
def __call__(self, html, format, length=1):
|
||||
@ -228,7 +228,7 @@ class Dehyphenator(object):
|
||||
elif format == 'pdf':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
||||
elif format == 'txt':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
||||
elif format == 'individual_words':
|
||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
||||
elif format == 'individual_words_txt':
|
||||
|
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
|
||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||
convert_heuristic
|
||||
convert_heuristic, normalize_line_endings
|
||||
from calibre import _ent_pat, xml_entity_to_unicode
|
||||
|
||||
class TXTInput(InputFormatPlugin):
|
||||
@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin):
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
# Normalize line endings
|
||||
txt = normalize_line_endings(txt)
|
||||
|
||||
# Get length for hyphen removal and punctuation unwrap
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
print "length is "+str(length)
|
||||
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator()
|
||||
html = dehyphenator(txt,'txt', length)
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print at transformed to block for processing.
|
||||
|
@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
|
||||
safe_mode=False)
|
||||
return HTML_TEMPLATE % (title, md.convert(txt))
|
||||
|
||||
def separate_paragraphs_single_line(txt):
|
||||
def normalize_line_endings(txt):
|
||||
txt = txt.replace('\r\n', '\n')
|
||||
txt = txt.replace('\r', '\n')
|
||||
return txt
|
||||
|
||||
def separate_paragraphs_single_line(txt):
|
||||
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
|
||||
return txt
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user