mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
normalized line endings to simplify line length and dehyphenation, fixes print formatted output for certain line endings
This commit is contained in:
parent
f3a9f3f83f
commit
696d925232
@ -72,8 +72,8 @@ class DocAnalysis(object):
|
|||||||
|
|
||||||
def __init__(self, format='html', raw=''):
|
def __init__(self, format='html', raw=''):
|
||||||
raw = raw.replace(' ', ' ')
|
raw = raw.replace(' ', ' ')
|
||||||
raw = raw.replace('\r\n', '\n')
|
#raw = raw.replace('\r\n', '\n')
|
||||||
raw = raw.replace('\r', '\n')
|
#raw = raw.replace('\r', '\n')
|
||||||
if format == 'html':
|
if format == 'html':
|
||||||
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
||||||
elif format == 'pdf':
|
elif format == 'pdf':
|
||||||
@ -214,10 +214,10 @@ class Dehyphenator(object):
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||||
#print "returned dehyphenated word: " + str(dehyphenated)
|
print "returned dehyphenated word: " + str(dehyphenated)
|
||||||
return dehyphenated
|
return dehyphenated
|
||||||
else:
|
else:
|
||||||
#print " returned hyphenated word: " + str(hyphenated)
|
print " returned hyphenated word: " + str(hyphenated)
|
||||||
return hyphenated
|
return hyphenated
|
||||||
|
|
||||||
def __call__(self, html, format, length=1):
|
def __call__(self, html, format, length=1):
|
||||||
@ -228,7 +228,7 @@ class Dehyphenator(object):
|
|||||||
elif format == 'pdf':
|
elif format == 'pdf':
|
||||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
||||||
elif format == 'txt':
|
elif format == 'txt':
|
||||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
||||||
elif format == 'individual_words':
|
elif format == 'individual_words':
|
||||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
||||||
elif format == 'individual_words_txt':
|
elif format == 'individual_words_txt':
|
||||||
|
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
|
|||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
convert_heuristic
|
convert_heuristic, normalize_line_endings
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin):
|
|||||||
else:
|
else:
|
||||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||||
|
|
||||||
|
# Normalize line endings
|
||||||
|
txt = normalize_line_endings(txt)
|
||||||
|
|
||||||
# Get length for hyphen removal and punctuation unwrap
|
# Get length for hyphen removal and punctuation unwrap
|
||||||
docanalysis = DocAnalysis('txt', txt)
|
docanalysis = DocAnalysis('txt', txt)
|
||||||
length = docanalysis.line_length(.5)
|
length = docanalysis.line_length(.5)
|
||||||
|
print "length is "+str(length)
|
||||||
|
|
||||||
# Dehyphenate
|
# Dehyphenate
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator()
|
||||||
html = dehyphenator(txt,'txt', length)
|
txt = dehyphenator(txt,'txt', length)
|
||||||
|
|
||||||
# We don't check for block because the processor assumes block.
|
# We don't check for block because the processor assumes block.
|
||||||
# single and print at transformed to block for processing.
|
# single and print at transformed to block for processing.
|
||||||
|
@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
|
|||||||
safe_mode=False)
|
safe_mode=False)
|
||||||
return HTML_TEMPLATE % (title, md.convert(txt))
|
return HTML_TEMPLATE % (title, md.convert(txt))
|
||||||
|
|
||||||
def separate_paragraphs_single_line(txt):
|
def normalize_line_endings(txt):
|
||||||
txt = txt.replace('\r\n', '\n')
|
txt = txt.replace('\r\n', '\n')
|
||||||
txt = txt.replace('\r', '\n')
|
txt = txt.replace('\r', '\n')
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def separate_paragraphs_single_line(txt):
|
||||||
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
|
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user