From 9bbff15c27c2be0b6101f17ddaa7f53a504824ea Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 02:12:09 +0800 Subject: [PATCH 1/9] text processing tweaks --- src/calibre/ebooks/conversion/utils.py | 4 ++-- src/calibre/ebooks/txt/input.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 27dacdf5fb..52d1bcc619 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -190,7 +190,7 @@ class PreProcessor(object): line_ending = "\s*\s*()?" blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" - txt_line_wrap = u"(\u0020|\u0009)*\n" + txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}" unwrap_regex = lookahead+line_ending+blanklines+line_opening if format == 'txt': @@ -357,6 +357,6 @@ class PreProcessor(object): html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) # Center separator lines - html = re.sub(u'

\s*(?P([*#•]+\s*)+)\s*

', '

' + '\g' + '

', html) + html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) return html diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 98756c5fa1..eac46385a7 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin): # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. - if options.paragraph_type == 'single' or 'unformatted': + if options.paragraph_type in ('single', 'unformatted'): txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) From e9130241603a99f7e8dddfb8ff7df6edf4faacb5 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 10:40:30 +0800 Subject: [PATCH 2/9] ... --- src/calibre/ebooks/txt/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e2405de617..34a702cc55 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -23,7 +23,7 @@ class TXTInput(InputFormatPlugin): options = set([ OptionRecommendation(name='paragraph_type', recommended_value='auto', - choices=['auto', 'block', 'single', 'print'], + choices=['auto', 'block', 'single', 'print', 'unformatted'], help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n' '* auto: Try to auto detect paragraph type.\n' From 289cdf33925dc4f80c08889e941becc9c3862471 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 10:43:24 +0800 Subject: [PATCH 3/9] changed unformatted description --- src/calibre/ebooks/txt/input.py | 2 +- src/calibre/ebooks/txt/processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 34a702cc55..9bc9323a4c 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -31,7 +31,7 @@ class TXTInput(InputFormatPlugin): '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' 'starts a paragraph.' - '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')), + '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')), OptionRecommendation(name='formatting_type', recommended_value='auto', choices=['auto', 'none', 'heuristic', 'markdown'], help=_('Formatting used within the document.' diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 9dc29e45dd..e26f0a9d07 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -117,7 +117,7 @@ def detect_paragraph_type(txt): single: Each line is a paragraph. print: Each paragraph starts with a 2+ spaces or a tab and ends when a new paragraph is reached. - unformatted: most lines have hard line breaks, few/no spaces or indents + unformatted: most lines have hard line breaks, few/no blank lines or indents returns block, single, print, unformatted ''' From f3a9f3f83f7da4821bdc1fca2ba0df66aca714e1 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 17:27:24 +0800 Subject: [PATCH 4/9] added dehyphenation to txt input --- src/calibre/ebooks/conversion/preprocess.py | 15 +++++++++++---- src/calibre/ebooks/txt/input.py | 18 ++++++++++++------ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index ae111355e4..df9fd66407 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -72,6 +72,8 @@ class DocAnalysis(object): def __init__(self, format='html', raw=''): raw = raw.replace(' ', ' ') + raw = raw.replace('\r\n', '\n') + raw = raw.replace('\r', '\n') if format == 'html': linere = re.compile('(?<=]*>\s*

).*?(?=

)', re.DOTALL) elif format == 'pdf': @@ -79,7 +81,7 @@ class DocAnalysis(object): elif format == 'spanned_html': linere = re.compile('(?<=)', re.DOTALL) elif format == 'txt': - linere = re.compile('.*?\n', re.DOTALL) + linere = re.compile('.*?\n') self.lines = linere.findall(raw) def line_length(self, percent): @@ -177,7 +179,7 @@ class Dehyphenator(object): def __init__(self): # Add common suffixes to the regex below to increase the likelihood of a match - # don't add suffixes which are also complete words, such as 'able' or 'sex' - self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) + self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE) # remove prefixes if the prefix was not already the point of hyphenation self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE) self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE) @@ -194,7 +196,7 @@ class Dehyphenator(object): lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) - #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) + print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) try: searchresult = self.html.find(lookupword.lower()) except: @@ -225,8 +227,13 @@ class Dehyphenator(object): intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) elif format == 'pdf': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) + elif format == 'txt': + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet + elif format == 'individual_words_txt': + intextmatch = re.compile(u'\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b') + elif format == 'html_cleanup': intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 9bc9323a4c..f6adb617c3 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ @@ -91,8 +92,16 @@ class TXTInput(InputFormatPlugin): log.debug('Could not reliably determine paragraph type using block') options.paragraph_type = 'block' else: - log.debug('Auto detected paragraph type as %s' % options.paragraph_type) - + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + # Get length for hyphen removal and punctuation unwrap + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + + # Dehyphenate + dehyphenator = Dehyphenator() + html = dehyphenator(txt,'txt', length) + # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. @@ -103,10 +112,8 @@ class TXTInput(InputFormatPlugin): if options.paragraph_type == 'unformatted': from calibre.ebooks.conversion.utils import PreProcessor - from calibre.ebooks.conversion.preprocess import DocAnalysis # get length - docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) + # unwrap lines based on punctuation preprocessor = PreProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') @@ -117,7 +124,6 @@ class TXTInput(InputFormatPlugin): html = convert_heuristic(txt, epub_split_size_kb=flow_size) else: html = convert_basic(txt, epub_split_size_kb=flow_size) - from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') From 696d9252324a5fa31ae91f8a3c5d472b5d5d953c Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 18:14:49 +0800 Subject: [PATCH 5/9] normalized line endings to simplify line length and dehyphenation, fixes print formatted output for certain line endings --- src/calibre/ebooks/conversion/preprocess.py | 10 +++++----- src/calibre/ebooks/txt/input.py | 8 ++++++-- src/calibre/ebooks/txt/processor.py | 5 ++++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index df9fd66407..d9d735e391 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -72,8 +72,8 @@ class DocAnalysis(object): def __init__(self, format='html', raw=''): raw = raw.replace(' ', ' ') - raw = raw.replace('\r\n', '\n') - raw = raw.replace('\r', '\n') + #raw = raw.replace('\r\n', '\n') + #raw = raw.replace('\r', '\n') if format == 'html': linere = re.compile('(?<=]*>\s*

).*?(?=

)', re.DOTALL) elif format == 'pdf': @@ -214,10 +214,10 @@ class Dehyphenator(object): else: if self.html.find(lookupword) != -1 or searchresult != -1: - #print "returned dehyphenated word: " + str(dehyphenated) + print "returned dehyphenated word: " + str(dehyphenated) return dehyphenated else: - #print " returned hyphenated word: " + str(hyphenated) + print " returned hyphenated word: " + str(hyphenated) return hyphenated def __call__(self, html, format, length=1): @@ -228,7 +228,7 @@ class Dehyphenator(object): elif format == 'pdf': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) elif format == 'txt': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet elif format == 'individual_words_txt': diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index f6adb617c3..2e35e8e345 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - convert_heuristic + convert_heuristic, normalize_line_endings from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin): else: log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + # Normalize line endings + txt = normalize_line_endings(txt) + # Get length for hyphen removal and punctuation unwrap docanalysis = DocAnalysis('txt', txt) length = docanalysis.line_length(.5) + print "length is "+str(length) # Dehyphenate dehyphenator = Dehyphenator() - html = dehyphenator(txt,'txt', length) + txt = dehyphenator(txt,'txt', length) # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index e26f0a9d07..ebdadebda2 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False): safe_mode=False) return HTML_TEMPLATE % (title, md.convert(txt)) -def separate_paragraphs_single_line(txt): +def normalize_line_endings(txt): txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') + return txt + +def separate_paragraphs_single_line(txt): txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt) return txt From 0f109d699f06967394370150a0a35bf671a283c6 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 18:38:52 +0800 Subject: [PATCH 6/9] tweaked the auto-detection to handle cases where the vast majority of the lines are formatted as block or print --- src/calibre/ebooks/txt/processor.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index ebdadebda2..6a1a106681 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -133,15 +133,21 @@ def detect_paragraph_type(txt): hardbreaks = docanalysis.line_histogram(.55) if hardbreaks: - # Check for print + # Determine print percentage tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .15: - return 'print' - - # Check for block + print_percent = tab_line_count / float(txt_line_count) + + # Determine block percentage empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .15: - return 'block' + block_percent = empty_line_count / float(txt_line_count) + + # Compare the two types - the type with the larger number of instances wins + # in cases where only one or the other represents the vast majority of the document neither wins + if print_percent >= block_percent: + if .15 <= print_percent <= .75: + return 'print' + elif .15 <= block_percent <= .75: + return 'block' # Assume unformatted text with hardbreaks if nothing else matches return 'unformatted' From c2cef786ce19b25cbdfc79c345d4cffa38885248 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 19:34:02 +0800 Subject: [PATCH 7/9] added partial dehyphenation for markdown --- src/calibre/ebooks/conversion/preprocess.py | 16 +++++++-------- src/calibre/ebooks/txt/input.py | 22 +++++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index d9d735e391..e2c51846a4 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -201,15 +201,15 @@ class Dehyphenator(object): searchresult = self.html.find(lookupword.lower()) except: return hyphenated - if self.format == 'html_cleanup': + if self.format == 'html_cleanup' or self.format == 'txt_cleanup': if self.html.find(lookupword) != -1 or searchresult != -1: - #print "Cleanup:returned dehyphenated word: " + str(dehyphenated) + print "Cleanup:returned dehyphenated word: " + str(dehyphenated) return dehyphenated elif self.html.find(hyphenated) != -1: - #print "Cleanup:returned hyphenated word: " + str(hyphenated) + print "Cleanup:returned hyphenated word: " + str(hyphenated) return hyphenated else: - #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) + print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) return firsthalf+u'\u2014'+wraptags+secondhalf else: @@ -230,12 +230,12 @@ class Dehyphenator(object): elif format == 'txt': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet - elif format == 'individual_words_txt': - intextmatch = re.compile(u'\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b') - + intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b[^<]*<') # for later, not called anywhere yet elif format == 'html_cleanup': intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') + elif format == 'txt_cleanup': + intextmatch = re.compile(u'(?P\w+)(-|‐)(?P\s+)(?P[\w\d]+)') + html = intextmatch.sub(self.dehyphenate, html) return html diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 2e35e8e345..5fbdc7131a 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -73,6 +73,14 @@ class TXTInput(InputFormatPlugin): # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) + + # Normalize line endings + txt = normalize_line_endings(txt) + + # Get length for hyphen removal and punctuation unwrap + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + print "length is "+str(length) if options.formatting_type == 'auto': options.formatting_type = detect_formatting_type(txt) @@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin): else: log.debug('Auto detected paragraph type as %s' % options.paragraph_type) - # Normalize line endings - txt = normalize_line_endings(txt) - - # Get length for hyphen removal and punctuation unwrap - docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) - print "length is "+str(length) - # Dehyphenate dehyphenator = Dehyphenator() txt = dehyphenator(txt,'txt', length) @@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin): else: html = convert_basic(txt, epub_split_size_kb=flow_size) + # Dehyphenate in cleanup mode for missed txt and markdown conversion + print "going through final dehyphenation" + dehyphenator = Dehyphenator() + html = dehyphenator(html,'txt_cleanup', length) + html = dehyphenator(html,'html_cleanup', length) + from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: From 9751f99db95185a9a6cdf66029f1d46e4a9d90d8 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 19:57:15 +0800 Subject: [PATCH 8/9] cleaned up print statements --- src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------ src/calibre/ebooks/txt/input.py | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index e2c51846a4..32eee713fe 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -196,28 +196,28 @@ class Dehyphenator(object): lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) - print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) + #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) try: searchresult = self.html.find(lookupword.lower()) except: return hyphenated if self.format == 'html_cleanup' or self.format == 'txt_cleanup': if self.html.find(lookupword) != -1 or searchresult != -1: - print "Cleanup:returned dehyphenated word: " + str(dehyphenated) + #print "Cleanup:returned dehyphenated word: " + str(dehyphenated) return dehyphenated elif self.html.find(hyphenated) != -1: - print "Cleanup:returned hyphenated word: " + str(hyphenated) + #print "Cleanup:returned hyphenated word: " + str(hyphenated) return hyphenated else: - print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) + #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) return firsthalf+u'\u2014'+wraptags+secondhalf else: if self.html.find(lookupword) != -1 or searchresult != -1: - print "returned dehyphenated word: " + str(dehyphenated) + #print "returned dehyphenated word: " + str(dehyphenated) return dehyphenated else: - print " returned hyphenated word: " + str(hyphenated) + #print " returned hyphenated word: " + str(hyphenated) return hyphenated def __call__(self, html, format, length=1): diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5fbdc7131a..3957391494 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -80,7 +80,6 @@ class TXTInput(InputFormatPlugin): # Get length for hyphen removal and punctuation unwrap docanalysis = DocAnalysis('txt', txt) length = docanalysis.line_length(.5) - print "length is "+str(length) if options.formatting_type == 'auto': options.formatting_type = detect_formatting_type(txt) @@ -130,7 +129,6 @@ class TXTInput(InputFormatPlugin): html = convert_basic(txt, epub_split_size_kb=flow_size) # Dehyphenate in cleanup mode for missed txt and markdown conversion - print "going through final dehyphenation" dehyphenator = Dehyphenator() html = dehyphenator(html,'txt_cleanup', length) html = dehyphenator(html,'html_cleanup', length) From 7008e9b64cbe98ca43e77965a84a3f5af4e88f6d Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 21:56:12 +0800 Subject: [PATCH 9/9] ... --- src/calibre/ebooks/conversion/preprocess.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 32eee713fe..08a46cb8d9 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -72,8 +72,6 @@ class DocAnalysis(object): def __init__(self, format='html', raw=''): raw = raw.replace(' ', ' ') - #raw = raw.replace('\r\n', '\n') - #raw = raw.replace('\r', '\n') if format == 'html': linere = re.compile('(?<=]*>\s*

).*?(?=

)', re.DOTALL) elif format == 'pdf':