From b73e1b3da50810e151d10a5d62251754a077e605 Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 14 Sep 2010 02:56:56 +1000 Subject: [PATCH] tweaked preprocess for $, added rtf to new preprocess logic, changed last pdf default --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/rtf/input.py | 13 +++---------- src/calibre/gui2/convert/pdf_input.ui | 2 +- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f6277956c8..9464be1210 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -340,7 +340,7 @@ class HTMLPreProcessor(object): # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 216ccf591d..d229b80c16 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -8,6 +8,7 @@ from lxml import etree from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class InlineClass(etree.XSLTExtension): @@ -229,16 +230,8 @@ class RTFInput(InputFormatPlugin): res = transform.tostring(result) res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] if self.options.preprocess_html: - self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r']*>\s*]*>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((<(/i|b)>)?)?)\s*\s*

', re.IGNORECASE) - res = chapdetect.sub('

'+'\g'+'

\n', res) - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', res, 0.4) - self.log("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*

\s*(?P]*>\s*(]*>\s*\s*)

\s*){0,3}\s*]*>\s*]*>\s*" % length, re.UNICODE) - if length < 150: - res = unwrap.sub(' ', res) + preprocessor = PreProcessor(res) + res = preprocessor(res) f.write(res) self.write_inline_css(inline_class) stream.seek(0) diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui index 626c68ea63..b2ee421922 100644 --- a/src/calibre/gui2/convert/pdf_input.ui +++ b/src/calibre/gui2/convert/pdf_input.ui @@ -46,7 +46,7 @@ 0.010000000000000 - 0.500000000000000 + 0.450000000000000