From b73e1b3da50810e151d10a5d62251754a077e605 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Tue, 14 Sep 2010 02:56:56 +1000
Subject: [PATCH] tweaked preprocess for $, added rtf to new preprocess logic,
changed last pdf default
---
src/calibre/ebooks/conversion/preprocess.py | 2 +-
src/calibre/ebooks/rtf/input.py | 13 +++----------
src/calibre/gui2/convert/pdf_input.ui | 2 +-
3 files changed, 5 insertions(+), 12 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index f6277956c8..9464be1210 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -340,7 +340,7 @@ class HTMLPreProcessor(object):
# print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P(i|b|u)>)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P(i|b|u)>)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 216ccf591d..d229b80c16 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -8,6 +8,7 @@ from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class InlineClass(etree.XSLTExtension):
@@ -229,16 +230,8 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html:
- self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r']*>\s*]*>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(<(/i|b)>)?)?)\s*\s*
', re.IGNORECASE)
- res = chapdetect.sub(''+'\g'+'
\n', res)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- length = line_length('html', res, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*
\s*(?P]*>\s*(]*>\s*\s*)
\s*){0,3}\s*]*>\s*]*>\s*" % length, re.UNICODE)
- if length < 150:
- res = unwrap.sub(' ', res)
+ preprocessor = PreProcessor(res)
+ res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class)
stream.seek(0)
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 626c68ea63..b2ee421922 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -46,7 +46,7 @@
0.010000000000000
- 0.500000000000000
+ 0.450000000000000