From 0bd3cf40090934e1f6e41f2b0e4b1f5e9b0d3173 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 24 Aug 2010 00:56:06 +1000
Subject: [PATCH] Enabled preprocess_html option for rtf

---
 src/calibre/ebooks/rtf/input.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 5447e69403..dcffbe68ca 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,6 +7,7 @@ import os, glob, re, textwrap
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.conversion.preprocess import line_length
 
 class InlineClass(etree.XSLTExtension):
 
@@ -184,6 +185,7 @@ class RTFInput(InputFormatPlugin):
         from calibre.ebooks.metadata.meta import get_metadata
         from calibre.ebooks.metadata.opf2 import OPFCreator
         from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
+        self.options = options
         self.log = log
         self.log('Converting RTF to XML...')
         #Name of the preprocesssed RTF file
@@ -226,6 +228,17 @@ class RTFInput(InputFormatPlugin):
         with open(html, 'wb') as f:
             res = transform.tostring(result)
             res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+            if self.options.preprocess_html:
+                print "*********  Preprocessing HTML  *********\n"
+                # Detect Chapters to match the xpath in the GUI
+                chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE)
+                res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res)
+                # Unwrap lines using punctuation if the median length of all lines is less than 150
+                length = line_length('html', res, 0.4)
+                print "*** Median length is " + str(length) + " ***\n"
+                unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE)
+                if length < 150:
+                    res = unwrap.sub(' ', res)
             f.write(res)
         self.write_inline_css(inline_class)
         stream.seek(0)