From 0bd3cf40090934e1f6e41f2b0e4b1f5e9b0d3173 Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 24 Aug 2010 00:56:06 +1000 Subject: [PATCH] Enabled preprocess_html option for rtf --- src/calibre/ebooks/rtf/input.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 5447e69403..dcffbe68ca 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -7,6 +7,7 @@ import os, glob, re, textwrap from lxml import etree from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.conversion.preprocess import line_length class InlineClass(etree.XSLTExtension): @@ -184,6 +185,7 @@ class RTFInput(InputFormatPlugin): from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException + self.options = options self.log = log self.log('Converting RTF to XML...') #Name of the preprocesssed RTF file @@ -226,6 +228,17 @@ class RTFInput(InputFormatPlugin): with open(html, 'wb') as f: res = transform.tostring(result) res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] + if self.options.preprocess_html: + print "********* Preprocessing HTML *********\n" + # Detect Chapters to match the xpath in the GUI + chapdetect = re.compile(r']*>\s*]*>\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(|)?)\s*\s*

', re.IGNORECASE) + res = chapdetect.sub('

'+'\g'+'

\n', res) + # Unwrap lines using punctuation if the median length of all lines is less than 150 + length = line_length('html', res, 0.4) + print "*** Median length is " + str(length) + " ***\n" + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*(

)?\s*(?P]*>\s*(]*>\s*\s*)

\s*){0,3}\s*]*>\s*(]*>)?\s*" % length, re.UNICODE) + if length < 150: + res = unwrap.sub(' ', res) f.write(res) self.write_inline_css(inline_class) stream.seek(0)