mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Enabled preprocess_html option for rtf
This commit is contained in:
parent
aca34337d1
commit
0bd3cf4009
@ -7,6 +7,7 @@ import os, glob, re, textwrap
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
|
from calibre.ebooks.conversion.preprocess import line_length
|
||||||
|
|
||||||
class InlineClass(etree.XSLTExtension):
|
class InlineClass(etree.XSLTExtension):
|
||||||
|
|
||||||
@ -184,6 +185,7 @@ class RTFInput(InputFormatPlugin):
|
|||||||
from calibre.ebooks.metadata.meta import get_metadata
|
from calibre.ebooks.metadata.meta import get_metadata
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
||||||
|
self.options = options
|
||||||
self.log = log
|
self.log = log
|
||||||
self.log('Converting RTF to XML...')
|
self.log('Converting RTF to XML...')
|
||||||
#Name of the preprocesssed RTF file
|
#Name of the preprocesssed RTF file
|
||||||
@ -226,6 +228,17 @@ class RTFInput(InputFormatPlugin):
|
|||||||
with open(html, 'wb') as f:
|
with open(html, 'wb') as f:
|
||||||
res = transform.tostring(result)
|
res = transform.tostring(result)
|
||||||
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
||||||
|
if self.options.preprocess_html:
|
||||||
|
print "********* Preprocessing HTML *********\n"
|
||||||
|
# Detect Chapters to match the xpath in the GUI
|
||||||
|
chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE)
|
||||||
|
res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res)
|
||||||
|
# Unwrap lines using punctation if the median length of all lines is less than 150
|
||||||
|
length = line_length('html', res, 0.4)
|
||||||
|
print "*** Median length is " + str(length) + " ***\n"
|
||||||
|
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE)
|
||||||
|
if length < 150:
|
||||||
|
res = unwrap.sub(' ', res)
|
||||||
f.write(res)
|
f.write(res)
|
||||||
self.write_inline_css(inline_class)
|
self.write_inline_css(inline_class)
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user