Fix #6587 (Enhancements to preprocessing for txt, html, lit, and pdf)

This commit is contained in:
Kovid Goyal 2010-08-24 16:43:06 -06:00
commit a1ec250de6
4 changed files with 51 additions and 9 deletions

View File

@ -54,7 +54,7 @@ def chap_head(match):
if not title: if not title:
return '<h1>'+chap+'</h1><br/>\n' return '<h1>'+chap+'</h1><br/>\n'
else: else:
return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n' return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
def wrap_lines(match): def wrap_lines(match):
ital = match.group('ital') ital = match.group('ital')
@ -63,7 +63,7 @@ def wrap_lines(match):
else: else:
return ital+' ' return ital+' '
def line_length(raw, percent): def line_length(format, raw, percent):
''' '''
raw is the raw text to find the line length to use for wrapping. raw is the raw text to find the line length to use for wrapping.
percentage is a decimal number, 0 - 1 which is used to determine percentage is a decimal number, 0 - 1 which is used to determine
@ -72,6 +72,9 @@ def line_length(raw, percent):
median value. median value.
''' '''
raw = raw.replace('&nbsp;', ' ') raw = raw.replace('&nbsp;', ' ')
if format == 'html':
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
lines = linere.findall(raw) lines = linere.findall(raw)
@ -206,7 +209,7 @@ class HTMLPreProcessor(object):
(re.compile(ur'\u00a0'), lambda match : ' '), (re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI # Detect Chapters to match default XPATH in GUI
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
# Have paragraphs show better # Have paragraphs show better
@ -289,7 +292,7 @@ class HTMLPreProcessor(object):
traceback.print_exc() traceback.print_exc()
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length: if length:
end_rules.append( end_rules.append(
# Un wrap using punctuation # Un wrap using punctuation

View File

@ -24,6 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path from calibre import unicode_path
from calibre.utils.localization import get_lang from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.ebooks.conversion.preprocess import line_length
class Link(object): class Link(object):
''' '''
@ -489,5 +490,18 @@ class HTMLInput(InputFormatPlugin):
return (None, None) return (None, None)
return (None, raw) return (None, raw)
def preprocess_html(self, html):
    '''
    Heuristically tidy ``html`` before it is converted.

    Two passes are applied, in order:

    1. Text that looks like a chapter heading (Chapter, Epilogue,
       Prologue, Book, Part, Dedication, optionally wrapped in <i>/<b>)
       is rewritten as an ``<h2>`` element.
    2. If the line length reported by ``line_length`` is below 150,
       element boundaries that follow a lower case letter, ``,``, ``;``,
       ``:``, ``I`` or ``A`` are replaced by a single space, joining
       hard-wrapped lines back together.

    :param html: the markup to preprocess, as a string.
    :return: the preprocessed markup.
    '''
    self.log("********* Preprocessing HTML *********")
    # Detect Chapters to match the xpath in the GUI
    chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
    # The group reference is a raw string so that ``\g`` is never read as
    # an (invalid) string escape.
    html = chapdetect.sub('<h2>' + r'\g<chap>' + '</h2>\n', html)
    # Unwrap lines using punctuation if the median length of all lines is
    # less than 150
    #
    # Insert extra line feeds so the line length regex functions properly
    html = re.sub(r"</p>", "</p>\n", html)
    length = line_length('html', html, 0.4)
    self.log.debug("*** Median length is " + str(length) + " ***")
    # NOTE(review): a falsy length is taken to mean that no usable line
    # length could be measured, so unwrapping is skipped rather than run
    # with a zero-width look-behind -- confirm against line_length.
    if length and length < 150:
        # The character class lists I and A literally; the former ``\I``
        # is rejected as a bad escape by newer versions of the re module.
        unwrap = re.compile(r"(?<=.{%i}[a-z,;:IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
        html = unwrap.sub(' ', html)
    return html

View File

@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
import re import re
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.preprocess import line_length
class LITInput(InputFormatPlugin): class LITInput(InputFormatPlugin):
@ -21,6 +22,7 @@ class LITInput(InputFormatPlugin):
accelerators): accelerators):
from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
self.log = log
return create_oebbook(log, stream, options, self, reader=LitReader) return create_oebbook(log, stream, options, self, reader=LitReader)
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
@ -53,8 +55,18 @@ class LITInput(InputFormatPlugin):
def preprocess_html(self, html): def preprocess_html(self, html):
self.log("********* Preprocessing HTML *********")
# Detect Chapters to match the xpath in the GUI
chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
html = re.sub(r"(?<=.{65}[a-z,\IA])\s*</(span|p|div)>\s*(</(p|span|div)>\s*<p[^>]*>(\s*<(p|span|div)>\s*</(p|span|div)[^>]*>)?\s*(</(p|span|div)>\s*<p[^>]*>)?)?\s*<(span|div|p)[^>]*>", " ", html) # Unwrap lines using punctation if the median length of all lines is less than 150
#
# Insert extra line feeds so the line length regex functions properly
html = re.sub(r"</p>", "</p>\n", html)
length = line_length('html', html, 0.4)
self.log("*** Median length is " + str(length) + " ***")
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
if length < 150:
html = unwrap.sub(' ', html)
return html return html

View File

@ -7,6 +7,7 @@ import os, glob, re, textwrap
from lxml import etree from lxml import etree
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.preprocess import line_length
class InlineClass(etree.XSLTExtension): class InlineClass(etree.XSLTExtension):
@ -184,6 +185,7 @@ class RTFInput(InputFormatPlugin):
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
self.options = options
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
#Name of the preprocessed RTF file #Name of the preprocessed RTF file
@ -226,6 +228,17 @@ class RTFInput(InputFormatPlugin):
with open(html, 'wb') as f: with open(html, 'wb') as f:
res = transform.tostring(result) res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html:
self.log("********* Preprocessing HTML *********")
# Detect Chapters to match the xpath in the GUI
chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE)
res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res)
# Unwrap lines using punctuation if the median length of all lines is less than 150
length = line_length('html', res, 0.4)
self.log("*** Median length is " + str(length) + " ***")
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE)
if length < 150:
res = unwrap.sub(' ', res)
f.write(res) f.write(res)
self.write_inline_css(inline_class) self.write_inline_css(inline_class)
stream.seek(0) stream.seek(0)