mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
enhancements to preprocessing for Lit, html, & pdf
This commit is contained in:
parent
e322371900
commit
aca34337d1
@ -54,7 +54,7 @@ def chap_head(match):
|
||||
if not title:
|
||||
return '<h1>'+chap+'</h1><br/>\n'
|
||||
else:
|
||||
return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
|
||||
return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
|
||||
|
||||
def wrap_lines(match):
|
||||
ital = match.group('ital')
|
||||
@ -63,7 +63,7 @@ def wrap_lines(match):
|
||||
else:
|
||||
return ital+' '
|
||||
|
||||
def line_length(raw, percent):
|
||||
def line_length(format, raw, percent):
|
||||
'''
|
||||
raw is the raw text to find the line length to use for wrapping.
|
||||
percentage is a decimal number, 0 - 1 which is used to determine
|
||||
@ -72,7 +72,10 @@ def line_length(raw, percent):
|
||||
median value.
|
||||
'''
|
||||
raw = raw.replace(' ', ' ')
|
||||
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
||||
if format == 'html':
|
||||
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
|
||||
elif format == 'pdf':
|
||||
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
||||
lines = linere.findall(raw)
|
||||
|
||||
lengths = []
|
||||
@ -206,7 +209,7 @@ class HTMLPreProcessor(object):
|
||||
(re.compile(ur'\u00a0'), lambda match : ' '),
|
||||
|
||||
# Detect Chapters to match default XPATH in GUI
|
||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
|
||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
|
||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
||||
|
||||
# Have paragraphs show better
|
||||
@ -289,7 +292,7 @@ class HTMLPreProcessor(object):
|
||||
traceback.print_exc()
|
||||
|
||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
|
||||
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
|
||||
if length:
|
||||
end_rules.append(
|
||||
# Unwrap using punctuation
|
||||
|
@ -24,6 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
|
||||
from calibre import unicode_path
|
||||
from calibre.utils.localization import get_lang
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.ebooks.conversion.preprocess import line_length
|
||||
|
||||
class Link(object):
|
||||
'''
|
||||
@ -489,5 +490,18 @@ class HTMLInput(InputFormatPlugin):
|
||||
return (None, None)
|
||||
return (None, raw)
|
||||
|
||||
|
||||
|
||||
def preprocess_html(self, html):
    '''
    Mark up chapter headings and re-join hard-wrapped paragraphs.

    html is the markup to clean up; the processed markup is returned.

    Two passes are made:

    1. Short lines that look like chapter headings (Chapter, Epilogue,
       Prologue, Book, Part, Dedication, optionally wrapped in <i>/<b>)
       are rewritten as <h2> elements.
    2. If line_length('html', html, 0.4) reports a non-zero length below
       150, paragraph/span/div boundaries that follow a lower case
       letter, one of ",;:" or a capital I/A are replaced with a single
       space so that wrapped lines are merged.
    '''
    print("********* Preprocessing HTML *********\n")
    # Detect Chapters to match the xpath in the GUI
    chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
    # The group reference is kept in a raw string so that "\g" is not
    # treated as an (invalid) string escape.
    html = chapdetect.sub('<h2>' + r'\g<chap>' + '</h2>\n', html)
    # Unwrap lines using punctuation if the median length of all lines is less than 150
    #
    # Insert extra line feeds so the line length regex functions properly
    html = re.sub(r"</p>", "</p>\n", html)
    length = line_length('html', html, 0.4)
    print("*** Median length is " + str(length) + " ***\n")
    # A falsy length is skipped, mirroring the "if length:" guard used by
    # the PDF caller of line_length: a zero-width look-behind would merge
    # every paragraph in the document, and None cannot be formatted
    # with %i.
    # NOTE(review): assumes line_length signals "nothing measured" with a
    # falsy value -- confirm against its implementation.
    if length and length < 150:
        # "I" and "A" are listed literally in the character class; the
        # previous "\I" spelling is an unknown escape that newer versions
        # of the re module reject.
        unwrap = re.compile(r"(?<=.{%i}[a-z,;:IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
        html = unwrap.sub(' ', html)
    return html
|
||||
|
@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
|
||||
import re
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.preprocess import line_length
|
||||
|
||||
class LITInput(InputFormatPlugin):
|
||||
|
||||
@ -53,8 +54,18 @@ class LITInput(InputFormatPlugin):
|
||||
|
||||
|
||||
def preprocess_html(self, html):
|
||||
print "********* Preprocessing HTML *********\n"
|
||||
# Detect Chapters to match the xpath in the GUI
|
||||
chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
|
||||
html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
|
||||
html = re.sub(r"(?<=.{65}[a-z,\IA])\s*</(span|p|div)>\s*(</(p|span|div)>\s*<p[^>]*>(\s*<(p|span|div)>\s*</(p|span|div)[^>]*>)?\s*(</(p|span|div)>\s*<p[^>]*>)?)?\s*<(span|div|p)[^>]*>", " ", html)
|
||||
return html
|
||||
# Unwrap lines using punctuation if the median length of all lines is less than 150
|
||||
#
|
||||
# Insert extra line feeds so the line length regex functions properly
|
||||
html = re.sub(r"</p>", "</p>\n", html)
|
||||
length = line_length('html', html, 0.4)
|
||||
print "*** Median length is " + str(length) + " ***\n"
|
||||
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||
if length < 150:
|
||||
html = unwrap.sub(' ', html)
|
||||
return html
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user