mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fixed unwrapping for various hyphen and dash types, other minor tweaks to pdf
This commit is contained in:
parent
2a906184ad
commit
480eccb0b0
@ -77,7 +77,6 @@ def line_length(format, raw, percent):
|
|||||||
elif format == 'pdf':
|
elif format == 'pdf':
|
||||||
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
||||||
lines = linere.findall(raw)
|
lines = linere.findall(raw)
|
||||||
print "percent is " + str(percent)
|
|
||||||
|
|
||||||
lengths = []
|
lengths = []
|
||||||
for line in lines:
|
for line in lines:
|
||||||
@ -230,14 +229,17 @@ class HTMLPreProcessor(object):
|
|||||||
# (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
|
# (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
|
||||||
|
|
||||||
# unwrap hyphenation - don't delete the hyphen (often doesn't split words)
|
# unwrap hyphenation - don't delete the hyphen (often doesn't split words)
|
||||||
(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
|
#(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
|
||||||
|
# unwrap/delete soft hyphens
|
||||||
|
#(re.compile(u'[]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
|
||||||
|
|
||||||
# Remove gray background
|
# Remove gray background
|
||||||
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
||||||
|
|
||||||
# Detect Chapters to match default XPATH in GUI
|
# Detect Chapters to match default XPATH in GUI
|
||||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
|
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
|
||||||
|
(re.compile(r'<br\s*/?>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?'), chap_head),
|
||||||
|
|
||||||
# Have paragraphs show better
|
# Have paragraphs show better
|
||||||
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
||||||
# Clean up spaces
|
# Clean up spaces
|
||||||
@ -322,21 +324,29 @@ class HTMLPreProcessor(object):
|
|||||||
import traceback
|
import traceback
|
||||||
print 'Failed to parse remove_footer regexp'
|
print 'Failed to parse remove_footer regexp'
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
# unwrap hyphenation - moved here so it's executed after header/footer removal
|
||||||
|
if is_pdftohtml:
|
||||||
|
# unwrap visible dashes and hyphens - don't delete as 50% or more of the time these
|
||||||
|
# hyphens are for compound words, formatting, etc
|
||||||
|
end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
# unwrap/delete soft hyphens
|
||||||
|
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
# unwrap/delete soft hyphens with formatting
|
||||||
|
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
|
||||||
# Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
|
# Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
|
||||||
if getattr(self.extra_opts, 'preprocess_html', None):
|
if getattr(self.extra_opts, 'preprocess_html', None):
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
end_rules.append(
|
end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head))
|
||||||
(re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
|
||||||
)
|
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||||
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
|
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
|
||||||
if length:
|
if length:
|
||||||
print "The pdf line length returned is " + str(length)
|
# print "The pdf line length returned is " + str(length)
|
||||||
end_rules.append(
|
end_rules.append(
|
||||||
# Un wrap using punctuation
|
# Un wrap using punctuation
|
||||||
(re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
|
|
||||||
for rule in self.PREPROCESS + start_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user