Date: Mon, 20 Sep 2010 00:31:22 +0800
Subject: [PATCH 01/26] ...
---
src/calibre/ebooks/conversion/utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 6a5eaa4a34..f38d02309a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -83,7 +83,7 @@ class PreProcessor(object):
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
- # check if content is in pre tags, use txt procesor to mark up if so
+ # check if content is in pre tags, use txt processor to mark up if so
pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
From 301af532c6940ec8082dbe6ece4dca351417ac63 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 20 Sep 2010 09:57:46 +0800
Subject: [PATCH 02/26] made em-dash unwrapping line length dependent, as
sometimes it's used as an ellipsis alternative
---
src/calibre/ebooks/conversion/preprocess.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 3b1239814a..d6b5460552 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -117,7 +117,7 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
@@ -374,10 +374,8 @@ class HTMLPreProcessor(object):
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
- # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
+ # delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml:
- # unwrap em/en dashes
- end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[\xad](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
@@ -397,6 +395,8 @@ class HTMLPreProcessor(object):
# Unwrap using punctuation
(re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
+ # unwrap em/en dashes
+ end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
for rule in self.PREPROCESS + start_rules:
html = rule[0].sub(rule[1], html)
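
An editorial note on the patch above, restated outside the diff: the old rule joined a line to the next paragraph whenever it ended in an em/en dash, which also destroyed dashes used as trailing punctuation. Gating the rule on line length (the fixed-width `.{%i}` lookbehind) means only dashes sitting near the wrap margin are treated as wrap artifacts. A minimal sketch of that gating; `unwrap_dashes` and `median` are illustrative names standing in for the value line_length() computes, not code from this patch:

    import re

    def unwrap_dashes(html, median):
        # Fixed-width lookbehind: require at least 'median' characters
        # before the em/en dash, then join the line to the following <p>
        # only when the next paragraph starts in lowercase or with a digit.
        pattern = re.compile(u'(?<=.{%i}[\u2013\u2014])\s*<p>\s*(?=[a-z\d])' % median)
        return pattern.sub('', html)

With a median of 60, a dash at the margin of a 60-plus-character line is joined to the next paragraph, while a 20-character line ending in a dash keeps its break, which is exactly the ellipsis-alternative case the commit message describes.
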
From 936451853caa1190eff41bf07a28f39005da5fb3 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Tue, 21 Sep 2010 18:18:50 -1000
Subject: [PATCH 03/26] tuned dehyphen code to better handle unwrapped docs,
added line histogram function to determine whether a document has hard breaks
or not
---
src/calibre/ebooks/conversion/preprocess.py | 138 +++++++++++++++-----
src/calibre/ebooks/conversion/utils.py | 48 ++++---
2 files changed, 134 insertions(+), 52 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d6b5460552..c42b29e0e4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,49 +62,97 @@ def wrap_lines(match):
else:
return ital+' '
-def line_length(format, raw, percent):
+def line_length(format, raw, percent, test_type):
'''
- raw is the raw text to find the line length to use for wrapping.
+ Analyses the document to see if hard line breaks exist or to find the
+ median line length.
+ format is the type of document the analysis will be done against.
+ raw is the raw text to determine the line length to use for wrapping.
percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use. The list of line lengths is
ordered smallest to largest and does not include duplicates. 0.5 is the
median value.
+ test_type sets whether to return the median line length or to do a
+ histogram analysis to see if unwrapping is required.
'''
raw = raw.replace('&#160;', ' ')
if format == 'html':
- linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
+ linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
lines = linere.findall(raw)
- lengths = []
- for line in lines:
- if len(line) > 0:
- lengths.append(len(line))
+ if test_type == 'median':
+ lengths = []
+ for line in lines:
+ if len(line) > 0:
+ lengths.append(len(line))
- if not lengths:
- return 0
+ if not lengths:
+ return 0
- lengths = list(set(lengths))
- total = sum(lengths)
- avg = total / len(lengths)
- max_line = avg * 2
+ lengths = list(set(lengths))
+ total = sum(lengths)
+ avg = total / len(lengths)
+ max_line = avg * 2
- lengths = sorted(lengths)
- for i in range(len(lengths) - 1, -1, -1):
- if lengths[i] > max_line:
- del lengths[i]
+ lengths = sorted(lengths)
+ for i in range(len(lengths) - 1, -1, -1):
+ if lengths[i] > max_line:
+ del lengths[i]
- if percent > 1:
- percent = 1
- if percent < 0:
- percent = 0
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
- index = int(len(lengths) * percent) - 1
+ index = int(len(lengths) * percent) - 1
- return lengths[index]
+ return lengths[index]
+
+ if test_type == 'histogram':
+ minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+ maxLineLength=1900 # Discard larger than this to stay in range
+ buckets=20 # Each line is divided into a bucket based on length
+
+ #print "there are "+str(len(lines))+" lines"
+ max = 0
+ for line in lines:
+ l = len(line)
+ if l > max:
+ max = l
+ print "max line found is "+str(max)
+ # Build the line length histogram
+ hRaw = [ 0 for i in range(0,buckets) ]
+ for line in lines:
+ l = len(line)
+ if l > minLineLength and l < maxLineLength:
+ l = int(l/100)
+ #print "adding "+str(l)
+ hRaw[l]+=1
+
+ # Normalize the histogram into percents
+ totalLines = len(lines)
+ h = [ float(count)/totalLines for count in hRaw ]
+ print "\nhRaw histogram lengths are: "+str(hRaw)
+ print " percents are: "+str(h)+"\n"
+
+ # Find the biggest bucket
+ maxValue = 0
+ peakPosition = 0
+ for i in range(0,len(h)):
+ if h[i] > maxValue:
+ maxValue = h[i]
+ peakPosition = i
+
+ if maxValue < percent:
+ #print "Line lengths are too variable. Not unwrapping."
+ return False
+ else:
+ #print str(maxValue)+" of the lines were in one bucket"
+ return True
class Dehyphenator(object):
'''
@@ -117,7 +165,7 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
@@ -125,34 +173,54 @@ class Dehyphenator(object):
def dehyphenate(self, match):
firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart')
+ try:
+ wraptags = match.group('wraptags')
+ except:
+ wraptags = ''
hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
- #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
- match = booklookup.search(self.html)
- if match:
- #print "returned dehyphenated word: " + str(dehyphenated)
- return dehyphenated
+ print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ if self.format == 'html_cleanup':
+ match = booklookup.search(self.html)
+ hyphenmatch = re.search(u'%s' % hyphenated, self.html)
+ if match:
+ print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ elif hyphenmatch:
+ print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ return hyphenated
+ else:
+ print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ return firsthalf+u'\u2014'+wraptags+secondhalf
+
else:
- #print "returned hyphenated word: " + str(hyphenated)
- return hyphenated
+ match = booklookup.search(self.html)
+ if match:
+ print "returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ else:
+ print "returned hyphenated word: " + str(hyphenated)
+ return hyphenated
def __call__(self, html, format, length=1):
self.html = html
+ self.format = format
if format == 'html':
- intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+ intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
elif format == 'pdf':
- intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+ intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words':
intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-([^“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html)
return html
-
class CSSPreProcessor(object):
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
@@ -388,7 +456,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<br>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?'), chap_head),)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
- length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+ length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
if length:
# print "The pdf line length returned is " + str(length)
end_rules.append(
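
Before the utils.py half of this patch, the Dehyphenator's lookup strategy is worth restating: rejoin the split word, strip common suffixes so inflected forms still match, then search the book itself for the joined stem; the hyphen is only dropped if the stem occurs elsewhere in the text. A rough, self-contained sketch of that decision; the two-suffix regex and the `rejoin` name are illustrative stand-ins for removesuffixes and dehyphenate():

    import re

    suffixes = re.compile(r'((ed)?ly|ings?|ed|er|s)$', re.IGNORECASE)

    def rejoin(firsthalf, secondhalf, book_text):
        hyphenated = firsthalf + '-' + secondhalf
        dehyphenated = firsthalf + secondhalf
        # Strip a common suffix so other inflections of the stem count too.
        lookupword = suffixes.sub('', dehyphenated)
        # The book is its own dictionary: if the joined stem occurs
        # elsewhere in the text, treat the hyphen as a line-wrap artifact.
        if re.search(re.escape(lookupword), book_text, re.IGNORECASE):
            return dehyphenated
        return hyphenated

    print(rejoin('pre', 'sumably', 'he was presumably lost'))  # presumably
    print(rejoin('self', 'aware', 'a self-aware machine'))     # self-aware

The real code in the diff above additionally strips prefixes before the lookup, and in 'html_cleanup' mode also checks the hyphenated spelling against the book before falling back to restoring the original text and wrap tags.
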
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f38d02309a..7e85e24a83 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -153,7 +153,6 @@ class PreProcessor(object):
###### Unwrap lines ######
#
- self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
@@ -168,25 +167,40 @@ class PreProcessor(object):
format = 'html'
else:
format = 'html'
-
+ # Check the line histogram to determine if the document uses hard line breaks. If 50% or
+ # more of the lines break in the same length range then unwrapping is required
+ hardbreaks = line_length(format, html, .50, 'histogram')
+ print "Hard line breaks check returned "+str(hardbreaks)
# Calculate Length
- length = line_length(format, html, getattr(self.extra_opts,
- 'html_unwrap_factor', 0.4))
+ unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+ length = line_length(format, html, unwrap_factor, 'median')
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
- max_length = length * 1.4
- min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
+ if hardbreaks or unwrap_factor < 0.4:
+ self.log("Unwrapping required, unwrapping Lines")
+ # Unwrap em/en dashes
+ #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+ html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+ # Dehyphenate
+ self.log("Unwrapping/Removing hyphens")
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html', length)
+ self.log("Done dehyphenating")
+ # Unwrap lines using punctuation and line length
+ unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ html = unwrap.sub(' ', html)
+ #check any remaining hyphens, but only unwrap if there is a match
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html_cleanup', length)
+ else:
+ # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+ self.log("Cleaning up hyphenation")
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html_cleanup', length)
+ self.log("Done dehyphenating")
+
+ # delete soft hyphens
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
- html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
- # Dehyphenate
- dehyphenator = Dehyphenator()
- html = dehyphenator(html,'html', length)
-
- # Unwrap lines using punctuation and line length
- unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- html = unwrap.sub(' ', html)
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10:
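
The histogram test this patch introduces, reduced to its skeleton: drop very short and very long lines, bucket the rest by length into 100-character bins, and call the document hard-wrapped when one bin holds at least the caller's threshold share of lines (0.50 in the utils.py call above). A simplified sketch over plain integer lengths rather than parsed HTML lines; the function name and the normalization over counted lines are assumptions of this sketch, not the patch's code:

    def has_hard_breaks(line_lengths, threshold=0.50):
        buckets = [0] * 20
        counted = 0
        for length in line_lengths:
            # Same guards as the patch: lines under 20 chars are mostly
            # blanks, lines over 1900 chars would fall outside the bins.
            if 20 < length < 1900:
                buckets[length // 100] += 1
                counted += 1
        if not counted:
            return False
        # One dominant bucket means most lines wrap at the same width.
        return max(buckets) / float(counted) >= threshold

    print(has_hard_breaks([68, 70, 69, 71, 66, 12]))   # True: lengths cluster
    print(has_hard_breaks([40, 150, 520, 990, 1400]))  # False: lengths vary

In the patch, a True result (or a low html_unwrap_factor) routes the html through the dash unwrap, dehyphenation and punctuation unwrap; a False result skips straight to cleanup-mode dehyphenation.
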
From 5aa36581c57e80a791071aaf9fcddb7fd4e4eaff Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 25 Sep 2010 12:34:01 -1000
Subject: [PATCH 04/26] Building chapter marking regexes using variables to
increase manageability, switched to using backreferences to increase
reliability
---
src/calibre/ebooks/conversion/utils.py | 51 ++++++++++++++++++++++----
1 file changed, 44 insertions(+), 7 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 7e85e24a83..5e3cac7714 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -113,11 +113,12 @@ class PreProcessor(object):
# Get rid of empty <o:p> tags to simplify other processing
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
# Get rid of empty span, bold, & italics tags
- html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*){0,2}\s*</span>\s*", " ", html)
+ html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*){0,2}\s*</span>\s*", " ", html)
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*){0,2}\s*</span>\s*", " ", html)
- # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+ # If more than 40% of the lines are empty paragraphs and the user has enabled remove
+ # paragraph spacing then delete blank lines to clean up spacing
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
@@ -129,26 +130,63 @@ class PreProcessor(object):
'remove_paragraph_spacing', False):
self.log("deleting blank lines")
html = blankreg.sub('', html)
+ elif float(len(blanklines)) / float(len(lines)) > 0.40:
+ blanks_between_paragraphs = True
+ print "blanks between paragraphs is marked True"
+ else:
+ blanks_between_paragraphs = False
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*</p>", "</p>\n", html)
html = re.sub(r"\s*<p>\s*", "\n<p>", html)
# detect chapters/sections to match xpath or splitting logic
+ #
+ # Build the Regular Expressions in pieces
+ lookahead = "(?=<(p|div))"
+ chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>span|[ibu])[^>]*>)?\s*(<(?P<inner2>span|[ibu])[^>]*>)?\s*(<(?P<inner3>span|[ibu])[^>]*>)?\s*"
+ chapter_header_open = r"(?P<chap>"
+ chapter_header_close = ")\s*"
+ chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+ if blanks_between_paragraphs:
+ blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+ else:
+ blank_lines = ""
+ opt_title_open = "("
+ title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>span|[ibu])[^>]*>)?\s*(<(?P<inner5>span|[ibu])[^>]*>)?\s*(<(?P<inner6>span|[ibu])[^>]*>)?\s*"
+ title_header_open = "(?P<title>"
+ title_header_close = ")\s*"
+ title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
+ opt_title_close = ")?"
+
+ default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+ typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+ numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
+ uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
+
+ chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ print chapter_marker
+ #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
#
# Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
+ chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+ #chapdetect = re.compile(r"(?=<(p|div))<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>span|[ibu])\s[^>]*>)?\s*(<(?P<inner2>span|[ibu])\s[^>]*>)?\s*(<(?P<inner3>span|[ibu])\s[^>]*>)?\s*(?P<chap>.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8})\s*((?P=<inner3>)>)?\s*((?P=<inner2>)>)?\s*((?P=<inner1>)\s[^>]*>)?\s(?P=<outer>)>(<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>span|[ibu])\s[^>]*>)?\s*(<(?P<inner5>span|[ibu])\s[^>]*>)?\s*(<(?P<inner6>span|[ibu])\s[^>]*>)?\s*(?P<title>(\s*[\w\'\"-]+){1,5})\s*((?P=<inner6>)>)?\s*((?P=<inner5>)>)?\s*((?P=<inner4>)\s[^>]*>)?\s(?P=<outer2>)>)?", re.IGNORECASE)
+ #chapdetect = re.compile(r'(?=</?(br|p))(<(?P<outer>(/?br|p))[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(?P=outer)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
- chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+ chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+ #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
- chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+ chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
+ #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
###### Unwrap lines ######
@@ -179,7 +217,6 @@ class PreProcessor(object):
if hardbreaks or unwrap_factor < 0.4:
self.log("Unwrapping required, unwrapping Lines")
# Unwrap em/en dashes
- #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
# Dehyphenate
self.log("Unwrapping/Removing hyphens")
@@ -206,7 +243,7 @@ class PreProcessor(object):
if self.html_preprocess_sections < 10:
self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*</span>\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
- html = chapdetect3.sub(self.chapter_break, html)
+ #html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
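
A toy illustration of the backreference idea this patch builds chapter_marker around: `(?P<outer>...)` records which tag opened the candidate heading line and `(?P=outer)` insists the same tag closes it, so mismatched pairs like `<p>...</div>` stop matching, while the named fragments (chapter_line_open, blank_lines, title_line_close, and so on) keep the concatenated expression manageable. The pattern below is deliberately tiny and is not the one from the patch:

    import re

    line = re.compile(
        r'<(?P<outer>p|div)[^>]*>\s*'   # remember which tag opened the line
        r'(?P<chap>Chapter\s+\w+)\s*'   # candidate heading text
        r'</(?P=outer)>'                # require the same tag to close it
    )

    print(line.search('<p>Chapter One</p>').group('chap'))  # Chapter One
    print(line.search('<p>Chapter Two</div>'))              # None: mismatched pair
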
From 8b7ef0984f4bed6acc64b6e6124352c65b22eb65 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 25 Sep 2010 12:53:40 -1000
Subject: [PATCH 05/26] ...
---
src/calibre/ebooks/conversion/preprocess.py | 18 +++++++++---------
src/calibre/ebooks/conversion/utils.py | 11 ++++-------
2 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c42b29e0e4..a18ff07d44 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -123,7 +123,7 @@ def line_length(format, raw, percent, test_type):
l = len(line)
if l > max:
max = l
- print "max line found is "+str(max)
+ #print "max line found is "+str(max)
# Build the line length histogram
hRaw = [ 0 for i in range(0,buckets) ]
for line in lines:
@@ -136,8 +136,8 @@ def line_length(format, raw, percent, test_type):
# Normalize the histogram into percents
totalLines = len(lines)
h = [ float(count)/totalLines for count in hRaw ]
- print "\nhRaw histogram lengths are: "+str(hRaw)
- print " percents are: "+str(h)+"\n"
+ #print "\nhRaw histogram lengths are: "+str(hRaw)
+ #print " percents are: "+str(h)+"\n"
# Find the biggest bucket
maxValue = 0
@@ -183,27 +183,27 @@ class Dehyphenator(object):
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
- print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
if self.format == 'html_cleanup':
match = booklookup.search(self.html)
hyphenmatch = re.search(u'%s' % hyphenated, self.html)
if match:
- print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
elif hyphenmatch:
- print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ #print "Cleanup:returned hyphenated word: " + str(hyphenated)
return hyphenated
else:
- print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
return firsthalf+u'\u2014'+wraptags+secondhalf
else:
match = booklookup.search(self.html)
if match:
- print "returned dehyphenated word: " + str(dehyphenated)
+ #print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
else:
- print "returned hyphenated word: " + str(hyphenated)
+ #print "returned hyphenated word: " + str(hyphenated)
return hyphenated
def __call__(self, html, format, length=1):
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 5e3cac7714..555f42702b 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -132,7 +132,7 @@ class PreProcessor(object):
html = blankreg.sub('', html)
elif float(len(blanklines)) / float(len(lines)) > 0.40:
blanks_between_paragraphs = True
- print "blanks between paragraphs is marked True"
+ #print "blanks between paragraphs is marked True"
else:
blanks_between_paragraphs = False
# Arrange line feeds and