'+span
+ pstyle = pstyle+' style="text-indent:3%"'
+ if not span:
+ return '
'
+ else:
+ return '
'+span
else:
if not span:
return '
'
From 26ba75f76cc1db12439fb6f3a7c6bc9fbd049507 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Wed, 17 Nov 2010 10:25:51 +0800
Subject: [PATCH 10/32] added a search for emphasized lines during chapter
markup
---
src/calibre/ebooks/conversion/utils.py | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 51139d3a18..bec15924d6 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -177,9 +177,10 @@ class PreProcessor(object):
numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
+ emphasized_lines = r"]*>\s*(]*>)?\s*(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*"
full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
- n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+ n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
print "n_lookahead is " + n_lookahead
print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
@@ -195,7 +196,7 @@ class PreProcessor(object):
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close
- n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+ n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
print "n_lookahead is " + n_lookahead
print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
@@ -203,10 +204,21 @@ class PreProcessor(object):
chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect2.sub(self.chapter_head, html)
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines")
+ full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close
+ n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+ print "n_lookahead is " + n_lookahead
+ print "Chapter line is " + full_chapter_line + "\n\n"
+ chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ print chapter_marker
+ chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+ html = chapdetect2.sub(self.chapter_head, html)
+
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close
- n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+ n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
print "n_lookahead is " + n_lookahead
print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
@@ -217,7 +229,7 @@ class PreProcessor(object):
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles")
full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close
- n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+ n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
print "n_lookahead is " + n_lookahead
print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
From a55b4dbbac65ef083d3af4943243f9b6e092d227 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Wed, 17 Nov 2010 13:49:12 +0800
Subject: [PATCH 11/32] remove extra line feeds from html comments when
sanitizing
---
src/calibre/library/comments.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py
index 83eec89abe..00a6ef55ae 100644
--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@@ -131,7 +131,8 @@ def comments_to_html(comments):
def sanitize_comments_html(html):
text = html2text(html)
md = markdown.Markdown(safe_mode=True)
- return md.convert(text)
+ cleansed = re.sub('\n+', '', md.convert(text))
+ return cleansed
def test():
for pat, val in [
From b03b8023943417dc544f70bd470ba5f61c59d848 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Wed, 17 Nov 2010 14:12:14 +0800
Subject: [PATCH 12/32] adjusted css to compact the comments display
---
src/calibre/gui2/book_details.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/calibre/gui2/book_details.py b/src/calibre/gui2/book_details.py
index e193fe10b2..8cc2965171 100644
--- a/src/calibre/gui2/book_details.py
+++ b/src/calibre/gui2/book_details.py
@@ -221,6 +221,8 @@ class BookInfo(QWebView):
From fb124c50a767956abcadec577fe10ad1e0e4ae80 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Wed, 17 Nov 2010 17:55:50 +0800
Subject: [PATCH 13/32] added negative lookahead to reduce false positive
matches during chapter marking
---
src/calibre/ebooks/conversion/utils.py | 54 +++++++++++++++++---------
1 file changed, 36 insertions(+), 18 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bec15924d6..ac38a0097d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -83,6 +83,24 @@ class PreProcessor(object):
if min_lns > tot_htm_ends:
return True
+ def dump(self, raw, where):
+ import os
+ dp = getattr(self.extra_opts, 'debug_pipeline', None)
+ if dp and os.path.exists(dp):
+ odir = os.path.join(dp, 'preprocess')
+ if not os.path.exists(odir):
+ os.makedirs(odir)
+ if os.path.exists(odir):
+ odir = os.path.join(odir, where)
+ if not os.path.exists(odir):
+ os.makedirs(odir)
+ name, i = None, 0
+ while not name or os.path.exists(os.path.join(odir, name)):
+ i += 1
+ name = '%04d.html'%i
+ with open(os.path.join(odir, name), 'wb') as f:
+ f.write(raw.encode('utf-8'))
+
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
@@ -150,7 +168,7 @@ class PreProcessor(object):
#print "blanks between paragraphs is marked True"
else:
blanks_between_paragraphs = False
- #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+ #self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
@@ -158,7 +176,7 @@ class PreProcessor(object):
chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
chapter_header_open = r"(?P"
chapter_header_close = ")\s*"
- chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>\s*"
+ chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>"
if blanks_between_paragraphs:
blank_lines = "(\s*
]*>\s*
){0,2}\s*"
else:
@@ -169,7 +187,7 @@ class PreProcessor(object):
title_header_close = ")\s*"
title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)\s[^>]*>)?\s*(?P=outer2)>"
opt_title_close = ")?"
- n_lookahead_open = "(?!="
+ n_lookahead_open = "\s+(?!"
n_lookahead_close = ")"
default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
@@ -181,10 +199,10 @@ class PreProcessor(object):
full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- print "n_lookahead is " + n_lookahead
- print "Chapter line is " + full_chapter_line + "\n\n"
+ #print "n_lookahead is:\n" + n_lookahead + "\n\n"
+ #print "'normal' Chapter line - no title - is:\n" + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- print chapter_marker
+ #print "full chapter regex with lookahead is:\n" + chapter_marker + "\n\n"
heading = re.compile(']*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -197,10 +215,10 @@ class PreProcessor(object):
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- print "n_lookahead is " + n_lookahead
- print "Chapter line is " + full_chapter_line + "\n\n"
+ #print "n_lookahead is " + n_lookahead
+ #print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- print chapter_marker
+ #print chapter_marker
chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect2.sub(self.chapter_head, html)
@@ -208,10 +226,10 @@ class PreProcessor(object):
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines")
full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- print "n_lookahead is " + n_lookahead
- print "Chapter line is " + full_chapter_line + "\n\n"
+ #print "n_lookahead is " + n_lookahead
+ #print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- print chapter_marker
+ #print chapter_marker
chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect2.sub(self.chapter_head, html)
@@ -219,10 +237,10 @@ class PreProcessor(object):
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- print "n_lookahead is " + n_lookahead
- print "Chapter line is " + full_chapter_line + "\n\n"
+ #print "n_lookahead is " + n_lookahead
+ #print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- print chapter_marker
+ #print chapter_marker
chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
@@ -230,10 +248,10 @@ class PreProcessor(object):
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles")
full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- print "n_lookahead is " + n_lookahead
- print "Chapter line is " + full_chapter_line + "\n\n"
+ #print "n_lookahead is " + n_lookahead
+ #print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- print chapter_marker
+ #print chapter_marker
chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect2.sub(self.chapter_head, html)
From b238903ba81af83b3a7246cdf5d4d839a48f0d9b Mon Sep 17 00:00:00 2001
From: ldolse
Date: Wed, 17 Nov 2010 19:27:51 +0800
Subject: [PATCH 14/32] minor tweaks to chapter marking
---
src/calibre/ebooks/conversion/utils.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index ac38a0097d..fffb0d75d4 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -195,7 +195,7 @@ class PreProcessor(object):
numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
- emphasized_lines = r"]*>\s*(]*>)?\s*(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*"
+ emphasized_lines = r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*"
full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
@@ -308,10 +308,10 @@ class PreProcessor(object):
html = re.sub(u'\xad\s*(
\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
# If still no sections after unwrapping mark split points on lines with no punctuation
- if self.html_preprocess_sections < 10:
+ if self.html_preprocess_sections < 5:
self.log("Looking for more split points based on punctuation,"
" currently have " + unicode(self.html_preprocess_sections))
- chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
+ chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
From 2b888a4add647821774fbf92ea7807bbdf435af9 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Fri, 19 Nov 2010 10:03:56 +0800
Subject: [PATCH 15/32] fix a problem with pdf unwrap_factor getting set to 0.0
---
src/calibre/gui2/convert/pdf_input.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py
index 967a0fe234..f1ef7d24ee 100644
--- a/src/calibre/gui2/convert/pdf_input.py
+++ b/src/calibre/gui2/convert/pdf_input.py
@@ -22,5 +22,5 @@ class PluginWidget(Widget, Ui_Form):
def set_value_handler(self, g, val):
if val is None and isinstance(g, QDoubleSpinBox):
- g.setValue(0.0)
+ g.setValue(0.45)
return True
From 2a40afbd8e819e8fee0261e1f35ba54af235be8d Mon Sep 17 00:00:00 2001
From: ldolse
Date: Fri, 19 Nov 2010 12:54:25 +0800
Subject: [PATCH 16/32] blank lines are preserved in rtf2xml, then converted to
empty html paragraphs to preserve soft breaks
---
src/calibre/ebooks/rtf/input.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 078b30627f..d7619d471a 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -84,7 +84,7 @@ class RTFInput(InputFormatPlugin):
group_borders = 1,
# Write or do not write paragraphs. Default is 0.
- empty_paragraphs = 0,
+ empty_paragraphs = 1,
)
parser.parse_rtf()
ans = open('out.xml').read()
@@ -228,6 +228,10 @@ class RTFInput(InputFormatPlugin):
with open(html, 'wb') as f:
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+ # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
+ if not getattr(self.options, 'remove_paragraph_spacing', False):
+ res = re.sub('\s*', '', res)
+ res = re.sub('\n{4}', u'\n
\u00a0
\n', res)
if self.options.preprocess_html:
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
res = preprocessor(res)
From f8f908ecd670f63ac07573d9ea330abfbca4ff3a Mon Sep 17 00:00:00 2001
From: ldolse
Date: Fri, 19 Nov 2010 13:23:32 +0800
Subject: [PATCH 17/32] adjust rtf blank line regex to match two newlines preceded by a newline
---
src/calibre/ebooks/rtf/input.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index d7619d471a..d0ef19ecd9 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -231,7 +231,7 @@ class RTFInput(InputFormatPlugin):
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
if not getattr(self.options, 'remove_paragraph_spacing', False):
res = re.sub('\s*', '', res)
- res = re.sub('\n{4}', u'\n
\u00a0
\n', res)
+ res = re.sub('(?<=\n)\n{2}', u'
\u00a0
\n', res)
if self.options.preprocess_html:
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
res = preprocessor(res)
From 25c93421fb38455a4b57eb4e84bb9c55eb507299 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 20 Nov 2010 12:25:56 +0800
Subject: [PATCH 18/32] merge from trunk
---
src/calibre/ebooks/conversion/plumber.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 9a863d7e66..d0e9aa2e99 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -957,6 +957,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
'''
Create an OEBBook.
'''
+ if input_plugin == 'LITInput':
+ print "***\n\n*** Input plugin is: "+str(input_plugin)+"\n\n****"
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html, opts)
From 9c2dcfd5aff2b6e521677bf8afeac68fb81c7816 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 20 Nov 2010 12:26:57 +0800
Subject: [PATCH 19/32] remove LITInput debug print from create_oebbook
---
src/calibre/ebooks/conversion/plumber.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index d0e9aa2e99..9a863d7e66 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -957,8 +957,6 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
'''
Create an OEBBook.
'''
- if input_plugin == 'LITInput':
- print "***\n\n*** Input plugin is: "+str(input_plugin)+"\n\n****"
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html, opts)
From 267eebb9aa489cc443e57e90a9353730345af0c3 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 21 Nov 2010 15:38:18 +0800
Subject: [PATCH 20/32] adjusted preprocessing regexes for hyphen removal and
chapter marking
---
src/calibre/ebooks/conversion/preprocess.py | 2 +-
src/calibre/ebooks/conversion/utils.py | 10 +++++-----
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index ef092f7954..3ff816b3bf 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -220,7 +220,7 @@ class Dehyphenator(object):
self.html = html
self.format = format
if format == 'html':
- intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length)
+ intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*([iubp]>\s*){1,2}(?P<(p|div)[^>]*>\s*(
]*>\s*
\s*)?(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P
|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
elif format == 'individual_words':
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 20689c6950..feb74324e8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -107,7 +107,7 @@ class PreProcessor(object):
# Arrange line feeds and
tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*
", "\n", html)
html = re.sub(r"\s*
\s*", "\n
", html)
-
+
###### Check Markup ######
#
# some lit files don't have any
tags or equivalent (generally just plain text between
@@ -191,10 +191,10 @@ class PreProcessor(object):
n_lookahead_close = ")"
default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
- typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
- numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
- uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
- numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
+ typical_chapters = r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+ numeric_chapters = r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*"
+ uppercase_chapters = r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+ numeric_titles = r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
emphasized_lines = r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*"
full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
From 73278a8cd65dc780155154712ecdb77048fbacb0 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 21 Nov 2010 15:40:02 +0800
Subject: [PATCH 21/32] revert pdf unwrap_factor fallback value to 0.0
---
src/calibre/gui2/convert/pdf_input.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py
index f1ef7d24ee..967a0fe234 100644
--- a/src/calibre/gui2/convert/pdf_input.py
+++ b/src/calibre/gui2/convert/pdf_input.py
@@ -22,5 +22,5 @@ class PluginWidget(Widget, Ui_Form):
def set_value_handler(self, g, val):
if val is None and isinstance(g, QDoubleSpinBox):
- g.setValue(0.45)
+ g.setValue(0.0)
return True
From c378a90a927bd9e1d075699226353ac05ccd9422 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 21 Nov 2010 16:43:06 +0800
Subject: [PATCH 22/32] reworked chapter marking code
---
src/calibre/ebooks/conversion/utils.py | 86 ++++++++------------------
1 file changed, 26 insertions(+), 60 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index feb74324e8..acd8d3f02a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -172,7 +172,7 @@ class PreProcessor(object):
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
- lookahead = "(?=<(p|div))"
+ init_lookahead = "(?=<(p|div))"
chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
chapter_header_open = r"(?P"
chapter_header_close = ")\s*"
@@ -191,69 +191,35 @@ class PreProcessor(object):
n_lookahead_close = ")"
default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
- typical_chapters = r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
- numeric_chapters = r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*"
- uppercase_chapters = r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*"
- numeric_titles = r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
- emphasized_lines = r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*"
-
- full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
- n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- #print "n_lookahead is:\n" + n_lookahead + "\n\n"
- #print "'normal' Chapter line - no title - is:\n" + full_chapter_line + "\n\n"
- chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print "full chapter regex with lookahead is:\n" + chapter_marker + "\n\n"
+
+ min_chapters = 10
heading = re.compile(']*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
- #
- # Start with most typical chapter headings, get more aggressive until one works
- if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+
+ chapter_types = [
+ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+ [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+ [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines
+ [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+ [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+ ]
+
+ for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+ if self.html_preprocess_sections >= min_chapters:
+ break
+ full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+ n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+ self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+ if lookahead_ignorecase:
+ chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+ else:
+ chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+ chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+
html = chapdetect.sub(self.chapter_head, html)
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
- full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close
- n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- #print "n_lookahead is " + n_lookahead
- #print "Chapter line is " + full_chapter_line + "\n\n"
- chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print chapter_marker
- chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
- html = chapdetect2.sub(self.chapter_head, html)
-
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines")
- full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close
- n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- #print "n_lookahead is " + n_lookahead
- #print "Chapter line is " + full_chapter_line + "\n\n"
- chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print chapter_marker
- chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
- html = chapdetect2.sub(self.chapter_head, html)
-
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
- full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close
- n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- #print "n_lookahead is " + n_lookahead
- #print "Chapter line is " + full_chapter_line + "\n\n"
- chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print chapter_marker
- chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
- html = chapdetect2.sub(self.chapter_head, html)
-
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles")
- full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close
- n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
- #print "n_lookahead is " + n_lookahead
- #print "Chapter line is " + full_chapter_line + "\n\n"
- chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print chapter_marker
- chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
- html = chapdetect2.sub(self.chapter_head, html)
+
###### Unwrap lines ######
#
From fae3252d50f3316458dad2606a3362e2345f5326 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 21 Nov 2010 17:37:56 +0800
Subject: [PATCH 23/32] further cleanup to chapter markup
---
src/calibre/ebooks/conversion/utils.py | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index acd8d3f02a..af3d83da4a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -168,29 +168,30 @@ class PreProcessor(object):
#print "blanks between paragraphs is marked True"
else:
blanks_between_paragraphs = False
- #self.dump(html, 'before_chapter_markup')
+ self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+ title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
chapter_header_open = r"(?P<chap>"
+ title_header_open = r"(?P<title>"
chapter_header_close = ")\s*"
+ title_header_close = ")"
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+ title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
+
if blanks_between_paragraphs:
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
else:
blank_lines = ""
opt_title_open = "("
- title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
- title_header_open = "(?P<title>"
- title_header_close = ")\s*"
- title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
opt_title_close = ")?"
n_lookahead_open = "\s+(?!"
n_lookahead_close = ")"
- default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
+ default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
min_chapters = 10
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
@@ -204,7 +205,8 @@ class PreProcessor(object):
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
]
-
+
+ # Start with most typical chapter headings, get more aggressive until one works
for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
if self.html_preprocess_sections >= min_chapters:
break
@@ -215,7 +217,9 @@ class PreProcessor(object):
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
else:
+ print "Chapter line is:\n"+full_chapter_line
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+ print "\nFull regex is:\n"+chapter_marker
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
html = chapdetect.sub(self.chapter_head, html)
From cf4f9e41c273fe5f63db22a33120cd3a380bc730 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 21 Nov 2010 17:49:46 +0800
Subject: [PATCH 24/32] ...
---
src/calibre/ebooks/conversion/utils.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index af3d83da4a..26c8d23e0c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -217,9 +217,7 @@ class PreProcessor(object):
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
else:
- print "Chapter line is:\n"+full_chapter_line
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
- print "\nFull regex is:\n"+chapter_marker
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
html = chapdetect.sub(self.chapter_head, html)
From e1602dc31a2ac0b3f8f4367fd5d881369906c7e1 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 21 Nov 2010 18:03:25 +0800
Subject: [PATCH 25/32] ...
---
src/calibre/ebooks/conversion/utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 26c8d23e0c..ea78808d08 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -168,7 +168,7 @@ class PreProcessor(object):
#print "blanks between paragraphs is marked True"
else:
blanks_between_paragraphs = False
- self.dump(html, 'before_chapter_markup')
+ #self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
From caabf1d814a419c79aafb1a78a372afc894420de Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 21 Nov 2010 18:29:59 +0800
Subject: [PATCH 26/32] ...
---
src/calibre/ebooks/conversion/utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index ea78808d08..4f3e2ed90a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -200,7 +200,7 @@ class PreProcessor(object):
chapter_types = [
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
- [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+ [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,10}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
[r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
From 7b35480ce2acf9a947193f504fe26ac78fb8ca94 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 21 Nov 2010 19:40:17 +0800
Subject: [PATCH 27/32] fixed a problem with some formats and line unwrapping
---
src/calibre/ebooks/conversion/utils.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 4f3e2ed90a..2039a545ca 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -22,12 +22,12 @@ class PreProcessor(object):
title = match.group('title')
if not title:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("found " + unicode(self.html_preprocess_sections) +
+ self.log("marked " + unicode(self.html_preprocess_sections) +
" chapters. - " + unicode(chap))
return '