diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 68afc464a0..99685e90d1 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -19,6 +19,7 @@ class PreProcessor(object): self.found_indents = 0 self.extra_opts = extra_opts self.deleted_nbsps = False + self.totalwords = 0 self.min_chapters = 1 self.linereg = re.compile('(?<=
)', re.IGNORECASE|re.DOTALL)
self.blankreg = re.compile(r'\s*(?P ]*>)\s*(?P', re.IGNORECASE)
- if len(pre.findall(html)) == 1:
+ if len(pre.findall(html)) >= 1:
self.log("Running Text Processing")
- from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
- separate_paragraphs_single_line
outerhtml = re.compile(r'.*?(?<=
)(?P
).*', re.IGNORECASE|re.DOTALL)
- html = outerhtml.sub('\g tag or
@@ -302,25 +309,26 @@ class PreProcessor(object):
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted
- totalwords = 0
- totalwords = self.get_word_count(html)
+ try:
+ self.totalwords = self.get_word_count(html)
+ except:
+ self.log("Can't get wordcount")
- if totalwords < 50:
+ if 0 < self.totalwords < 50:
self.log("flow is too short, not running heuristics")
return html
# Arrange line feeds and </p> tags. Some files don't have any <p>
# tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
- # check if content is in pre tags, use txt processor to mark up if so
- html = self.text_process_pre(html)
+ # markup using text processing
+ html = self.markup_pre(html)
# Replace series of non-breaking spaces with text-indent
if getattr(self.extra_opts, 'fix_indents', True):
@@ -338,7 +346,7 @@ class PreProcessor(object):
# detect chapters/sections to match xpath or splitting logic
if getattr(self.extra_opts, 'markup_chapter_headings', True):
- html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
+ html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
# blank paragraphs then delete blank lines to clean up spacing
@@ -383,8 +391,6 @@ class PreProcessor(object):
self.log("Fixing hyphenated content")
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length)
- # delete soft hyphens
- html = re.sub(u'\xad\s*(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < self.min_chapters:
@@ -392,13 +398,14 @@ class PreProcessor(object):
" currently have " + unicode(self.html_preprocess_sections))
chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
+
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P]*>.+? \s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?) ', re.IGNORECASE)
html = doubleheading.sub('\g'+'\n'+'
', html)
- if getattr(self.extra_opts, 'dehyphenate', True):
+ if getattr(self.extra_opts, 'format_scene_breaks', True):
# Center separator lines
html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)