'), lambda match : ''),
@@ -303,15 +309,16 @@ class HTMLPreProcessor(object):
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
end_rules.append(
- (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*((i|b)>((i|b)>)?)?\s*(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
+ (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*((i|b)>((i|b)>)?)?\s*(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
+ print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
new file mode 100644
index 0000000000..52be473372
--- /dev/null
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
\ No newline at end of file
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index f7bb0fbfd9..35dad501be 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -102,7 +102,7 @@ class LITInput(InputFormatPlugin):
percent = 0
min_lns = tot_ln_fds * percent
- self.log("There must be more than " + str(min_lns) + " unmarked lines to be true")
+ self.log("There must be more than " + str(min_lns) + " unmarked lines to return true")
if min_lns > tot_htm_ends:
return True
@@ -141,24 +141,31 @@ class LITInput(InputFormatPlugin):
html = chaplink.sub(chapter_link, html)
# Continue with alternate patterns, start with most typical chapter headings
if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}s*(]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*((i|b|u)>){0,2})\s*()?s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
html = chapdetect.sub(chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
- html = chapdetect2.sub(chapter_head, html)
-
+ html = chapdetect2.sub(chapter_head, html)
+ #
+ # Unwrap lines using punctation if the median length of all lines is less than 150
+ length = line_length('html', html, 0.4)
+ self.log("*** Median line length is " + str(length) + " ***")
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ if length < 150:
+ self.log("Unwrapping Lines")
+ html = unwrap.sub(' ', html)
+ # If still no sections after unwrapping lines break on lines with no punctuation
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+ #self.log(html)
+ chapdetect3 = re.compile(r'(]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*(
)(?P)?', re.IGNORECASE)
+ html = chapdetect3.sub(chapter_head, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
html = doubleheading.sub('\g'+''+'
', html)
- #
- # Unwrap lines using punctation if the median length of all lines is less than 150
- length = line_length('html', html, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- html = unwrap.sub(' ', html)
+
return html
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 64a089281e..113c3d99d8 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
- OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
+ OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.5, this is the median line length.')),
+ 'default is 0.45, this is the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine.'))
])