From 7b35480ce2acf9a947193f504fe26ac78fb8ca94 Mon Sep 17 00:00:00 2001
From: ldolse '+chap+'
\n'
else:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("found " + unicode(self.html_preprocess_sections) +
+ self.log("marked " + unicode(self.html_preprocess_sections) +
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
return ''+chap+'
\n'+title+'
\n'
@@ -106,7 +106,7 @@ class PreProcessor(object):
# Arrange line feeds and
\s*", "\n
", html) + html = re.sub(r"\s*
[^>]*)>\s*", "\n
"+">", html) ###### Check Markup ###### # @@ -200,7 +200,7 @@ class PreProcessor(object): chapter_types = [ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], - [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,10}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters @@ -241,6 +241,7 @@ class PreProcessor(object): format = 'html' # Check Line histogram to determine if the document uses hard line breaks, If 50% or # more of the lines break in the same region of the document then unwrapping is required + self.dump(html, 'before_doc_analysis_zipped_http') docanalysis = DocAnalysis(format, html) hardbreaks = docanalysis.line_histogram(.50) self.log("Hard line breaks check returned "+unicode(hardbreaks))