From a1dcbb33c1c9ff12c8dbc2092ef4172c014dc827 Mon Sep 17 00:00:00 2001
From: ldolse
]*>\s*
){0,2}\s*" + else: + blank_lines = "" + opt_title_open = "(" + opt_title_close = ")?" + n_lookahead_open = "\s+(?!" + n_lookahead_close = ")" + + default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)" + + chapter_types = [ + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], + [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines + [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles + [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters + ] + + # Start with most typical chapter headings, get more aggressive until one works + for [chapter_type, lookahead_ignorecase, log_message] in chapter_types: + if self.html_preprocess_sections >= self.min_chapters: + break + full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) + self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) + if lookahead_ignorecase: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) + else: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close + chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) + + html = chapdetect.sub(self.chapter_head, html) + + words_per_chptr = wordcount + if words_per_chptr > 0 and self.html_preprocess_sections > 0: + words_per_chptr = wordcount / self.html_preprocess_sections + print "Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters" + + return html + + + def __call__(self, html): self.log("********* Preprocessing HTML *********") + # Count the words in the document to estimate how many chapters to look for and whether + # other types of processing are attempted + totalwords = self.get_word_count(html) + + if totalwords < 10: + print "not enough text, not preprocessing" + return html + # Arrange line feeds and tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*", "\n", html) - html = re.sub(r"\s*[^>]*)>\s*", "\n
"+">", html)
+ html = re.sub(r"\s*(?P