From 0c2ab9e32838933e0b3731f8cca72a0e98c36730 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 17 Dec 2010 02:09:25 -0500 Subject: [PATCH] merged pdf chapter markup with preprocess markup --- src/calibre/ebooks/conversion/preprocess.py | 27 +++++++++++++++------ src/calibre/ebooks/conversion/utils.py | 18 ++++++++++++-- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3385771228..310a636022 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -364,12 +364,15 @@ class HTMLPreProcessor(object): (re.compile(r']+>'), lambda match : ''), # Detect Chapters to match default XPATH in GUI - (re.compile(r'
\s*(?P(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(){0,2})\s*(
\s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), + #(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), # Cover the case where every letter in a chapter title is separated by a space - (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), + #(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), - # Have paragraphs show better - (re.compile(r'<br.*?>'), lambda match : '<p>'), + # Convert line breaks to paragraphs + (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'), + (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'), + (re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'), + # Clean up spaces (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics @@ -455,9 +458,9 @@ class HTMLPreProcessor(object): # delete soft hyphens - moved here so it's executed after header/footer removal if is_pdftohtml: # unwrap/delete soft hyphens - end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(u'[­](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting - end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) # Make the more aggressive chapter marking regex optional with the preprocess option to # reduce false positives and move after header/footer removal @@ -475,7 +478,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: @@ -508,7 +511,15 @@ class HTMLPreProcessor(object): if is_pdftohtml and length > -1: # Dehyphenate dehyphenator = Dehyphenator() - html = dehyphenator(html,'pdf', length) + html = dehyphenator(html,'html', length) + + if is_pdftohtml: + from calibre.ebooks.conversion.utils import PreProcessor + pdf_markup = PreProcessor(self.extra_opts, None) + totalwords = 0 + totalwords = pdf_markup.get_word_count(html) + if totalwords > 7000: + html = pdf_markup.markup_chapters(html, totalwords, True) #dump(html, 'post-preprocess') diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index cda9d9cbba..3fd7f88434 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -18,6 +18,9 @@ class PreProcessor(object): self.found_indents = 0 self.extra_opts = extra_opts + def is_pdftohtml(self, src): + return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] + def chapter_head(self, match): chap = match.group('chap') title = match.group('title') @@ -130,6 +133,15 @@ class PreProcessor(object): chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>" title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>" + is_pdftohtml = self.is_pdftohtml(html) + if is_pdftohtml: + print "this is a pdf" + chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*" + chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>" + title_line_open = "<(?P<outer2>p)[^>]*>\s*" + title_line_close = "\s*</(?P=outer2)>" + + if blanks_between_paragraphs: blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" else: @@ -139,11 +151,13 @@ class PreProcessor(object): n_lookahead_open = "\s+(?!" n_lookahead_close = ")" - default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)" + default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" chapter_types = [ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], - [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering + [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters