mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
merged pdf chapter markup with preprocess markup
This commit is contained in:
parent
272d076bd5
commit
0c2ab9e328
@ -364,12 +364,15 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
||||||
|
|
||||||
# Detect Chapters to match default XPATH in GUI
|
# Detect Chapters to match default XPATH in GUI
|
||||||
(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
|
#(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
|
||||||
# Cover the case where every letter in a chapter title is separated by a space
|
# Cover the case where every letter in a chapter title is separated by a space
|
||||||
(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
|
#(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
|
||||||
|
|
||||||
|
# Convert line breaks to paragraphs
|
||||||
|
(re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
|
||||||
|
(re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
|
||||||
|
(re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
|
||||||
|
|
||||||
# Have paragraphs show better
|
|
||||||
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
|
||||||
# Clean up spaces
|
# Clean up spaces
|
||||||
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||||
# Add space before and after italics
|
# Add space before and after italics
|
||||||
@ -455,9 +458,9 @@ class HTMLPreProcessor(object):
|
|||||||
# delete soft hyphens - moved here so it's executed after header/footer removal
|
# delete soft hyphens - moved here so it's executed after header/footer removal
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
# unwrap/delete soft hyphens
|
# unwrap/delete soft hyphens
|
||||||
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
end_rules.append((re.compile(u'[](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
# unwrap/delete soft hyphens with formatting
|
# unwrap/delete soft hyphens with formatting
|
||||||
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
|
||||||
# Make the more aggressive chapter marking regex optional with the preprocess option to
|
# Make the more aggressive chapter marking regex optional with the preprocess option to
|
||||||
# reduce false positives and move after header/footer removal
|
# reduce false positives and move after header/footer removal
|
||||||
@ -475,7 +478,7 @@ class HTMLPreProcessor(object):
|
|||||||
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||||
end_rules.append(
|
end_rules.append(
|
||||||
# Unwrap using punctuation
|
# Unwrap using punctuation
|
||||||
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
|
|
||||||
for rule in self.PREPROCESS + start_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
@ -508,7 +511,15 @@ class HTMLPreProcessor(object):
|
|||||||
if is_pdftohtml and length > -1:
|
if is_pdftohtml and length > -1:
|
||||||
# Dehyphenate
|
# Dehyphenate
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator()
|
||||||
html = dehyphenator(html,'pdf', length)
|
html = dehyphenator(html,'html', length)
|
||||||
|
|
||||||
|
if is_pdftohtml:
|
||||||
|
from calibre.ebooks.conversion.utils import PreProcessor
|
||||||
|
pdf_markup = PreProcessor(self.extra_opts, None)
|
||||||
|
totalwords = 0
|
||||||
|
totalwords = pdf_markup.get_word_count(html)
|
||||||
|
if totalwords > 7000:
|
||||||
|
html = pdf_markup.markup_chapters(html, totalwords, True)
|
||||||
|
|
||||||
#dump(html, 'post-preprocess')
|
#dump(html, 'post-preprocess')
|
||||||
|
|
||||||
|
@ -18,6 +18,9 @@ class PreProcessor(object):
|
|||||||
self.found_indents = 0
|
self.found_indents = 0
|
||||||
self.extra_opts = extra_opts
|
self.extra_opts = extra_opts
|
||||||
|
|
||||||
|
# Check `src` for the pdftohtml marker comment.
# Returns True when the literal string "<!-- created by calibre's pdftohtml -->"
# occurs in src[:1000], and False otherwise.
def is_pdftohtml(self, src):
|
||||||
|
# Only the leading slice src[:1000] is searched, not the whole of `src`.
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||||
|
|
||||||
def chapter_head(self, match):
|
def chapter_head(self, match):
|
||||||
chap = match.group('chap')
|
chap = match.group('chap')
|
||||||
title = match.group('title')
|
title = match.group('title')
|
||||||
@ -130,6 +133,15 @@ class PreProcessor(object):
|
|||||||
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
||||||
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
|
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
|
||||||
|
|
||||||
|
is_pdftohtml = self.is_pdftohtml(html)
|
||||||
|
if is_pdftohtml:
|
||||||
|
print "this is a pdf"
|
||||||
|
chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
|
||||||
|
chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
|
||||||
|
title_line_open = "<(?P<outer2>p)[^>]*>\s*"
|
||||||
|
title_line_close = "\s*</(?P=outer2)>"
|
||||||
|
|
||||||
|
|
||||||
if blanks_between_paragraphs:
|
if blanks_between_paragraphs:
|
||||||
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
|
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
|
||||||
else:
|
else:
|
||||||
@ -139,11 +151,13 @@ class PreProcessor(object):
|
|||||||
n_lookahead_open = "\s+(?!"
|
n_lookahead_open = "\s+(?!"
|
||||||
n_lookahead_close = ")"
|
n_lookahead_close = ")"
|
||||||
|
|
||||||
default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
|
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
||||||
|
|
||||||
chapter_types = [
|
chapter_types = [
|
||||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
||||||
[r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
||||||
|
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
|
||||||
|
[r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
|
||||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
||||||
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
|
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
|
||||||
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
|
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
|
||||||
|
Loading…
x
Reference in New Issue
Block a user