mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Sync to ldolse heuristics branch
This commit is contained in:
commit
8e29e7046c
@ -492,7 +492,9 @@ OptionRecommendation(name='enable_heuristics',
|
|||||||
OptionRecommendation(name='markup_chapter_headings',
|
OptionRecommendation(name='markup_chapter_headings',
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
help=_('Detect unformatted chapter headings and sub headings. Change '
|
help=_('Detect unformatted chapter headings and sub headings. Change '
|
||||||
'them to h2 and h3 tags.')),
|
'them to h2 and h3 tags. This setting will not create a TOC, '
|
||||||
|
'but can be used in conjunction with structure detection to create '
|
||||||
|
'one.')),
|
||||||
|
|
||||||
OptionRecommendation(name='italicize_common_cases',
|
OptionRecommendation(name='italicize_common_cases',
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
@ -501,7 +503,7 @@ OptionRecommendation(name='italicize_common_cases',
|
|||||||
|
|
||||||
OptionRecommendation(name='fix_indents',
|
OptionRecommendation(name='fix_indents',
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
help=_('Turn indentation created from multiple entities '
|
help=_('Turn indentation created from multiple non-breaking space entities '
|
||||||
'into CSS indents.')),
|
'into CSS indents.')),
|
||||||
|
|
||||||
OptionRecommendation(name='html_unwrap_factor',
|
OptionRecommendation(name='html_unwrap_factor',
|
||||||
|
@ -473,12 +473,6 @@ class HTMLPreProcessor(object):
|
|||||||
# unwrap/delete soft hyphens with formatting
|
# unwrap/delete soft hyphens with formatting
|
||||||
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
|
||||||
# Make the more aggressive chapter marking regex optional with the preprocess option to
|
|
||||||
# reduce false positives and move after header/footer removal
|
|
||||||
if getattr(self.extra_opts, 'preprocess_html', None):
|
|
||||||
if is_pdftohtml:
|
|
||||||
end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
|
|
||||||
|
|
||||||
length = -1
|
length = -1
|
||||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||||
docanalysis = DocAnalysis('pdf', html)
|
docanalysis = DocAnalysis('pdf', html)
|
||||||
@ -525,11 +519,10 @@ class HTMLPreProcessor(object):
|
|||||||
html = dehyphenator(html,'html', length)
|
html = dehyphenator(html,'html', length)
|
||||||
|
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||||
pdf_markup = PreProcessor(self.extra_opts, None)
|
pdf_markup = HeuristicProcessor(self.extra_opts, None)
|
||||||
totalwords = 0
|
totalwords = 0
|
||||||
totalwords = pdf_markup.get_word_count(html)
|
if pdf_markup.get_word_count(html) > 7000:
|
||||||
if totalwords > 7000:
|
|
||||||
html = pdf_markup.markup_chapters(html, totalwords, True)
|
html = pdf_markup.markup_chapters(html, totalwords, True)
|
||||||
|
|
||||||
#dump(html, 'post-preprocess')
|
#dump(html, 'post-preprocess')
|
||||||
|
@ -21,8 +21,10 @@ class HeuristicProcessor(object):
|
|||||||
self.deleted_nbsps = False
|
self.deleted_nbsps = False
|
||||||
self.totalwords = 0
|
self.totalwords = 0
|
||||||
self.min_chapters = 1
|
self.min_chapters = 1
|
||||||
|
self.max_chapters = 150
|
||||||
self.chapters_no_title = 0
|
self.chapters_no_title = 0
|
||||||
self.chapters_with_title = 0
|
self.chapters_with_title = 0
|
||||||
|
self.blanks_deleted = False
|
||||||
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||||
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
|
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
|
||||||
@ -131,7 +133,7 @@ class HeuristicProcessor(object):
|
|||||||
def markup_italicis(self, html):
|
def markup_italicis(self, html):
|
||||||
ITALICIZE_WORDS = [
|
ITALICIZE_WORDS = [
|
||||||
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
|
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
|
||||||
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
|
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
|
||||||
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
|
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
|
||||||
'Mlle.', 'Mons.', 'PS.', 'PPS.',
|
'Mlle.', 'Mons.', 'PS.', 'PPS.',
|
||||||
]
|
]
|
||||||
@ -165,10 +167,12 @@ class HeuristicProcessor(object):
|
|||||||
with minimum false positives. Exits after finding a successful pattern
|
with minimum false positives. Exits after finding a successful pattern
|
||||||
'''
|
'''
|
||||||
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
|
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
|
||||||
# minimum of chapters to search for
|
# minimum of chapters to search for. A max limit is calculated to prevent things like OCR
|
||||||
|
# or pdf page numbers from being treated as TOC markers
|
||||||
if wordcount > 7000:
|
if wordcount > 7000:
|
||||||
self.min_chapters = int(ceil(wordcount / 7000.))
|
self.min_chapters = int(ceil(wordcount / 15000.))
|
||||||
#print "minimum chapters required are: "+str(self.min_chapters)
|
self.max_chapters = int(ceil(wordcount / 1200.))
|
||||||
|
print "minimum chapters required are: "+str(self.min_chapters)
|
||||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||||
self.html_preprocess_sections = len(heading.findall(html))
|
self.html_preprocess_sections = len(heading.findall(html))
|
||||||
self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
||||||
@ -201,44 +205,85 @@ class HeuristicProcessor(object):
|
|||||||
n_lookahead_open = "\s+(?!"
|
n_lookahead_open = "\s+(?!"
|
||||||
n_lookahead_close = ")"
|
n_lookahead_close = ")"
|
||||||
|
|
||||||
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
||||||
|
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
|
||||||
|
|
||||||
|
analysis_result = []
|
||||||
|
|
||||||
chapter_types = [
|
chapter_types = [
|
||||||
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
|
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
|
||||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'],
|
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
|
||||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
|
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
|
||||||
[r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
|
[r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
|
||||||
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
|
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
|
||||||
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
|
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
|
||||||
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
|
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
|
||||||
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
|
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
|
||||||
]
|
]
|
||||||
|
|
||||||
def recurse_patterns(html, analyze):
|
def recurse_patterns(html, analyze):
|
||||||
# Start with most typical chapter headings, get more aggressive until one works
|
# Start with most typical chapter headings, get more aggressive until one works
|
||||||
for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types:
|
for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
|
||||||
|
n_lookahead = ''
|
||||||
|
hits = 0
|
||||||
|
self.chapters_no_title = 0
|
||||||
|
self.chapters_with_title = 0
|
||||||
|
|
||||||
|
if n_lookahead_req:
|
||||||
|
lp_n_lookahead_open = n_lookahead_open
|
||||||
|
lp_n_lookahead_close = n_lookahead_close
|
||||||
|
else:
|
||||||
|
lp_n_lookahead_open = ''
|
||||||
|
lp_n_lookahead_close = ''
|
||||||
|
|
||||||
|
if strict_title:
|
||||||
|
lp_title = default_title
|
||||||
|
else:
|
||||||
|
lp_title = simple_title
|
||||||
|
|
||||||
|
if ignorecase:
|
||||||
|
arg_ignorecase = r'(?i)'
|
||||||
|
else:
|
||||||
|
arg_ignorecase = ''
|
||||||
|
|
||||||
|
if title_req:
|
||||||
|
lp_opt_title_open = ''
|
||||||
|
lp_opt_title_close = ''
|
||||||
|
else:
|
||||||
|
lp_opt_title_open = opt_title_open
|
||||||
|
lp_opt_title_close = opt_title_close
|
||||||
|
|
||||||
if self.html_preprocess_sections >= self.min_chapters:
|
if self.html_preprocess_sections >= self.min_chapters:
|
||||||
break
|
break
|
||||||
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
||||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
if n_lookahead_req:
|
||||||
self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||||
if lookahead_ignorecase:
|
if not analyze:
|
||||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
||||||
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
|
|
||||||
else:
|
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
|
||||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
|
chapdetect = re.compile(r'%s' % chapter_marker)
|
||||||
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
|
|
||||||
if analyze:
|
if analyze:
|
||||||
hits = len(chapdetect.findall(html))
|
hits = len(chapdetect.findall(html))
|
||||||
print unicode(type_name)+" had "+unicode(hits)+" hits"
|
if hits:
|
||||||
chapdetect.sub(self.analyze_title_matches, html)
|
chapdetect.sub(self.analyze_title_matches, html)
|
||||||
print unicode(self.chapters_no_title)+" chapters with no title"
|
if float(self.chapters_with_title) / float(hits) > .5:
|
||||||
print unicode(self.chapters_with_title)+" chapters with titles"
|
title_req = True
|
||||||
|
strict_title = False
|
||||||
|
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
|
||||||
|
print "max chapters is "+str(self.max_chapters)
|
||||||
|
if type_name == 'common':
|
||||||
|
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||||
|
elif self.min_chapters <= hits < self.max_chapters:
|
||||||
|
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||||
|
break
|
||||||
else:
|
else:
|
||||||
html = chapdetect.sub(self.chapter_head, html)
|
html = chapdetect.sub(self.chapter_head, html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
recurse_patterns(html, True)
|
recurse_patterns(html, True)
|
||||||
|
chapter_types = analysis_result
|
||||||
html = recurse_patterns(html, False)
|
html = recurse_patterns(html, False)
|
||||||
|
|
||||||
words_per_chptr = wordcount
|
words_per_chptr = wordcount
|
||||||
@ -292,7 +337,7 @@ class HeuristicProcessor(object):
|
|||||||
pre = re.compile(r'<pre>', re.IGNORECASE)
|
pre = re.compile(r'<pre>', re.IGNORECASE)
|
||||||
if len(pre.findall(html)) >= 1:
|
if len(pre.findall(html)) >= 1:
|
||||||
self.log.debug("Running Text Processing")
|
self.log.debug("Running Text Processing")
|
||||||
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
|
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
|
||||||
html = outerhtml.sub(self.txt_process, html)
|
html = outerhtml.sub(self.txt_process, html)
|
||||||
else:
|
else:
|
||||||
# Add markup naively
|
# Add markup naively
|
||||||
@ -422,6 +467,7 @@ class HeuristicProcessor(object):
|
|||||||
# blank paragraphs then delete blank lines to clean up spacing
|
# blank paragraphs then delete blank lines to clean up spacing
|
||||||
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
||||||
self.log.debug("deleting blank lines")
|
self.log.debug("deleting blank lines")
|
||||||
|
self.blanks_deleted = True
|
||||||
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||||
html = self.blankreg.sub('', html)
|
html = self.blankreg.sub('', html)
|
||||||
|
|
||||||
@ -479,6 +525,9 @@ class HeuristicProcessor(object):
|
|||||||
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
||||||
# Center separator lines
|
# Center separator lines
|
||||||
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
|
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
|
||||||
|
if not self.blanks_deleted:
|
||||||
|
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||||
|
html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
|
||||||
|
|
||||||
if self.deleted_nbsps:
|
if self.deleted_nbsps:
|
||||||
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
||||||
|
@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin):
|
|||||||
from calibre.ebooks.lit.reader import LitReader
|
from calibre.ebooks.lit.reader import LitReader
|
||||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||||
self.log = log
|
self.log = log
|
||||||
return create_oebbook(log, stream, options, self, reader=LitReader)
|
return create_oebbook(log, stream, options, reader=LitReader)
|
||||||
|
|
||||||
def postprocess_book(self, oeb, opts, log):
|
def postprocess_book(self, oeb, opts, log):
|
||||||
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
|
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
|
||||||
@ -39,10 +39,13 @@ class LITInput(InputFormatPlugin):
|
|||||||
body = body[0]
|
body = body[0]
|
||||||
if len(body) == 1 and body[0].tag == XHTML('pre'):
|
if len(body) == 1 and body[0].tag == XHTML('pre'):
|
||||||
pre = body[0]
|
pre = body[0]
|
||||||
from calibre.ebooks.txt.processor import convert_basic
|
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
||||||
|
separate_paragraphs_single_line
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import copy
|
import copy
|
||||||
html = convert_basic(pre.text).replace('<html>',
|
html = separate_paragraphs_single_line(pre.text)
|
||||||
|
html = preserve_spaces(html)
|
||||||
|
html = convert_basic(html).replace('<html>',
|
||||||
'<html xmlns="%s">'%XHTML_NS)
|
'<html xmlns="%s">'%XHTML_NS)
|
||||||
root = etree.fromstring(html)
|
root = etree.fromstring(html)
|
||||||
body = XPath('//h:body')(root)
|
body = XPath('//h:body')(root)
|
||||||
|
@ -255,6 +255,46 @@ you are producing are meant for a particular device type, choose the correspondi
|
|||||||
|
|
||||||
The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to fit the screen in some output formats. So choose a profile of a device that has a screen size similar to your device.
|
The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to fit the screen in some output formats. So choose a profile of a device that has a screen size similar to your device.
|
||||||
|
|
||||||
|
.. _heuristic-processing:
|
||||||
|
|
||||||
|
Heuristic Processing
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
:guilabel:`Preprocess input`
|
||||||
|
This option activates various algorithms that try to detect and correct common cases of
|
||||||
|
badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
|
||||||
|
Turn this option on if your input document suffers from bad formatting. But be aware that in
|
||||||
|
some cases, this option can lead to worse results, so use with care.
|
||||||
|
|
||||||
|
:guilabel:`Line-unwrap factor`
|
||||||
|
This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
|
||||||
|
option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
|
||||||
|
than the length of 40% of all lines in the document.
|
||||||
|
|
||||||
|
:guilabel:`Unwrap lines`
|
||||||
|
Lorem ipsum
|
||||||
|
|
||||||
|
:guilabel:`Detect and markup unformatted chapter headings and sub headings`
|
||||||
|
Lorem ipsum
|
||||||
|
|
||||||
|
:guilabel:`Renumber sequences of <h1> or <h2> tags to prevent splitting`
|
||||||
|
Lorem ipsum
|
||||||
|
|
||||||
|
:guilabel:`Delete blank lines between paragraphs`
|
||||||
|
Lorem ipsum
|
||||||
|
|
||||||
|
:guilabel:`Ensure scene breaks are consistently formatted`
|
||||||
|
Lorem ipsum
|
||||||
|
|
||||||
|
:guilabel:`Remove unnecessary hyphens`
|
||||||
|
Lorem ipsum
|
||||||
|
|
||||||
|
:guilabel:`Italicize common words and patterns`
|
||||||
|
Lorem ipsum
|
||||||
|
|
||||||
|
:guilabel:`Replace entity indents with CSS indents`
|
||||||
|
Lorem ipsum
|
||||||
|
|
||||||
.. _structure-detection:
|
.. _structure-detection:
|
||||||
|
|
||||||
Structure Detection
|
Structure Detection
|
||||||
@ -330,16 +370,6 @@ There are a few more options in this section.
|
|||||||
two covers. This option will simply remove the first image from the source document, thereby
|
two covers. This option will simply remove the first image from the source document, thereby
|
||||||
ensuring that the converted book has only one cover, the one specified in |app|.
|
ensuring that the converted book has only one cover, the one specified in |app|.
|
||||||
|
|
||||||
:guilabel:`Preprocess input`
|
|
||||||
This option activates various algorithms that try to detect and correct common cases of
|
|
||||||
badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
|
|
||||||
Turn this option on if your input document suffers from bad formatting. But be aware that in
|
|
||||||
some cases, this option can lead to worse results, so use with care.
|
|
||||||
|
|
||||||
:guilabel:`Line-unwrap factor`
|
|
||||||
This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
|
|
||||||
option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
|
|
||||||
than the length of 40% of all lines in the document.
|
|
||||||
|
|
||||||
Table of Contents
|
Table of Contents
|
||||||
------------------
|
------------------
|
||||||
@ -500,14 +530,18 @@ more blank lines are a paragraph boundary::
|
|||||||
|
|
||||||
TXT input supports a number of options to differentiate how paragraphs are detected.
|
TXT input supports a number of options to differentiate how paragraphs are detected.
|
||||||
|
|
||||||
:guilabel:`Treat each line as a paragraph`
|
:guilabel:`Paragraph Style: Auto`
|
||||||
|
Analyzes the text file and attempts to automatically determine how paragraphs are defined. This
|
||||||
|
option will generally work fine; if you get undesirable results, try one of the manual options.
|
||||||
|
|
||||||
|
:guilabel:`Paragraph Style: Single`
|
||||||
Assumes that every line is a paragraph::
|
Assumes that every line is a paragraph::
|
||||||
|
|
||||||
This is the first.
|
This is the first.
|
||||||
This is the second.
|
This is the second.
|
||||||
This is the third.
|
This is the third.
|
||||||
|
|
||||||
:guilabel:`Assume print formatting`
|
:guilabel:`Paragraph Style: Print`
|
||||||
Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
|
Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
|
||||||
the next line that starts with an indent is reached::
|
the next line that starts with an indent is reached::
|
||||||
|
|
||||||
@ -518,6 +552,14 @@ TXT input supports a number of options to differentiate how paragraphs are detec
|
|||||||
This is the
|
This is the
|
||||||
third.
|
third.
|
||||||
|
|
||||||
|
:guilabel:`Paragraph Style: Unformatted`
|
||||||
|
Assumes that the document has no formatting, but does use hard line breaks. Punctuation
|
||||||
|
and median line length are used to attempt to re-create paragraphs.
|
||||||
|
|
||||||
|
:guilabel:`Formatting Style: Auto`
|
||||||
|
|
||||||
|
:guilabel:`Formatting Style: Heuristic`
|
||||||
|
|
||||||
:guilabel:`Process using markdown`
|
:guilabel:`Process using markdown`
|
||||||
|app| also supports running TXT input through a transformation preprocessor known as markdown. Markdown
|
|app| also supports running TXT input through a transformation preprocessor known as markdown. Markdown
|
||||||
allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
|
allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user