mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00

commit ed5bb2390a
oops
@@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
         '''
         raise NotImplementedError()

-    def preprocess_html(self, opts, html):
-        '''
-        This method is called by the conversion pipeline on all HTML before it
-        is parsed. It is meant to be used to do any required preprocessing on
-        the HTML, like removing hard line breaks, etc.
-
-        :param html: A unicode string
-        :return: A unicode string
-        '''
-        return html
-
-
     def convert(self, stream, options, file_ext, log, accelerators):
         '''
         This method must be implemented in sub-classes. It must return
@@ -75,7 +75,7 @@ class CHMInput(InputFormatPlugin):
     def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
         from calibre.ebooks.conversion.plumber import create_oebbook
         from calibre.ebooks.oeb.base import DirContainer
-        oeb = create_oebbook(log, None, opts, self,
+        oeb = create_oebbook(log, None, opts,
                 encoding=opts.input_encoding, populate=False)
         self.oeb = oeb

@@ -126,8 +126,27 @@ def add_pipeline_options(parser, plumber):
               'margin_top', 'margin_left', 'margin_right',
               'margin_bottom', 'change_justification',
               'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
-              'asciiize', 'remove_header', 'header_regex',
-              'remove_footer', 'footer_regex',
+              'asciiize',
+              ]
+              ),
+
+              'HEURISTIC PROCESSING' : (
+              _('Modify the document text and structure using common patterns.'),
+              [
+              'enable_heuristics', 'markup_chapter_headings',
+              'italicize_common_cases', 'fix_indents',
+              'html_unwrap_factor', 'unwrap_lines',
+              'delete_blank_paragraphs', 'format_scene_breaks',
+              'dehyphenate', 'renumber_headings',
+              ]
+              ),
+
+              'SEARCH AND REPLACE' : (
+              _('Modify the document text and structure using user defined patterns.'),
+              [
+              'sr1_search', 'sr1_replace',
+              'sr2_search', 'sr2_replace',
+              'sr3_search', 'sr3_replace',
               ]
               ),

@@ -137,7 +156,6 @@ def add_pipeline_options(parser, plumber):
               'chapter', 'chapter_mark',
               'prefer_metadata_cover', 'remove_first_image',
               'insert_metadata', 'page_breaks_before',
-              'preprocess_html', 'html_unwrap_factor',
               ]
               ),

@@ -164,7 +182,8 @@ def add_pipeline_options(parser, plumber):

     }

-    group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION',
+    group_order = ['', 'LOOK AND FEEL', 'HEURISTIC PROCESSING',
+                   'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
                    'TABLE OF CONTENTS', 'METADATA', 'DEBUG']

     for group in group_order:
@@ -376,23 +376,6 @@ OptionRecommendation(name='insert_metadata',
             )
         ),

-OptionRecommendation(name='preprocess_html',
-            recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Attempt to detect and correct hard line breaks and other '
-                'problems in the source file. This may make things worse, so use '
-                'with care.'
-            )
-        ),
-
-OptionRecommendation(name='html_unwrap_factor',
-            recommended_value=0.40, level=OptionRecommendation.LOW,
-            help=_('Scale used to determine the length at which a line should '
-                'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
-                'default is 0.40, just below the median line length. This will unwrap typical books '
-                ' with hard line breaks, but should be reduced if the line length is variable.'
-            )
-        ),
-
 OptionRecommendation(name='smarten_punctuation',
             recommended_value=False, level=OptionRecommendation.LOW,
             help=_('Convert plain quotes, dashes and ellipsis to their '
@@ -401,32 +384,6 @@ OptionRecommendation(name='smarten_punctuation',
             )
         ),

-OptionRecommendation(name='remove_header',
-            recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Use a regular expression to try and remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='header_regex',
-            recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-            level=OptionRecommendation.LOW,
-            help=_('The regular expression to use to remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='remove_footer',
-            recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Use a regular expression to try and remove the footer.'
-            )
-        ),
-
-OptionRecommendation(name='footer_regex',
-            recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-            level=OptionRecommendation.LOW,
-            help=_('The regular expression to use to remove the footer.'
-            )
-        ),
-
 OptionRecommendation(name='read_metadata_from_opf',
             recommended_value=None, level=OptionRecommendation.LOW,
             short_switch='m',
@@ -527,6 +484,89 @@ OptionRecommendation(name='timestamp',
             recommended_value=None, level=OptionRecommendation.LOW,
             help=_('Set the book timestamp (used by the date column in calibre).')),

+OptionRecommendation(name='enable_heuristics',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Enable heurisic processing. This option must be set for any '
+                   'heuristic processing to take place.')),
+
+OptionRecommendation(name='markup_chapter_headings',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Detect unformatted chapter headings and sub headings. Change '
+                   'them to h2 and h3 tags. This setting will not create a TOC, '
+                   'but can be used in conjunction with structure detection to create '
+                   'one.')),
+
+OptionRecommendation(name='italicize_common_cases',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Look for common words and patterns that denote '
+                   'italics and italicize them.')),
+
+OptionRecommendation(name='fix_indents',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Turn indentation created from multiple non-breaking space entities '
+                   'into CSS indents.')),
+
+OptionRecommendation(name='html_unwrap_factor',
+            recommended_value=0.40, level=OptionRecommendation.LOW,
+            help=_('Scale used to determine the length at which a line should '
+                   'be unwrapped. Valid values are a decimal between 0 and 1. The '
+                   'default is 0.4, just below the median line length. If only a '
+                   'few lines in the document require unwrapping this value should '
+                   'be reduced')),
+
+OptionRecommendation(name='unwrap_lines',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Unwrap lines using punctuation and other formatting clues.')),
+
+OptionRecommendation(name='delete_blank_paragraphs',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Remove empty paragraphs from the document when they exist between '
+                   'every other paragraph')),
+
+OptionRecommendation(name='format_scene_breaks',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('left aligned scene break markers are center aligned. '
+                   'Replace soft scene breaks that use multiple blank lines with'
+                   'horizontal rules.')),
+
+OptionRecommendation(name='dehyphenate',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Analyses hyphenated words throughout the document. The '
+                   'document itself is used as a dictionary to determine whether hyphens '
+                   'should be retained or removed.')),
+
+OptionRecommendation(name='renumber_headings',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Looks for occurences of sequential <h1> or <h2> tags. '
+                   'The tags are renumbered to prevent splitting in the middle '
+                   'of chapter headings.')),
+
+OptionRecommendation(name='sr1_search',
+            recommended_value='', level=OptionRecommendation.LOW,
+            help=_('Search pattern (regular expression) to be replaced with '
+                   'sr1-replace.')),
+
+OptionRecommendation(name='sr1_replace',
+            recommended_value='', level=OptionRecommendation.LOW,
+            help=_('Replace characters to replace the text found with sr1-search.')),
+
+OptionRecommendation(name='sr2_search',
+            recommended_value='', level=OptionRecommendation.LOW,
+            help=_('Search pattern (regular expression) to be replaced with '
+                   'sr2-replace.')),
+
+OptionRecommendation(name='sr2_replace',
+            recommended_value='', level=OptionRecommendation.LOW,
+            help=_('Replace characters to replace the text found with sr2-search.')),
+
+OptionRecommendation(name='sr3_search',
+            recommended_value='', level=OptionRecommendation.LOW,
+            help=_('Search pattern (regular expression) to be replaced with '
+                   'sr3-replace.')),
+
+OptionRecommendation(name='sr3_replace',
+            recommended_value='', level=OptionRecommendation.LOW,
+            help=_('Replace characters to replace the text found with sr3-search.')),
 ]
 # }}}

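
Each recommendation above follows the same four-field pattern, and enable_heuristics is the master switch that the preprocessor checks before doing any heuristic work (see the HTMLPreProcessor hunk further down). As a reading aid, here is one of the new entries constructed standalone; a minimal sketch only, assuming OptionRecommendation is importable from calibre.customize.conversion and using a plain string in place of the _() translation wrapper:

    # Sketch: the anatomy of one of the new recommendations, outside plumber.py.
    from calibre.customize.conversion import OptionRecommendation

    rec = OptionRecommendation(
            name='enable_heuristics',          # option name seen by the rest of the pipeline
            recommended_value=False,           # heuristics stay off unless explicitly enabled
            level=OptionRecommendation.LOW,    # LOW lets user or plugin settings override the default
            help='Enable heuristic processing. This option must be set for any '
                 'heuristic processing to take place.')
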
@@ -861,7 +901,6 @@ OptionRecommendation(name='timestamp',
         self.opts_to_mi(self.user_metadata)
         if not hasattr(self.oeb, 'manifest'):
             self.oeb = create_oebbook(self.log, self.oeb, self.opts,
-                    self.input_plugin,
                     encoding=self.input_plugin.output_encoding)
         self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
         self.opts.is_image_collection = self.input_plugin.is_image_collection
@@ -971,14 +1010,13 @@ OptionRecommendation(name='timestamp',
         self.log(self.output_fmt.upper(), 'output written to', self.output)
         self.flush()

-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
         encoding='utf-8', populate=True):
     '''
     Create an OEBBook.
     '''
     from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
-            opts.preprocess_html, opts)
+    html_preprocessor = HTMLPreProcessor(log, opts)
     if not encoding:
         encoding = None
     oeb = OEBBook(log, html_preprocessor,
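
For callers, the visible effect of this hunk is that create_oebbook() no longer receives the input plugin and HTMLPreProcessor is now built from a log and the conversion options alone. A minimal sketch of the new call shape, mirroring the CHMInput hunk above (log and opts stand in for whatever objects the caller already holds):

    # Sketch: the new create_oebbook calling convention.
    from calibre.ebooks.conversion.plumber import create_oebbook

    oeb = create_oebbook(log, None, opts,                   # the input_plugin argument is gone
            encoding=opts.input_encoding, populate=False)   # keyword arguments are unchanged
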
@@ -174,13 +174,19 @@ class Dehyphenator(object):
     retain hyphens.
     '''

-    def __init__(self):
+    def __init__(self, verbose=0, log=None):
+        self.log = log
+        self.verbose = verbose
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)

     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
@@ -191,31 +197,44 @@ class Dehyphenator(object):
             wraptags = ''
         hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
         dehyphenated = unicode(firsthalf) + unicode(secondhalf)
-        lookupword = self.removesuffixes.sub('', dehyphenated)
-        if self.prefixes.match(firsthalf) is None:
+        if self.suffixes.match(secondhalf) is None:
+            lookupword = self.removesuffixes.sub('', dehyphenated)
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.verbose > 2:
+            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
         try:
             searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
         if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returned dehyphenated word: " + str(dehyphenated))
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returned hyphenated word: " + str(hyphenated))
                 return hyphenated
             else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
                 return firsthalf+u'\u2014'+wraptags+secondhalf

         else:
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log(" returned dehyphenated word: " + str(dehyphenated))
                 return dehyphenated
             else:
-                #print " returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log(" returned hyphenated word: " + str(hyphenated))
                 return hyphenated

     def __call__(self, html, format, length=1):
@@ -228,7 +247,7 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)')
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         elif format == 'txt_cleanup':
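
With the constructor change above, callers now hand the Dehyphenator a verbosity level and a logger, while the call signature (html, format, length) is unchanged and the format string still selects the pattern. A hedged usage sketch; the sample text and length are made up, and whether a hyphen is dropped depends on whether the joined word (or its stem) occurs elsewhere in the same text:

    # Sketch: dehyphenating a plain-text flow with the new constructor arguments.
    from calibre.utils.logging import default_log
    from calibre.ebooks.conversion.preprocess import Dehyphenator

    text = u"The construction crew arrived early; the con-\nstruction schedule had slipped."
    dehyphenator = Dehyphenator(verbose=3, log=default_log)  # verbose > 2 turns on the new self.log() tracing
    fixed = dehyphenator(text, 'txt', length=20)             # 'txt' selects the line-wrap pattern shown above
    # here "construction" also appears unhyphenated, so the wrapped hyphen should be removed
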
@@ -397,10 +416,8 @@ class HTMLPreProcessor(object):
                   (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                    lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                  ]
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
+    def __init__(self, log=None, extra_opts=None):
+        self.log = log
         self.extra_opts = extra_opts

     def is_baen(self, src):
@@ -436,27 +453,19 @@ class HTMLPreProcessor(object):
         if not getattr(self.extra_opts, 'keep_ligatures', False):
             html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)

+        for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
+            search_pattern = getattr(self.extra_opts, search, '')
+            if search_pattern:
+                try:
+                    search_re = re.compile(search_pattern)
+                    replace_txt = getattr(self.extra_opts, replace, '')
+                    if replace_txt == None:
+                        replace_txt = ''
+                    rules.insert(0, (search_re, replace_txt))
+                except Exception as e:
+                    self.log.error('Failed to parse %s regexp because %s' % (search, e))
+
         end_rules = []
-        if getattr(self.extra_opts, 'remove_header', None):
-            try:
-                rules.insert(0,
-                        (re.compile(self.extra_opts.header_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_header regexp'
-                traceback.print_exc()
-
-        if getattr(self.extra_opts, 'remove_footer', None):
-            try:
-                rules.insert(0,
-                        (re.compile(self.extra_opts.footer_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_footer regexp'
-                traceback.print_exc()
-
         # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
             # unwrap/delete soft hyphens
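
The loop added above turns each non-empty srN_search/srN_replace pair into a rule at the front of the rules list; because it iterates sr3, sr2, sr1 and always inserts at index 0, sr1 ends up first in the list and is applied before sr2 and sr3. A self-contained sketch of the same mechanism (the FakeOpts container and the sample patterns are illustrative, not calibre code):

    # Sketch of the insertion order used above: later inserts at index 0 end up first.
    import re

    class FakeOpts(object):
        sr1_search, sr1_replace = r'\bcolour\b', 'color'
        sr2_search, sr2_replace = '', ''           # empty pattern: skipped by the loop
        sr3_search, sr3_replace = r'--', u'\u2014'

    rules = []
    for search, replace in [['sr3_search', 'sr3_replace'],
                            ['sr2_search', 'sr2_replace'],
                            ['sr1_search', 'sr1_replace']]:
        pattern = getattr(FakeOpts, search, '')
        if pattern:
            rules.insert(0, (re.compile(pattern), getattr(FakeOpts, replace, '') or ''))

    html = '<p>The colour changed -- instantly.</p>'
    for regex, txt in rules:
        html = regex.sub(txt, html)
    # html is now u'<p>The color changed \u2014 instantly.</p>'
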
@@ -464,12 +473,6 @@ class HTMLPreProcessor(object):
             # unwrap/delete soft hyphens with formatting
             end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))

-            # Make the more aggressive chapter marking regex optional with the preprocess option to
-            # reduce false positives and move after header/footer removal
-            if getattr(self.extra_opts, 'preprocess_html', None):
-                if is_pdftohtml:
-                    end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
-
         length = -1
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             docanalysis = DocAnalysis('pdf', html)
@@ -512,15 +515,14 @@ class HTMLPreProcessor(object):

         if is_pdftohtml and length > -1:
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html', length)

         if is_pdftohtml:
-            from calibre.ebooks.conversion.utils import PreProcessor
-            pdf_markup = PreProcessor(self.extra_opts, None)
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            pdf_markup = HeuristicProcessor(self.extra_opts, None)
             totalwords = 0
-            totalwords = pdf_markup.get_word_count(html)
-            if totalwords > 7000:
+            if pdf_markup.get_word_count(html) > 7000:
                 html = pdf_markup.markup_chapters(html, totalwords, True)

         #dump(html, 'post-preprocess')
@@ -540,8 +542,10 @@ class HTMLPreProcessor(object):
             unidecoder = Unidecoder()
             html = unidecoder.decode(html)

-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(self.extra_opts, html)
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)

         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)
@@ -11,13 +11,22 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj

-class PreProcessor(object):
+class HeuristicProcessor(object):

     def __init__(self, extra_opts=None, log=None):
         self.log = default_log if log is None else log
         self.html_preprocess_sections = 0
         self.found_indents = 0
         self.extra_opts = extra_opts
+        self.deleted_nbsps = False
+        self.totalwords = 0
+        self.min_chapters = 1
+        self.chapters_no_title = 0
+        self.chapters_with_title = 0
+        self.blanks_deleted = False
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)

     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -27,12 +36,12 @@ class PreProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'

@@ -40,10 +49,18 @@ class PreProcessor(object):
         chap = match.group('section')
         styles = match.group('styles')
         self.html_preprocess_sections = self.html_preprocess_sections + 1
-        self.log("marked " + unicode(self.html_preprocess_sections) +
+        self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                 " section markers based on punctuation. - " + unicode(chap))
         return '<'+styles+' style="page-break-before:always">'+chap

+    def analyze_title_matches(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.chapters_no_title = self.chapters_no_title + 1
+        else:
+            self.chapters_with_title = self.chapters_with_title + 1
+
     def insert_indent(self, match):
         pstyle = match.group('formatting')
         span = match.group('span')
@@ -75,8 +92,8 @@ class PreProcessor(object):
         line_end = line_end_ere.findall(raw)
         tot_htm_ends = len(htm_end)
         tot_ln_fds = len(line_end)
-        self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
-                unicode(tot_htm_ends) + " marked up endings")
+        #self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+        #        unicode(tot_htm_ends) + " marked up endings")

         if percent > 1:
             percent = 1
@@ -84,7 +101,7 @@ class PreProcessor(object):
             percent = 0

         min_lns = tot_ln_fds * percent
-        self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
+        #self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
         if min_lns > tot_htm_ends:
             return True

@@ -112,16 +129,55 @@ class PreProcessor(object):
         wordcount = get_wordcount_obj(word_count_text)
         return wordcount.words

+    def markup_italicis(self, html):
+        ITALICIZE_WORDS = [
+                'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+                'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
+                'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+                'Mlle.', 'Mons.', 'PS.', 'PPS.',
+                ]
+
+        ITALICIZE_STYLE_PATS = [
+                r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',
+                r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',
+                r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=\s)',
+                r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=\s)',
+                r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=\s)',
+                r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=\s)',
+                r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=\s)',
+                r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=\s)',
+                r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=\s)',
+                r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)',
+                r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)',
+                ]
+
+        for word in ITALICIZE_WORDS:
+            html = html.replace(word, '<i>%s</i>' % word)
+
+        for pat in ITALICIZE_STYLE_PATS:
+            html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
+
+        return html
+
     def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        '''
+        Searches for common chapter headings throughout the document
+        attempts multiple patterns based on likelihood of a match
+        with minimum false positives. Exits after finding a successful pattern
+        '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
-        # minimum of chapters to search for
-        self.min_chapters = 1
+        # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
+        # or pdf page numbers from being treated as TOC markers
+        max_chapters = 150
+        typical_chapters = 7000.
         if wordcount > 7000:
-            self.min_chapters = int(ceil(wordcount / 7000.))
-            #print "minimum chapters required are: "+str(self.min_chapters)
+            if wordcount > 200000:
+                typical_chapters = 15000.
+            self.min_chapters = int(ceil(wordcount / typical_chapters))
+        self.log.debug("minimum chapters required are: "+str(self.min_chapters))
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+        self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")

         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
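
The new markup_italicis() method needs nothing from the options object, so it is easy to exercise in isolation. A hedged example; the sample string and the expected result are illustrative, not taken from the commit:

    # Sketch: the new italics heuristic applied to plain-ASCII conventions.
    from calibre.ebooks.conversion.utils import HeuristicProcessor

    proc = HeuristicProcessor(extra_opts=None, log=None)
    result = proc.markup_italicis('leaning _tower_ of pisa, etc. and more')
    # expected: 'leaning <i>tower</i> of pisa, <i>etc.</i> and more'
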
@@ -151,103 +207,160 @@ class PreProcessor(object):
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"

-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
+
+        analysis_result = []
+
         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
-            [r"([A-Z-]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
-            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
             ]

+        def recurse_patterns(html, analyze):
             # Start with most typical chapter headings, get more aggressive until one works
-        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+            for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
+                n_lookahead = ''
+                hits = 0
+                self.chapters_no_title = 0
+                self.chapters_with_title = 0
+
+                if n_lookahead_req:
+                    lp_n_lookahead_open = n_lookahead_open
+                    lp_n_lookahead_close = n_lookahead_close
+                else:
+                    lp_n_lookahead_open = ''
+                    lp_n_lookahead_close = ''
+
+                if strict_title:
+                    lp_title = default_title
+                else:
+                    lp_title = simple_title
+
+                if ignorecase:
+                    arg_ignorecase = r'(?i)'
+                else:
+                    arg_ignorecase = ''
+
+                if title_req:
+                    lp_opt_title_open = ''
+                    lp_opt_title_close = ''
+                else:
+                    lp_opt_title_open = opt_title_open
+                    lp_opt_title_close = opt_title_close
+
                 if self.html_preprocess_sections >= self.min_chapters:
                     break
                 full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+                if n_lookahead_req:
                     n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-            if lookahead_ignorecase:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+                if not analyze:
+                    self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+
+                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker)
+
+                if analyze:
+                    hits = len(chapdetect.findall(html))
+                    if hits:
+                        chapdetect.sub(self.analyze_title_matches, html)
+                        if float(self.chapters_with_title) / float(hits) > .5:
+                            title_req = True
+                            strict_title = False
+                        self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        if type_name == 'common':
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                        elif self.min_chapters <= hits < max_chapters:
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                            break
                 else:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
                     html = chapdetect.sub(self.chapter_head, html)
+            return html
+
+        recurse_patterns(html, True)
+        chapter_types = analysis_result
+        html = recurse_patterns(html, False)

         words_per_chptr = wordcount
         if words_per_chptr > 0 and self.html_preprocess_sections > 0:
             words_per_chptr = wordcount / self.html_preprocess_sections
-        self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+        self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
         return html

     def punctuation_unwrap(self, length, content, format):
+        '''
+        Unwraps lines based on line length and punctuation
+        supports a range of html markup and text files
+        '''
         # define the pieces of the regex
-        lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
-        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
+        soft_hyphen = u"\xad"
+        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
-        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
         txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"

         unwrap_regex = lookahead+line_ending+blanklines+line_opening
+        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
+
         if format == 'txt':
             unwrap_regex = lookahead+txt_line_wrap
+            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
+            shy_unwrap_regex = soft_hyphen+txt_line_wrap

         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
+
         content = unwrap.sub(' ', content)
+        content = em_en_unwrap.sub('', content)
+        content = shy_unwrap.sub('', content)
         return content

-    def __call__(self, html):
-        self.log("********* Preprocessing HTML *********")
-
-        # Count the words in the document to estimate how many chapters to look for and whether
-        # other types of processing are attempted
-        totalwords = 0
-        totalwords = self.get_word_count(html)
-
-        if totalwords < 50:
-            self.log("not enough text, not preprocessing")
-            return html
-
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
-        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
-
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt processor to mark up if so
-            pre = re.compile(r'<pre>', re.IGNORECASE)
-            if len(pre.findall(html)) == 1:
-                self.log("Running Text Processing")
+    def txt_process(self, match):
         from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
             separate_paragraphs_single_line
-                outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-                html = outerhtml.sub('\g<text>', html)
-                html = separate_paragraphs_single_line(html)
-                html = preserve_spaces(html)
-                html = convert_basic(html, epub_split_size_kb=0)
+        content = match.group('text')
+        content = separate_paragraphs_single_line(content)
+        content = preserve_spaces(content)
+        content = convert_basic(content, epub_split_size_kb=0)
+        return content

+    def markup_pre(self, html):
+        pre = re.compile(r'<pre>', re.IGNORECASE)
+        if len(pre.findall(html)) >= 1:
+            self.log.debug("Running Text Processing")
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
+            html = outerhtml.sub(self.txt_process, html)
         else:
             # Add markup naively
             # TODO - find out if there are cases where there are more than one <pre> tag or
             # other types of unmarked html and handle them in some better fashion
             add_markup = re.compile('(?<!>)(\n)')
             html = add_markup.sub('</p>\n<p>', html)
+        return html

-        ###### Mark Indents/Cleanup ######
-        #
-        # Replace series of non-breaking spaces with text-indent
+    def arrange_htm_line_endings(self, html):
+        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
+        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
+        return html
+
+    def fix_nbsp_indents(self, html):
         txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
         html = txtindent.sub(self.insert_indent, html)
         if self.found_indents > 1:
-            self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+            self.log.debug("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+        return html
+
+    def cleanup_markup(self, html):
         # remove remaining non-breaking spaces
         html = re.sub(ur'\u00a0', ' ', html)
         # Get rid of various common microsoft specific tags which can cause issues later
@ -255,108 +368,166 @@ class PreProcessor(object):
|
|||||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||||
# Delete microsoft 'smart' tags
|
# Delete microsoft 'smart' tags
|
||||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||||
# Get rid of empty span, bold, & italics tags
|
# Get rid of empty span, bold, font, em, & italics tags
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
|
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
# ADE doesn't render <br />, change to empty paragraphs
|
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
||||||
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
self.deleted_nbsps = True
|
||||||
|
return html
|
||||||
|
|
||||||
# If more than 40% of the lines are empty paragraphs and the user has enabled remove
|
def analyze_line_endings(self, html):
|
||||||
# paragraph spacing then delete blank lines to clean up spacing
|
'''
|
||||||
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
determines the type of html line ending used most commonly in a document
|
||||||
blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
use before calling docanalysis functions
|
||||||
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
|
'''
|
||||||
blanklines = blankreg.findall(html)
|
|
||||||
lines = linereg.findall(html)
|
|
||||||
blanks_between_paragraphs = False
|
|
||||||
if len(lines) > 1:
|
|
||||||
self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
|
|
||||||
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
|
||||||
if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
|
|
||||||
'remove_paragraph_spacing', False):
|
|
||||||
self.log("deleting blank lines")
|
|
||||||
html = blankreg.sub('', html)
|
|
||||||
elif float(len(blanklines)) / float(len(lines)) > 0.40:
|
|
||||||
blanks_between_paragraphs = True
|
|
||||||
#print "blanks between paragraphs is marked True"
|
|
||||||
else:
|
|
||||||
blanks_between_paragraphs = False
|
|
||||||
|
|
||||||
#self.dump(html, 'before_chapter_markup')
|
|
||||||
# detect chapters/sections to match xpath or splitting logic
|
|
||||||
#
|
|
||||||
|
|
||||||
html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
|
|
||||||
|
|
||||||
|
|
||||||
###### Unwrap lines ######
|
|
||||||
#
|
|
||||||
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
|
||||||
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
|
||||||
# that lines can be un-wrapped across page boundaries
|
|
||||||
paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
|
paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
|
||||||
spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
|
spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
|
||||||
paras = len(paras_reg.findall(html))
|
paras = len(paras_reg.findall(html))
|
||||||
spans = len(spans_reg.findall(html))
|
spans = len(spans_reg.findall(html))
|
||||||
if spans > 1:
|
if spans > 1:
|
||||||
if float(paras) / float(spans) < 0.75:
|
if float(paras) / float(spans) < 0.75:
|
||||||
format = 'spanned_html'
|
return 'spanned_html'
|
||||||
else:
|
else:
|
||||||
format = 'html'
|
return 'html'
|
||||||
else:
|
else:
|
||||||
format = 'html'
|
return 'html'
|
||||||
|
|
||||||
|
def analyze_blanks(self, html):
|
||||||
|
blanklines = self.blankreg.findall(html)
|
||||||
|
lines = self.linereg.findall(html)
|
||||||
|
if len(lines) > 1:
|
||||||
|
self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
|
||||||
|
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
||||||
|
|
||||||
|
if float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def cleanup_required(self):
|
||||||
|
for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
|
||||||
|
if getattr(self.extra_opts, option, False):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
+    def __call__(self, html):
+        self.log.debug("********* Heuristic processing HTML *********")
+
+        # Count the words in the document to estimate how many chapters to look for and whether
+        # other types of processing are attempted
+        try:
+            self.totalwords = self.get_word_count(html)
+        except:
+            self.log.warn("Can't get wordcount")
+
+        if self.totalwords < 50:
+            self.log.warn("flow is too short, not running heuristics")
+            return html
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = self.arrange_htm_line_endings(html)
+
+        if self.cleanup_required():
+            ###### Check Markup ######
+            #
+            # some lit files don't have any <p> tags or equivalent (generally just plain text between
+            # <pre> tags), check and mark up line endings if required before proceeding
+            # fix indents must run after this step
+            if self.no_markup(html, 0.1):
+                self.log.debug("not enough paragraph markers, adding now")
+                # markup using text processing
+                html = self.markup_pre(html)
+
+        # Replace series of non-breaking spaces with text-indent
+        if getattr(self.extra_opts, 'fix_indents', False):
+            html = self.fix_nbsp_indents(html)
+
+        if self.cleanup_required():
+            # fix indents must run before this step, as it removes non-breaking spaces
+            html = self.cleanup_markup(html)
+
+        # ADE doesn't render <br />, change to empty paragraphs
+        #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
+
+        # Determine whether the document uses interleaved blank lines
+        blanks_between_paragraphs = self.analyze_blanks(html)
+
+        #self.dump(html, 'before_chapter_markup')
+        # detect chapters/sections to match xpath or splitting logic
+
+        if getattr(self.extra_opts, 'markup_chapter_headings', False):
+            html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
+
+        if getattr(self.extra_opts, 'italicize_common_cases', False):
+            html = self.markup_italicis(html)
+
+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
+        if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
+            self.log.debug("deleting blank lines")
+            self.blanks_deleted = True
+            html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = self.blankreg.sub('', html)
+
+        # Determine line ending type
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        format = self.analyze_line_endings(html)
+
        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
-       self.log("Hard line breaks check returned "+unicode(hardbreaks))
+       self.log.debug("Hard line breaks check returned "+unicode(hardbreaks))

        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
-       self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+       self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")

+       ###### Unwrap lines ######
+       if getattr(self.extra_opts, 'unwrap_lines', False):
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
-               self.log("Unwrapping required, unwrapping Lines")
+               self.log.debug("Unwrapping required, unwrapping Lines")
-               # Unwrap em/en dashes
-               html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
-               # Dehyphenate
-               self.log("Unwrapping/Removing hyphens")
-               dehyphenator = Dehyphenator()
+               # Dehyphenate with line length limiters
+               dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html,'html', length)
-               self.log("Done dehyphenating")
-               # Unwrap lines using punctation and line length
-               #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
                html = self.punctuation_unwrap(length, html, 'html')
-               #check any remaining hyphens, but only unwrap if there is a match
-               dehyphenator = Dehyphenator()
-               html = dehyphenator(html,'html_cleanup', length)
-           else:
-               # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
-               self.log("Cleaning up hyphenation")
-               dehyphenator = Dehyphenator()
-               html = dehyphenator(html,'html_cleanup', length)
-               self.log("Done dehyphenating")

-       # delete soft hyphens
-       html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+       if getattr(self.extra_opts, 'dehyphenate', False):
+           # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+           self.log.debug("Fixing hyphenated content")
+           dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
+           html = dehyphenator(html,'html_cleanup', length)
+           html = dehyphenator(html, 'individual_words', length)

        # If still no sections after unwrapping mark split points on lines with no punctuation
-       if self.html_preprocess_sections < self.min_chapters:
+       if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
-           self.log("Looking for more split points based on punctuation,"
+           self.log.debug("Looking for more split points based on punctuation,"
                    " currently have " + unicode(self.html_preprocess_sections))
            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)

+       if getattr(self.extra_opts, 'renumber_headings', False):
            # search for places where a first or second level heading is immediately followed by another
            # top level heading. demote the second heading to h3 to prevent splitting between chapter
            # headings and titles, images, etc
            doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
            html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)

-       # put back non-breaking spaces in empty paragraphs to preserve original formatting
-       html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
+       if getattr(self.extra_opts, 'format_scene_breaks', False):
            # Center separator lines
-           html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+           html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
+           if not self.blanks_deleted:
+               html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+           html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)

+       if self.deleted_nbsps:
+           # put back non-breaking spaces in empty paragraphs to preserve original formatting
+           html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)

        return html
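Taken together, the new __call__ only touches a flow when the relevant extra_opts flags are set, so the heuristic pass is driven entirely by the options object handed to the constructor. A rough sketch of exercising the pass outside the pipeline, under stated assumptions (the stub options class and the log import are stand-ins; inside calibre the plumber supplies the real conversion options and log):

    # A sketch, not pipeline code: drive HeuristicProcessor directly on an HTML flow.
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from calibre.utils.logging import default_log   # assumed: any calibre Log object works here

    class StubOptions(object):
        # Only the attributes __call__ reads via getattr(self.extra_opts, ...); values are assumptions.
        enable_heuristics = True
        unwrap_lines = True
        markup_chapter_headings = True
        italicize_common_cases = True
        fix_indents = True
        delete_blank_paragraphs = True
        format_scene_breaks = True
        dehyphenate = True
        renumber_headings = True
        html_unwrap_factor = 0.4
        verbose = 0

    raw_html = open('flow.xhtml', 'rb').read().decode('utf-8')   # hypothetical input flow
    processor = HeuristicProcessor(StubOptions(), log=default_log)
    cleaned = processor(raw_html)   # flows under ~50 words are returned unchanged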
@@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path, as_unicode
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor

class Link(object):
    '''
@@ -296,7 +295,7 @@ class HTMLInput(InputFormatPlugin):
            return oeb

        from calibre.ebooks.conversion.plumber import create_oebbook
-       return create_oebbook(log, stream.name, opts, self,
+       return create_oebbook(log, stream.name, opts,
                encoding=opts.input_encoding)

    def is_case_sensitive(self, path):
@@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
                self.log.exception('Failed to read CSS file: %r'%link)
                return (None, None)
            return (None, raw)
-
-   def preprocess_html(self, options, html):
-       self.options = options
-       preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-       return preprocessor(html)
@@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
+from calibre.ebooks.conversion.utils import HeuristicProcessor


class LITInput(InputFormatPlugin):
@@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin):
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        self.log = log
-       return create_oebbook(log, stream, options, self, reader=LitReader)
+       return create_oebbook(log, stream, options, reader=LitReader)

    def postprocess_book(self, oeb, opts, log):
        from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
@@ -39,10 +39,13 @@ class LITInput(InputFormatPlugin):
        body = body[0]
        if len(body) == 1 and body[0].tag == XHTML('pre'):
            pre = body[0]
-           from calibre.ebooks.txt.processor import convert_basic
+           from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+               separate_paragraphs_single_line
            from lxml import etree
            import copy
-           html = convert_basic(pre.text).replace('<html>',
+           html = separate_paragraphs_single_line(pre.text)
+           html = preserve_spaces(html)
+           html = convert_basic(html).replace('<html>',
                '<html xmlns="%s">'%XHTML_NS)
            root = etree.fromstring(html)
            body = XPath('//h:body')(root)
@@ -51,10 +54,3 @@ class LITInput(InputFormatPlugin):
            for elem in body:
                ne = copy.deepcopy(elem)
                pre.append(ne)
-
-
-   def preprocess_html(self, options, html):
-       self.options = options
-       preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-       return preprocessor(html)
@@ -12,7 +12,6 @@ from copy import deepcopy
from lxml import etree

from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type

class Canvas(etree.XSLTExtension):
@@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
        f.write(result)
        styles.write()
        return os.path.abspath('content.opf')
-
-   def preprocess_html(self, options, html):
-       self.options = options
-       preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-       return preprocessor(html)
@@ -39,11 +39,3 @@ class MOBIInput(InputFormatPlugin):
            accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
        return mr.created_opf_path
-
-   def preprocess_html(self, options, html):
-       # search for places where a first or second level heading is immediately followed by another
-       # top level heading. demote the second heading to h3 to prevent splitting between chapter
-       # headings and titles, images, etc
-       doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-       html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
-       return html
@@ -9,7 +9,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
-from calibre.ebooks.conversion.utils import PreProcessor

class PDBInput(InputFormatPlugin):

@@ -32,8 +31,3 @@ class PDBInput(InputFormatPlugin):
        opf = reader.extract_content(os.getcwd())

        return opf
-
-   def preprocess_html(self, options, html):
-       self.options = options
-       preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-       return preprocessor(html)
@@ -7,7 +7,6 @@ import os, glob, re, textwrap
from lxml import etree

from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor

border_style_map = {
        'single' : 'solid',
@@ -319,13 +318,9 @@ class RTFInput(InputFormatPlugin):
        res = transform.tostring(result)
        res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
        # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
-       if not getattr(self.opts, 'remove_paragraph_spacing', False):
        res = re.sub('\s*<body>', '<body>', res)
        res = re.sub('(?<=\n)\n{2}',
                u'<p>\u00a0</p>\n'.encode('utf-8'), res)
-       if self.opts.preprocess_html:
-           preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
-           res = preprocessor(res.decode('utf-8')).encode('utf-8')
        f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
@@ -41,7 +41,7 @@ class SNBInput(InputFormatPlugin):
            raise ValueError("Invalid SNB file")
        log.debug("Handle meta data ...")
        from calibre.ebooks.conversion.plumber import create_oebbook
-       oeb = create_oebbook(log, None, options, self,
+       oeb = create_oebbook(log, None, options,
                encoding=options.input_encoding, populate=False)
        meta = snbFile.GetFileStream('snbf/book.snbf')
        if meta != None:
@@ -1,58 +0,0 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

import re

from calibre import prepare_string_for_xml

class TXTHeuristicProcessor(object):

    def __init__(self):
        self.ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
            'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
        self.ITALICIZE_STYLE_PATS = [
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>[^<>]+?)/',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
            r'(?msu)_/(?P<words>[^<>]+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
            r'(?msu)/:(?P<words>[^<>]+?):/',
            r'(?msu)\|:(?P<words>.+?):\|',
        ]

    def process_paragraph(self, paragraph):
        for word in self.ITALICIZE_WORDS:
            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
        for pat in self.ITALICIZE_STYLE_PATS:
            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
        return paragraph

    def convert(self, txt, title='', epub_split_size_kb=0):
        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
        txt = clean_txt(txt)
        txt = split_txt(txt, epub_split_size_kb)

        processed = []
        for line in txt.split('\n\n'):
            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))

        txt = u'\n'.join(processed)
        txt = re.sub('[ ]{2,}', ' ', txt)
        html = HTML_TEMPLATE % (title, txt)

        from calibre.ebooks.conversion.utils import PreProcessor
        pp = PreProcessor()
        html = pp.markup_chapters(html, pp.get_word_count(html), False)

        return html
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-   convert_heuristic, normalize_line_endings, convert_textile
+   normalize_line_endings, convert_textile
from calibre import _ent_pat, xml_entity_to_unicode

class TXTInput(InputFormatPlugin):
@@ -106,7 +106,7 @@ class TXTInput(InputFormatPlugin):
            log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Dehyphenate
-       dehyphenator = Dehyphenator()
+       dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
        txt = dehyphenator(txt,'txt', length)

        # We don't check for block because the processor assumes block.
@@ -118,24 +118,24 @@ class TXTInput(InputFormatPlugin):
            txt = separate_paragraphs_print_formatted(txt)

        if options.paragraph_type == 'unformatted':
-           from calibre.ebooks.conversion.utils import PreProcessor
+           from calibre.ebooks.conversion.utils import HeuristicProcessor
            # get length

            # unwrap lines based on punctuation
-           preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
+           preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')

        flow_size = getattr(options, 'flow_size', 0)

-       if options.formatting_type == 'heuristic':
-           html = convert_heuristic(txt, epub_split_size_kb=flow_size)
-       else:
            html = convert_basic(txt, epub_split_size_kb=flow_size)

-       # Dehyphenate in cleanup mode for missed txt and markdown conversion
-       dehyphenator = Dehyphenator()
-       html = dehyphenator(html,'txt_cleanup', length)
-       html = dehyphenator(html,'html_cleanup', length)
+       if options.formatting_type == 'heuristic':
+           setattr(options, 'enable_heuristics', True)
+           setattr(options, 'markup_chapter_headings', True)
+           setattr(options, 'italicize_common_cases', True)
+           setattr(options, 'fix_indents', True)
+           setattr(options, 'delete_blank_paragraphs', True)
+           setattr(options, 'format_scene_breaks', True)
+           setattr(options, 'dehyphenate', True)

        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
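With convert_heuristic gone, the TXT plugin now expresses its old 'heuristic' formatting mode simply by switching on the shared heuristic options and letting the downstream HTML input stage do the work. The setattr block above is equivalent to applying a preset over the options namespace, as in this small illustrative sketch (the option names are exactly the ones enabled above; the helper itself is hypothetical):

    # Sketch: the TXT 'heuristic' formatting type as a preset over the shared heuristic options.
    HEURISTIC_PRESET = (
        'enable_heuristics', 'markup_chapter_headings', 'italicize_common_cases',
        'fix_indents', 'delete_blank_paragraphs', 'format_scene_breaks', 'dehyphenate',
    )

    def apply_heuristic_preset(options):
        # options is the conversion options namespace passed to TXTInput.convert()
        for name in HEURISTIC_PRESET:
            setattr(options, name, True)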
@@ -12,7 +12,6 @@ import os, re

from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
from calibre.ebooks.conversion.preprocess import DocAnalysis
from calibre.utils.cleantext import clean_ascii_chars

@@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):

    return HTML_TEMPLATE % (title, u'\n'.join(lines))

-def convert_heuristic(txt, title='', epub_split_size_kb=0):
-   tp = TXTHeuristicProcessor()
-   return tp.convert(txt, title, epub_split_size_kb)
-
def convert_markdown(txt, title='', disable_toc=False):
    from calibre.ebooks.markdown import markdown
    md = markdown.Markdown(
@@ -11,6 +11,8 @@ from calibre.gui2.convert.single import Config, sort_formats_by_preference, \
from calibre.customize.ui import available_output_formats
from calibre.gui2 import ResizableDialog
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
+from calibre.gui2.convert.heuristics import HeuristicsWidget
+from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
from calibre.gui2.convert.page_setup import PageSetupWidget
from calibre.gui2.convert.structure_detection import StructureDetectionWidget
from calibre.gui2.convert.toc import TOCWidget
@@ -69,6 +71,8 @@ class BulkConfig(Config):

        self.setWindowTitle(_('Bulk Convert'))
        lf = widget_factory(LookAndFeelWidget)
+       hw = widget_factory(HeuristicsWidget)
+       sr = widget_factory(SearchAndReplaceWidget)
        ps = widget_factory(PageSetupWidget)
        sd = widget_factory(StructureDetectionWidget)
        toc = widget_factory(TOCWidget)
@@ -90,7 +94,7 @@ class BulkConfig(Config):
            if not c: break
            self.stack.removeWidget(c)

-       widgets = [lf, ps, sd, toc]
+       widgets = [lf, hw, sr, ps, sd, toc]
        if output_widget is not None:
            widgets.append(output_widget)
        for w in widgets:
src/calibre/gui2/convert/heuristics.py (new file, 72 lines)
@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

from PyQt4.Qt import Qt

from calibre.gui2.convert.heuristics_ui import Ui_Form
from calibre.gui2.convert import Widget

class HeuristicsWidget(Widget, Ui_Form):

    TITLE = _('Heuristic Processing')
    HELP = _('Modify the document text and structure using common patterns.')
    COMMIT_NAME = 'heuristics'

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        Widget.__init__(self, parent,
                ['enable_heuristics', 'markup_chapter_headings',
                 'italicize_common_cases', 'fix_indents',
                 'html_unwrap_factor', 'unwrap_lines',
                 'delete_blank_paragraphs', 'format_scene_breaks',
                 'dehyphenate', 'renumber_headings']
                )
        self.db, self.book_id = db, book_id
        self.initialize_options(get_option, get_help, db, book_id)

        self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics)
        self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap)

        self.enable_heuristics(self.opt_enable_heuristics.checkState())

    def break_cycles(self):
        Widget.break_cycles(self)

        try:
            self.opt_enable_heuristics.stateChanged.disconnect()
            self.opt_unwrap_lines.stateChanged.disconnect()
        except:
            pass

    def set_value_handler(self, g, val):
        if val is None and g is self.opt_html_unwrap_factor:
            g.setValue(0.0)
            return True

    def enable_heuristics(self, state):
        if state == Qt.Checked:
            state = True
        else:
            state = False
        self.opt_markup_chapter_headings.setEnabled(state)
        self.opt_italicize_common_cases.setEnabled(state)
        self.opt_fix_indents.setEnabled(state)
        self.opt_delete_blank_paragraphs.setEnabled(state)
        self.opt_format_scene_breaks.setEnabled(state)
        self.opt_dehyphenate.setEnabled(state)
        self.opt_renumber_headings.setEnabled(state)

        self.opt_unwrap_lines.setEnabled(state)
        if state and self.opt_unwrap_lines.checkState() == Qt.Checked:
            self.opt_html_unwrap_factor.setEnabled(True)
        else:
            self.opt_html_unwrap_factor.setEnabled(False)

    def enable_unwrap(self, state):
        if state == Qt.Checked:
            state = True
        else:
            state = False
        self.opt_html_unwrap_factor.setEnabled(state)
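One detail worth noting in the widget above: Qt delivers stateChanged as an integer (Qt.Unchecked/Qt.Checked), so enable_heuristics first collapses it to a bool before toggling the dependent controls, and keeps the unwrap factor spin box enabled only while both "enable heuristics" and "unwrap lines" are checked. A terser equivalent of that method, shown purely as an illustration and not as the shipped code:

    def enable_heuristics(self, state):
        # state comes from QCheckBox.stateChanged / checkState()
        enabled = (state == Qt.Checked)
        for w in (self.opt_markup_chapter_headings, self.opt_italicize_common_cases,
                  self.opt_fix_indents, self.opt_delete_blank_paragraphs,
                  self.opt_format_scene_breaks, self.opt_dehyphenate,
                  self.opt_renumber_headings, self.opt_unwrap_lines):
            w.setEnabled(enabled)
        # the unwrap factor only matters when line unwrapping is both allowed and selected
        self.opt_html_unwrap_factor.setEnabled(
                enabled and self.opt_unwrap_lines.checkState() == Qt.Checked)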
src/calibre/gui2/convert/heuristics.ui (new file, 178 lines)
@@ -0,0 +1,178 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
 <class>Form</class>
 <widget class="QWidget" name="Form">
  <property name="geometry">
   <rect>
    <x>0</x>
    <y>0</y>
    <width>938</width>
    <height>470</height>
   </rect>
  </property>
  <property name="windowTitle">
   <string>Form</string>
  </property>
  <layout class="QVBoxLayout" name="verticalLayout">
   <item>
    <widget class="QCheckBox" name="opt_enable_heuristics">
     <property name="text">
      <string>&amp;Preprocess input file to possibly improve structure detection</string>
     </property>
    </widget>
   </item>
   <item>
    <widget class="QGroupBox" name="groupBox">
     <property name="title">
      <string>Heuristic Processing</string>
     </property>
     <layout class="QGridLayout" name="gridLayout">
      <item row="0" column="0" colspan="2">
       <widget class="QCheckBox" name="opt_unwrap_lines">
        <property name="text">
         <string>Unwrap lines</string>
        </property>
       </widget>
      </item>
      <item row="1" column="1">
       <widget class="QLabel" name="huf_label">
        <property name="text">
         <string>Line &amp;un-wrap factor during preprocess:</string>
        </property>
        <property name="buddy">
         <cstring>opt_html_unwrap_factor</cstring>
        </property>
       </widget>
      </item>
      <item row="1" column="2">
       <widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
        <property name="toolTip">
         <string/>
        </property>
        <property name="maximum">
         <double>1.000000000000000</double>
        </property>
        <property name="singleStep">
         <double>0.050000000000000</double>
        </property>
        <property name="value">
         <double>0.400000000000000</double>
        </property>
       </widget>
      </item>
      <item row="1" column="3">
       <spacer name="horizontalSpacer_2">
        <property name="orientation">
         <enum>Qt::Horizontal</enum>
        </property>
        <property name="sizeHint" stdset="0">
         <size>
          <width>40</width>
          <height>20</height>
         </size>
        </property>
       </spacer>
      </item>
      <item row="2" column="0" colspan="4">
       <widget class="QCheckBox" name="opt_markup_chapter_headings">
        <property name="text">
         <string>Detect and markup unformatted chapter headings and sub headings</string>
        </property>
       </widget>
      </item>
      <item row="3" column="0" colspan="4">
       <widget class="QCheckBox" name="opt_renumber_headings">
        <property name="text">
         <string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
        </property>
       </widget>
      </item>
      <item row="4" column="0" colspan="2">
       <widget class="QCheckBox" name="opt_delete_blank_paragraphs">
        <property name="text">
         <string>Delete blank lines between paragraphs</string>
        </property>
       </widget>
      </item>
      <item row="5" column="0" colspan="3">
       <widget class="QCheckBox" name="opt_format_scene_breaks">
        <property name="text">
         <string>Ensure scene breaks are consistently formatted</string>
        </property>
       </widget>
      </item>
      <item row="6" column="0" colspan="2">
       <widget class="QCheckBox" name="opt_dehyphenate">
        <property name="text">
         <string>Remove unnecessary hyphens</string>
        </property>
       </widget>
      </item>
      <item row="7" column="0" colspan="2">
       <widget class="QCheckBox" name="opt_italicize_common_cases">
        <property name="text">
         <string>Italicize common words and patterns</string>
        </property>
       </widget>
      </item>
      <item row="8" column="0" colspan="2">
       <widget class="QCheckBox" name="opt_fix_indents">
        <property name="text">
         <string>Replace entity indents with CSS indents</string>
        </property>
       </widget>
      </item>
      <item row="9" column="0" colspan="2">
       <spacer name="verticalSpacer">
        <property name="orientation">
         <enum>Qt::Vertical</enum>
        </property>
        <property name="sizeHint" stdset="0">
         <size>
          <width>131</width>
          <height>35</height>
         </size>
        </property>
       </spacer>
      </item>
     </layout>
    </widget>
   </item>
  </layout>
 </widget>
 <resources/>
 <connections>
  <connection>
   <sender>opt_enable_heuristics</sender>
   <signal>toggled(bool)</signal>
   <receiver>opt_html_unwrap_factor</receiver>
   <slot>setEnabled(bool)</slot>
   <hints>
    <hint type="sourcelabel">
     <x>328</x>
     <y>87</y>
    </hint>
    <hint type="destinationlabel">
     <x>481</x>
     <y>113</y>
    </hint>
   </hints>
  </connection>
  <connection>
   <sender>opt_enable_heuristics</sender>
   <signal>toggled(bool)</signal>
   <receiver>huf_label</receiver>
   <slot>setEnabled(bool)</slot>
   <hints>
    <hint type="sourcelabel">
     <x>295</x>
     <y>88</y>
    </hint>
    <hint type="destinationlabel">
     <x>291</x>
     <y>105</y>
    </hint>
   </hints>
  </connection>
 </connections>
</ui>
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'

from calibre.gui2.convert.pdb_output_ui import Ui_Form
from calibre.gui2.convert import Widget
-from calibre.ebooks.pdb import FORMAT_WRITERS
-from calibre.gui2.widgets import BasicComboModel

format_model = None

@@ -21,17 +19,8 @@ class PluginWidget(Widget, Ui_Form):
    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
        self.db, self.book_id = db, book_id
+
+       for x in get_option('format').option.choices:
+           self.opt_format.addItem(x)
+
        self.initialize_options(get_option, get_help, db, book_id)
-
-       default = self.opt_format.currentText()
-
-       global format_model
-       if format_model is None:
-           format_model = BasicComboModel(FORMAT_WRITERS.keys())
-       self.format_model = format_model
-       self.opt_format.setModel(self.format_model)
-
-       default_index = self.opt_format.findText(default)
-       format_index = self.opt_format.findText('doc')
-       self.opt_format.setCurrentIndex(default_index if default_index != -1 else format_index if format_index != -1 else 0)
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'

from calibre.gui2.convert.pdf_output_ui import Ui_Form
from calibre.gui2.convert import Widget
-from calibre.ebooks.pdf.pageoptions import PAPER_SIZES, ORIENTATIONS
-from calibre.gui2.widgets import BasicComboModel

paper_size_model = None
orientation_model = None
@@ -23,28 +21,11 @@ class PluginWidget(Widget, Ui_Form):
        Widget.__init__(self, parent, ['paper_size',
            'orientation', 'preserve_cover_aspect_ratio'])
        self.db, self.book_id = db, book_id
+
+       for x in get_option('paper_size').option.choices:
+           self.opt_paper_size.addItem(x)
+       for x in get_option('orientation').option.choices:
+           self.opt_orientation.addItem(x)
+
        self.initialize_options(get_option, get_help, db, book_id)
-
-       default_paper_size = self.opt_paper_size.currentText()
-       default_orientation = self.opt_orientation.currentText()
-
-       global paper_size_model
-       if paper_size_model is None:
-           paper_size_model = BasicComboModel(PAPER_SIZES.keys())
-       self.paper_size_model = paper_size_model
-       self.opt_paper_size.setModel(self.paper_size_model)
-
-       default_paper_size_index = self.opt_paper_size.findText(default_paper_size)
-       letter_index = self.opt_paper_size.findText('letter')
-       self.opt_paper_size.setCurrentIndex(default_paper_size_index if default_paper_size_index != -1 else letter_index if letter_index != -1 else 0)
-
-       global orientation_model
-       if orientation_model is None:
-           orientation_model = BasicComboModel(ORIENTATIONS.keys())
-       self.orientation_model = orientation_model
-       self.opt_orientation.setModel(self.orientation_model)
-
-       default_orientation_index = self.opt_orientation.findText(default_orientation)
-       orientation_index = self.opt_orientation.findText('portrait')
-       self.opt_orientation.setCurrentIndex(default_orientation_index if default_orientation_index != -1 else orientation_index if orientation_index != -1 else 0)
src/calibre/gui2/convert/search_and_replace.py (new file, 54 lines)
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

import re

from calibre.gui2.convert.search_and_replace_ui import Ui_Form
from calibre.gui2.convert import Widget
from calibre.gui2 import error_dialog

class SearchAndReplaceWidget(Widget, Ui_Form):

    TITLE = _('Search &\nReplace')
    HELP = _('Modify the document text and structure using user defined patterns.')
    COMMIT_NAME = 'search_and_replace'

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        Widget.__init__(self, parent,
                ['sr1_search', 'sr1_replace',
                 'sr2_search', 'sr2_replace',
                 'sr3_search', 'sr3_replace']
                )
        self.db, self.book_id = db, book_id
        self.initialize_options(get_option, get_help, db, book_id)
        self.opt_sr1_search.set_msg(_('Search Regular Expression'))
        self.opt_sr1_search.set_book_id(book_id)
        self.opt_sr1_search.set_db(db)
        self.opt_sr2_search.set_msg(_('Search Regular Expression'))
        self.opt_sr2_search.set_book_id(book_id)
        self.opt_sr2_search.set_db(db)
        self.opt_sr3_search.set_msg(_('Search Regular Expression'))
        self.opt_sr3_search.set_book_id(book_id)
        self.opt_sr3_search.set_db(db)

    def break_cycles(self):
        Widget.break_cycles(self)

        self.opt_sr1_search.break_cycles()
        self.opt_sr2_search.break_cycles()
        self.opt_sr3_search.break_cycles()

    def pre_commit_check(self):
        for x in ('sr1_search', 'sr2_search', 'sr3_search'):
            x = getattr(self, 'opt_'+x)
            try:
                pat = unicode(x.regex)
                re.compile(pat)
            except Exception, err:
                error_dialog(self, _('Invalid regular expression'),
                        _('Invalid regular expression: %s')%err).exec_()
                return False
        return True
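The widget above only validates the three expressions before commit; applying them to the book happens later in the conversion pipeline. A minimal sketch of what applying the sr*_search/sr*_replace option pairs to a flow could look like, as an illustration of the option semantics rather than the pipeline's actual implementation:

    import re

    def apply_search_replace(html, opts):
        # opts carries the six options registered by SearchAndReplaceWidget.
        for n in (1, 2, 3):
            pattern = getattr(opts, 'sr%d_search' % n, None)
            if not pattern:
                continue          # empty slots are simply skipped
            replacement = getattr(opts, 'sr%d_replace' % n, '') or ''
            html = re.sub(pattern, replacement, html)
        return html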
src/calibre/gui2/convert/search_and_replace.ui (new file, 191 lines)
@@ -0,0 +1,191 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
 <class>Form</class>
 <widget class="QWidget" name="Form">
  <property name="geometry">
   <rect>
    <x>0</x>
    <y>0</y>
    <width>198</width>
    <height>350</height>
   </rect>
  </property>
  <property name="sizePolicy">
   <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
    <horstretch>0</horstretch>
    <verstretch>0</verstretch>
   </sizepolicy>
  </property>
  <property name="windowTitle">
   <string>Form</string>
  </property>
  <layout class="QGridLayout" name="gridLayout_4">
   <property name="sizeConstraint">
    <enum>QLayout::SetDefaultConstraint</enum>
   </property>
   <item row="0" column="0">
    <widget class="QGroupBox" name="groupBox">
     <property name="sizePolicy">
      <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
       <horstretch>0</horstretch>
       <verstretch>0</verstretch>
      </sizepolicy>
     </property>
     <property name="title">
      <string>1.</string>
     </property>
     <layout class="QGridLayout" name="gridLayout_2">
      <property name="sizeConstraint">
       <enum>QLayout::SetMinimumSize</enum>
      </property>
      <item row="0" column="0">
       <widget class="RegexEdit" name="opt_sr1_search" native="true">
        <property name="sizePolicy">
         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
          <horstretch>0</horstretch>
          <verstretch>0</verstretch>
         </sizepolicy>
        </property>
       </widget>
      </item>
      <item row="1" column="0">
       <widget class="QLabel" name="label_4">
        <property name="sizePolicy">
         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
          <horstretch>0</horstretch>
          <verstretch>0</verstretch>
         </sizepolicy>
        </property>
        <property name="text">
         <string>Replacement Text</string>
        </property>
       </widget>
      </item>
      <item row="2" column="0">
       <widget class="QLineEdit" name="opt_sr1_replace">
        <property name="sizePolicy">
         <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
          <horstretch>0</horstretch>
          <verstretch>0</verstretch>
         </sizepolicy>
        </property>
       </widget>
      </item>
     </layout>
    </widget>
   </item>
   <item row="1" column="0">
    <widget class="QGroupBox" name="groupBox_2">
     <property name="sizePolicy">
      <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
       <horstretch>0</horstretch>
       <verstretch>0</verstretch>
      </sizepolicy>
     </property>
     <property name="title">
      <string>2.</string>
     </property>
     <layout class="QGridLayout" name="gridLayout">
      <property name="sizeConstraint">
       <enum>QLayout::SetMinimumSize</enum>
      </property>
      <item row="0" column="0">
       <widget class="RegexEdit" name="opt_sr2_search" native="true">
        <property name="sizePolicy">
         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
          <horstretch>0</horstretch>
          <verstretch>0</verstretch>
         </sizepolicy>
        </property>
       </widget>
      </item>
      <item row="1" column="0">
       <widget class="QLabel" name="label_5">
        <property name="sizePolicy">
         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
          <horstretch>0</horstretch>
          <verstretch>0</verstretch>
         </sizepolicy>
        </property>
        <property name="text">
         <string>Replacement Text</string>
        </property>
       </widget>
      </item>
      <item row="2" column="0">
       <widget class="QLineEdit" name="opt_sr2_replace">
        <property name="sizePolicy">
         <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
          <horstretch>0</horstretch>
          <verstretch>0</verstretch>
         </sizepolicy>
        </property>
       </widget>
      </item>
     </layout>
    </widget>
   </item>
   <item row="2" column="0">
    <widget class="QGroupBox" name="groupBox_3">
     <property name="sizePolicy">
      <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
       <horstretch>0</horstretch>
       <verstretch>0</verstretch>
      </sizepolicy>
     </property>
     <property name="title">
      <string>3.</string>
     </property>
     <layout class="QGridLayout" name="gridLayout_3">
      <property name="sizeConstraint">
       <enum>QLayout::SetMinimumSize</enum>
      </property>
      <item row="0" column="0">
       <widget class="RegexEdit" name="opt_sr3_search" native="true">
        <property name="sizePolicy">
         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
          <horstretch>0</horstretch>
          <verstretch>0</verstretch>
         </sizepolicy>
        </property>
       </widget>
      </item>
      <item row="1" column="0">
       <widget class="QLabel" name="label_6">
        <property name="sizePolicy">
         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
          <horstretch>0</horstretch>
          <verstretch>0</verstretch>
         </sizepolicy>
        </property>
        <property name="text">
         <string>Replacement Text</string>
        </property>
       </widget>
      </item>
      <item row="2" column="0">
       <widget class="QLineEdit" name="opt_sr3_replace">
        <property name="sizePolicy">
         <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
          <horstretch>0</horstretch>
          <verstretch>0</verstretch>
         </sizepolicy>
        </property>
       </widget>
      </item>
     </layout>
    </widget>
   </item>
  </layout>
 </widget>
 <customwidgets>
  <customwidget>
   <class>RegexEdit</class>
   <extends>QWidget</extends>
   <header>regex_builder.h</header>
   <container>1</container>
  </customwidget>
 </customwidgets>
 <resources/>
 <connections/>
</ui>
@@ -16,6 +16,8 @@ from calibre.ebooks.conversion.config import GuiRecommendations, save_specifics,
from calibre.gui2.convert.single_ui import Ui_Dialog
from calibre.gui2.convert.metadata import MetadataWidget
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
+from calibre.gui2.convert.heuristics import HeuristicsWidget
+from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
from calibre.gui2.convert.page_setup import PageSetupWidget
from calibre.gui2.convert.structure_detection import StructureDetectionWidget
from calibre.gui2.convert.toc import TOCWidget
@@ -170,6 +172,8 @@ class Config(ResizableDialog, Ui_Dialog):
        self.mw = widget_factory(MetadataWidget)
        self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text()))
        lf = widget_factory(LookAndFeelWidget)
+       hw = widget_factory(HeuristicsWidget)
+       sr = widget_factory(SearchAndReplaceWidget)
        ps = widget_factory(PageSetupWidget)
        sd = widget_factory(StructureDetectionWidget)
        toc = widget_factory(TOCWidget)
@@ -203,7 +207,7 @@ class Config(ResizableDialog, Ui_Dialog):
            if not c: break
            self.stack.removeWidget(c)

-       widgets = [self.mw, lf, ps, sd, toc]
+       widgets = [self.mw, lf, hw, sr, ps, sd, toc]
        if input_widget is not None:
            widgets.append(input_widget)
        if output_widget is not None:
@@ -6,8 +6,6 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

-import re
-
from calibre.gui2.convert.structure_detection_ui import Ui_Form
from calibre.gui2.convert import Widget
from calibre.gui2 import error_dialog
@@ -24,12 +22,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
        Widget.__init__(self, parent,
                ['chapter', 'chapter_mark',
                 'remove_first_image',
-                'insert_metadata', 'page_breaks_before',
-                'preprocess_html', 'remove_header', 'header_regex',
-                'remove_footer', 'footer_regex','html_unwrap_factor']
+                'insert_metadata', 'page_breaks_before']
                )
-       self.opt_html_unwrap_factor.setEnabled(False)
-       self.huf_label.setEnabled(False)
        self.db, self.book_id = db, book_id
        for x in ('pagebreak', 'rule', 'both', 'none'):
            self.opt_chapter_mark.addItem(x)
@@ -37,28 +31,11 @@ class StructureDetectionWidget(Widget, Ui_Form):
        self.opt_chapter.set_msg(_('Detect chapters at (XPath expression):'))
        self.opt_page_breaks_before.set_msg(_('Insert page breaks before '
            '(XPath expression):'))
-       self.opt_header_regex.set_msg(_('Header regular expression:'))
-       self.opt_header_regex.set_book_id(book_id)
-       self.opt_header_regex.set_db(db)
-       self.opt_footer_regex.set_msg(_('Footer regular expression:'))
-       self.opt_footer_regex.set_book_id(book_id)
-       self.opt_footer_regex.set_db(db)

    def break_cycles(self):
        Widget.break_cycles(self)
-       self.opt_header_regex.break_cycles()
-       self.opt_footer_regex.break_cycles()

    def pre_commit_check(self):
-       for x in ('header_regex', 'footer_regex'):
-           x = getattr(self, 'opt_'+x)
-           try:
-               pat = unicode(x.regex)
-               re.compile(pat)
-           except Exception, err:
-               error_dialog(self, _('Invalid regular expression'),
-                       _('Invalid regular expression: %s')%err).exec_()
-               return False
        for x in ('chapter', 'page_breaks_before'):
            x = getattr(self, 'opt_'+x)
            if not x.check():
@@ -66,8 +43,3 @@ class StructureDetectionWidget(Widget, Ui_Form):
                    _('The XPath expression %s is invalid.')%x.text).exec_()
                return False
        return True
-
-   def set_value_handler(self, g, val):
-       if val is None and g is self.opt_html_unwrap_factor:
-           g.setValue(0.0)
-       return True
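After this change, pre_commit_check only has to vet the two XPath fields; the regular expression checks moved to the new Search & Replace widget. The x.check() call is provided by calibre's XPathEdit widget. A standalone validation of the same kind could look like the following sketch, which uses lxml directly and is an assumption about what such a check involves rather than the widget's actual code:

    from lxml import etree

    def is_valid_xpath(expr):
        # Acceptable if lxml can compile the expression; 'h' is the XHTML prefix used elsewhere in calibre.
        if not expr:
            return True
        try:
            etree.XPath(expr, namespaces={'h': 'http://www.w3.org/1999/xhtml'})
        except etree.XPathSyntaxError:
            return False
        return True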
@@ -14,10 +14,10 @@
 <string>Form</string>
 </property>
 <layout class="QGridLayout" name="gridLayout">
-<item row="0" column="1" colspan="2">
+<item row="0" column="0" colspan="3">
 <widget class="XPathEdit" name="opt_chapter" native="true"/>
 </item>
-<item row="1" column="0" colspan="2">
+<item row="1" column="0">
 <widget class="QLabel" name="label">
 <property name="text">
 <string>Chapter &mark:</string>
@@ -27,7 +27,7 @@
 </property>
 </widget>
 </item>
-<item row="1" column="2">
+<item row="1" column="1">
 <widget class="QComboBox" name="opt_chapter_mark">
 <property name="minimumContentsLength">
 <number>20</number>
@@ -41,17 +41,17 @@
 </property>
 </widget>
 </item>
-<item row="5" column="0" colspan="2">
+<item row="3" column="0" colspan="2">
 <widget class="QCheckBox" name="opt_insert_metadata">
 <property name="text">
 <string>Insert &metadata as page at start of book</string>
 </property>
 </widget>
 </item>
-<item row="11" column="0" colspan="3">
+<item row="5" column="0" colspan="3">
 <widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
 </item>
-<item row="12" column="0" colspan="3">
+<item row="6" column="0" colspan="3">
 <spacer name="verticalSpacer">
 <property name="orientation">
 <enum>Qt::Vertical</enum>
@@ -64,53 +64,7 @@
 </property>
 </spacer>
 </item>
-<item row="8" column="0" colspan="2">
-<widget class="QCheckBox" name="opt_remove_footer">
-<property name="text">
-<string>Remove F&ooter</string>
-</property>
-</widget>
-</item>
-<item row="6" column="0" colspan="2">
-<widget class="QCheckBox" name="opt_remove_header">
-<property name="text">
-<string>Remove H&eader</string>
-</property>
-</widget>
-</item>
-<item row="7" column="0" colspan="3">
-<widget class="RegexEdit" name="opt_header_regex" native="true"/>
-</item>
-<item row="9" column="0" colspan="3">
-<widget class="RegexEdit" name="opt_footer_regex" native="true"/>
-</item>
-<item row="4" column="1">
-<widget class="QLabel" name="huf_label">
-<property name="text">
-<string>Line &un-wrap factor during preprocess:</string>
-</property>
-<property name="buddy">
-<cstring>opt_html_unwrap_factor</cstring>
-</property>
-</widget>
-</item>
-<item row="4" column="2">
-<widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
-<property name="toolTip">
-<string/>
-</property>
-<property name="maximum">
-<double>1.000000000000000</double>
-</property>
-<property name="singleStep">
-<double>0.050000000000000</double>
-</property>
-<property name="value">
-<double>0.400000000000000</double>
-</property>
-</widget>
-</item>
-<item row="4" column="0">
+<item row="1" column="2">
 <spacer name="horizontalSpacer">
 <property name="orientation">
 <enum>Qt::Horizontal</enum>
@@ -123,13 +77,6 @@
 </property>
 </spacer>
 </item>
-<item row="3" column="0" colspan="2">
-<widget class="QCheckBox" name="opt_preprocess_html">
-<property name="text">
-<string>&Preprocess input file to possibly improve structure detection</string>
-</property>
-</widget>
-</item>
 </layout>
 </widget>
 <customwidgets>
@@ -139,46 +86,7 @@
 <header>convert/xpath_wizard.h</header>
 <container>1</container>
 </customwidget>
-<customwidget>
-<class>RegexEdit</class>
-<extends>QWidget</extends>
-<header>regex_builder.h</header>
-<container>1</container>
-</customwidget>
 </customwidgets>
 <resources/>
-<connections>
-<connection>
-<sender>opt_preprocess_html</sender>
-<signal>toggled(bool)</signal>
-<receiver>opt_html_unwrap_factor</receiver>
-<slot>setEnabled(bool)</slot>
-<hints>
-<hint type="sourcelabel">
-<x>328</x>
-<y>87</y>
-</hint>
-<hint type="destinationlabel">
-<x>481</x>
-<y>113</y>
-</hint>
-</hints>
-</connection>
-<connection>
-<sender>opt_preprocess_html</sender>
-<signal>toggled(bool)</signal>
-<receiver>huf_label</receiver>
-<slot>setEnabled(bool)</slot>
-<hints>
-<hint type="sourcelabel">
-<x>295</x>
-<y>88</y>
-</hint>
-<hint type="destinationlabel">
-<x>291</x>
-<y>105</y>
-</hint>
-</hints>
-</connection>
-</connections>
+<connections/>
 </ui>
@@ -4,10 +4,10 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+from PyQt4.Qt import Qt
+
 from calibre.gui2.convert.txt_output_ui import Ui_Form
 from calibre.gui2.convert import Widget
-from calibre.ebooks.txt.newlines import TxtNewlines
-from calibre.gui2.widgets import BasicComboModel
 
 newline_model = None
 
@@ -24,16 +24,26 @@ class PluginWidget(Widget, Ui_Form):
                 'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
                 'txt_output_encoding'])
         self.db, self.book_id = db, book_id
+        for x in get_option('newline').option.choices:
+            self.opt_newline.addItem(x)
         self.initialize_options(get_option, get_help, db, book_id)
 
-        default = self.opt_newline.currentText()
+        self.opt_markdown_format.stateChanged.connect(self.enable_markdown_format)
+        self.enable_markdown_format(self.opt_markdown_format.checkState())
 
-        global newline_model
-        if newline_model is None:
-            newline_model = BasicComboModel(TxtNewlines.NEWLINE_TYPES.keys())
-        self.newline_model = newline_model
-        self.opt_newline.setModel(self.newline_model)
+    def break_cycles(self):
+        Widget.break_cycles(self)
+        try:
+            self.opt_markdown_format.stateChanged.disconnect()
+        except:
+            pass
 
+    def enable_markdown_format(self, state):
+        if state == Qt.Checked:
+            state = True
+        else:
+            state = False
+        self.opt_keep_links.setEnabled(state)
+        self.opt_keep_image_references.setEnabled(state)
 
-        default_index = self.opt_newline.findText(default)
-        system_index = self.opt_newline.findText('system')
-        self.opt_newline.setCurrentIndex(default_index if default_index != -1 else system_index if system_index != -1 else 0)
@@ -6,8 +6,8 @@
 <rect>
 <x>0</x>
 <y>0</y>
-<width>422</width>
-<height>64</height>
+<width>434</width>
+<height>74</height>
 </rect>
 </property>
 <property name="windowTitle">
@@ -53,13 +53,13 @@
 <item row="0" column="1">
 <widget class="QToolButton" name="button">
 <property name="toolTip">
-<string>Use a wizard to help construct the XPath expression</string>
+<string>Use a wizard to help construct the Regular expression</string>
 </property>
 <property name="text">
 <string>...</string>
 </property>
 <property name="icon">
-<iconset resource="../../../../resources/images.qrc">
+<iconset>
 <normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
 </property>
 <property name="iconSize">
@@ -70,19 +70,6 @@
 </property>
 </widget>
 </item>
-<item row="0" column="2">
-<spacer name="horizontalSpacer">
-<property name="orientation">
-<enum>Qt::Horizontal</enum>
-</property>
-<property name="sizeHint" stdset="0">
-<size>
-<width>20</width>
-<height>20</height>
-</size>
-</property>
-</spacer>
-</item>
 </layout>
 </widget>
 <customwidgets>
@@ -12,6 +12,8 @@ from calibre.ebooks.conversion.plumber import Plumber
 from calibre.utils.logging import Log
 from calibre.gui2.preferences.conversion_ui import Ui_Form
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
+from calibre.gui2.convert.heuristics import HeuristicsWidget
+from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -82,7 +84,8 @@ class Base(ConfigWidgetBase, Ui_Form):
 class CommonOptions(Base):
 
     def load_conversion_widgets(self):
-        self.conversion_widgets = [LookAndFeelWidget, PageSetupWidget,
+        self.conversion_widgets = [LookAndFeelWidget, HeuristicsWidget,
+                SearchAndReplaceWidget, PageSetupWidget,
                 StructureDetectionWidget, TOCWidget]
 
 class InputOptions(Base):
@@ -311,32 +311,6 @@ class FontFamilyModel(QAbstractListModel):
     def index_of(self, family):
         return self.families.index(family.strip())
 
-class BasicComboModel(QAbstractListModel):
-
-    def __init__(self, items, *args):
-        QAbstractListModel.__init__(self, *args)
-        self.items = [i for i in items]
-        self.items.sort()
-
-    def rowCount(self, *args):
-        return len(self.items)
-
-    def data(self, index, role):
-        try:
-            item = self.items[index.row()]
-        except:
-            traceback.print_exc()
-            return NONE
-        if role == Qt.DisplayRole:
-            return QVariant(item)
-        if role == Qt.FontRole:
-            return QVariant(QFont(item))
-        return NONE
-
-    def index_of(self, item):
-        return self.items.index(item.strip())
-
-
 class BasicListItem(QListWidgetItem):
 
     def __init__(self, text, user_data=None):
@@ -255,6 +255,98 @@ you are producing are meant for a particular device type, choose the correspondi
 
 The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device.
 
+.. _heuristic-processing:
+
+Heuristic Processing
+---------------------
+
+Heuristic Processing provides a variety of functions that try to detect and correct
+common problems in poorly formatted input documents. Use these functions if your input document suffers
+from bad formatting. Because these functions rely on common patterns, be aware that in some cases an
+option may lead to worse results, so use with care. As an example, several of these options will
+remove all non-breaking-space entities.
+
+:guilabel:`Preprocess input`
+    This option activates |app|'s Heuristic Processing stage of the conversion pipeline.
+    It must be enabled in order for the various sub-functions to be applied.
+
+:guilabel:`Unwrap lines`
+    Enabling this option will cause |app| to attempt to detect and correct hard line breaks that exist
+    within a document using punctuation clues and line length. |app| will first attempt to detect whether
+    hard line breaks exist; if they do not appear to exist, |app| will not attempt to unwrap lines. The
+    line-unwrap factor can be reduced if you want to 'force' |app| to unwrap lines.
+
+:guilabel:`Line-unwrap factor`
+    This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
+    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
+    than the length of 40% of all lines in the document. If your document only has a few line breaks which need
+    correction, then this value should be reduced to somewhere between 0.1 and 0.2.
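As a rough sketch of the threshold idea described above (illustrative only; this is not the algorithm |app| actually ships, and the percentile-style threshold and punctuation test are simplifying assumptions), a line-unwrap pass could look like::

    def unwrap_lines(lines, factor=0.4):
        if not lines:
            return lines
        # Use the line length at the `factor` percentile as the "short line" threshold.
        lengths = sorted(len(l) for l in lines)
        limit = lengths[int(factor * (len(lengths) - 1))]
        out = []
        for line in lines:
            prev = out[-1] if out else ''
            # Join onto the previous line when it is short and does not end a sentence.
            if prev and line.strip() and len(prev) < limit \
                    and not prev.rstrip().endswith(('.', '!', '?')):
                out[-1] = prev.rstrip() + ' ' + line.lstrip()
            else:
                out.append(line)
        return out

Lowering ``factor`` shrinks the threshold, so fewer lines qualify for joining, which is why values between 0.1 and 0.2 are suggested for documents that only need a few corrections.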
+
+:guilabel:`Detect and markup unformatted chapter headings and sub headings`
+    If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
+    |app| can use this option to attempt to detect them and surround them with heading tags. <h2> tags are used
+    for chapter headings; <h3> tags are used for any titles that are detected.
+
+    This function will not create a TOC, but in many cases it will cause |app|'s default chapter detection settings
+    to correctly detect chapters and build a TOC. Adjust the XPath under Structure Detection if a TOC is not automatically
+    created. If there are no other headings used in the document then setting "//h:h2" under Structure Detection would
+    be the easiest way to create a TOC for the document.
+
+    The inserted headings are not formatted; to apply formatting, use the 'extra_css' option under
+    the Look and Feel conversion settings. For example, to center heading tags, use the following::
+
+        h2, h3 { text-align: center }
+
+:guilabel:`Renumber sequences of <h1> or <h2> tags`
+    Some publishers format chapter headings using multiple <h1> or <h2> tags sequentially.
+    |app|'s default conversion settings will cause such titles to be split into two pieces. This option
+    will re-number the heading tags to prevent splitting.
+
+:guilabel:`Delete blank lines between paragraphs`
+    This option will cause |app| to analyze blank lines included within the document. If every paragraph is interleaved
+    with a blank line, then |app| will remove all those blank paragraphs. Sequences of multiple blank lines will be
+    considered scene breaks and retained as a single paragraph. This option differs from the 'Remove Paragraph Spacing'
+    option under 'Look and Feel' in that it actually modifies the HTML content, while the other option modifies the document
+    styles. This option can also remove paragraphs which were inserted using |app|'s 'Insert blank line' option.
+
+:guilabel:`Ensure scene breaks are consistently formatted`
+    With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.
+    It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
+    page width. Some readers may find this desirable, as these 'soft' scene breaks often become page breaks on reading
+    devices and thus become difficult to distinguish.
+
+:guilabel:`Remove unnecessary hyphens`
+    |app| will analyze all hyphenated content in the document when this option is enabled. The document itself is used
+    as a dictionary for analysis. This allows |app| to accurately remove hyphens for any words in the document in any language,
+    along with made-up and obscure scientific words. The primary drawback is that words appearing only a single time in the document
+    will not be changed. Analysis happens in two passes: the first pass analyzes line endings. Lines are only unwrapped if the
+    word exists with or without a hyphen in the document. The second pass analyzes all hyphenated words throughout the document;
+    hyphens are removed if the word exists elsewhere in the document without a match.
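To make the document-as-dictionary idea concrete, here is a minimal sketch (for illustration only; the word harvesting and the regular expression are simplifying assumptions, not |app|'s actual dehyphenation code)::

    import re

    def dehyphenate(text):
        # Every word that appears in the document acts as the dictionary.
        words = set(w.lower() for w in re.findall(r'[a-zA-Z]+', text))

        def join(match):
            candidate = match.group(1) + match.group(2)
            # Drop the hyphen only if the joined form occurs elsewhere in the document.
            return candidate if candidate.lower() in words else match.group(0)

        # A word split across a line break with a trailing hyphen.
        return re.sub(r'(\w+)-\s*\n\s*(\w+)', join, text)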
+:guilabel:`Italicize common words and patterns`
+    When enabled, |app| will look for common words and patterns that denote italics and italicize them. Examples are common text
+    conventions such as ~word~ or phrases that should generally be italicized, e.g. latin phrases like 'etc.' or 'et cetera'.
+
+:guilabel:`Replace entity indents with CSS indents`
+    Some documents use a convention of defining text indents using non-breaking space entities. When this option is enabled |app| will
+    attempt to detect this sort of formatting and convert it to a 3% text indent using CSS.
+
+.. _search-replace:
+
+Search & Replace
+---------------------
+
+These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
+behind page headers and footers in the text. These options use regular expressions to try and detect
+the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
+by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
+your document. These options can also be used for generic search and replace of any content by additionally
+specifying a replacement expression.
+
+The search works by using a python regular expression. All matched text is simply removed from
+the document or replaced using the replacement pattern. You can learn more about regular expressions and
+their syntax at http://docs.python.org/library/re.html.
+
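For example, a page header that survived a PDF conversion, say a line such as ``A TALE OF TWO CITIES  123`` repeated on every page, could be stripped from the intermediate XHTML with a pattern along these lines (the markup and the pattern are only an illustration and have to be adapted to the actual document)::

    import re

    html = '<p>A TALE OF TWO CITIES  123</p>\n<p>It was the best of times...</p>'
    # An empty replacement deletes whatever the search expression matches;
    # a non-empty replacement works as a generic search and replace.
    cleaned = re.sub(r'<p>A TALE OF TWO CITIES\s+\d+</p>\s*', '', html)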
 .. _structure-detection:
 
 Structure Detection
@@ -298,21 +390,6 @@ which means that |app| will insert page breaks before every `<h1>` and `<h2>` ta
 
 The default expressions may change depending on the input format you are converting.
 
-Removing headers and footers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
-behind page headers and footers in the text. These options use regular expressions to try and detect
-the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
-by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
-your document.
-
-The header and footer regular expressions are used in conjunction with the remove header and footer options.
-If the remove option is not enabled the regular expression will not be applied to remove the matched text.
-The removal works by using a python regular expression. All matched text is simply removed from
-the document. You can learn more about regular expressions and their syntax at
-http://docs.python.org/library/re.html.
-
 Miscellaneous
 ~~~~~~~~~~~~~~
 
@@ -330,16 +407,6 @@ There are a few more options in this section.
 two covers. This option will simply remove the first image from the source document, thereby
 ensuring that the converted book has only one cover, the one specified in |app|.
 
-:guilabel:`Preprocess input`
-    This option activates various algorithms that try to detect and correct common cases of
-    badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
-    Turn this option on if your input document suffers from bad formatting. But be aware that in
-    some cases, this option can lead to worse results, so use with care.
-
-:guilabel:`Line-unwrap factor`
-    This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
-    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
-    than the length of 40% of all lines in the document.
-
 Table of Contents
 ------------------
@@ -488,26 +555,33 @@ at `mobileread <http://www.mobileread.com/forums/showthread.php?t=28313>`_.
 Convert TXT documents
 ~~~~~~~~~~~~~~~~~~~~~~
 
-TXT documents have no well defined way to specify formatting like bold, italics, etc, or document structure like paragraphs, headings, sections and so on.
-Since TXT documents provide no way to explicitly mark parts of
-the text, by default |app| only groups lines in the input document into paragraphs. The default is to assume one or
-more blank lines are a paragraph boundary::
+TXT documents have no well defined way to specify formatting like bold, italics, etc, or document
+structure like paragraphs, headings, sections and so on, but there are a variety of conventions commonly
+used. By default |app| attempts automatic detection of the correct formatting and markup based on those
+conventions.
+
+TXT input supports a number of options to differentiate how paragraphs are detected.
+
+:guilabel:`Paragraph Style: Auto`
+    Analyzes the text file and attempts to automatically determine how paragraphs are defined. This
+    option will generally work fine; if you achieve undesirable results, try one of the manual options.
+
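As a rough illustration of what such automatic analysis can look at (a simplified sketch, not the detection logic |app| actually uses), the share of blank and indented lines is already enough to separate the styles described below::

    def guess_paragraph_style(text):
        lines = text.splitlines()
        if not lines:
            return 'single'
        blanks = sum(1 for l in lines if not l.strip())
        indented = sum(1 for l in lines if l.startswith(('\t', '  ')))
        if blanks >= len(lines) * 0.2:
            return 'block'   # paragraphs separated by blank lines
        if indented >= len(lines) * 0.2:
            return 'print'   # paragraphs start with an indent
        return 'single'      # treat every line as a paragraph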
+:guilabel:`Paragraph Style: Block`
+    Assumes one or more blank lines are a paragraph boundary::
 
     This is the first.
 
     This is the
     second paragraph.
 
-TXT input supports a number of options to differentiate how paragraphs are detected.
+:guilabel:`Paragraph Style: Single`
 
-:guilabel:`Treat each line as a paragraph`
     Assumes that every line is a paragraph::
 
     This is the first.
     This is the second.
     This is the third.
 
-:guilabel:`Assume print formatting`
+:guilabel:`Paragraph Style: Print`
     Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
     the next line that starts with an indent is reached::
 
@@ -518,13 +592,28 @@ TXT input supports a number of options to differentiate how paragraphs are detec
     This is the
     third.
 
-:guilabel:`Process using markdown`
+:guilabel:`Paragraph Style: Unformatted`
+    Assumes that the document has no formatting, but does use hard line breaks. Punctuation
+    and median line length are used to attempt to re-create paragraphs.
+
+:guilabel:`Formatting Style: Auto`
+    Attempts to detect the type of formatting markup being used. If no markup is used then heuristic
+    formatting will be applied.
+
+:guilabel:`Formatting Style: Heuristic`
+    Analyzes the document for common chapter headings, scene breaks, and italicized words and applies the
+    appropriate html markup during conversion.
+
+:guilabel:`Formatting Style: Markdown`
 |app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown
 allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
 lists, a Table of Contents, etc. Marking chapter headings with a leading # and setting the chapter XPath detection
 expression to "//h:h1" is the easiest way to have a proper table of contents generated from a TXT document.
 You can learn more about the markdown syntax at `daringfireball <http://daringfireball.net/projects/markdown/syntax>`_.
+
+:guilabel:`Formatting Style: None`
+    Applies no special formatting to the text; the document is converted to html with no other changes.
 
 
 Convert PDF documents
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -52,9 +52,10 @@ def is_date_undefined(qt_or_dt):
         return True
     if hasattr(d, 'toString'):
         d = datetime(d.year(), d.month(), d.day(), tzinfo=utc_tz)
-    return d.year <= UNDEFINED_DATE.year and \
-            d.month == UNDEFINED_DATE.month and \
-            d.day == UNDEFINED_DATE.day
+    return d.year < UNDEFINED_DATE.year or (
+            d.year == UNDEFINED_DATE.year and
+            d.month == UNDEFINED_DATE.month and
+            d.day == UNDEFINED_DATE.day)
 
 def parse_date(date_string, assume_utc=False, as_utc=True, default=None):
     '''
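The rewritten return statement above changes the predicate from requiring an exact month and day match for any year at or below the sentinel, to also treating every date strictly before the sentinel year as undefined. A standalone sketch of the new logic (the sentinel value used here is an assumption for illustration; calibre's actual UNDEFINED_DATE constant may differ):

    from datetime import datetime

    UNDEFINED_DATE = datetime(101, 1, 1)  # assumed sentinel, for illustration only

    def is_undefined(d):
        return d.year < UNDEFINED_DATE.year or (
                d.year == UNDEFINED_DATE.year and
                d.month == UNDEFINED_DATE.month and
                d.day == UNDEFINED_DATE.day)

    # With the old `<= and == and ==` form, a date like datetime(100, 5, 5) was not
    # reported as undefined, because the month and day comparisons failed even though
    # the year was below the sentinel. The new form returns True for it.
    print(is_undefined(datetime(100, 5, 5)))    # True
    print(is_undefined(datetime(2010, 11, 9)))  # False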