Mirror of https://github.com/kovidgoyal/calibre.git
Commit 26e8ec2fd0
@@ -79,7 +79,7 @@ class DocAnalysis(object):
         elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
         elif format == 'txt':
-            linere = re.compile('.*?\n', re.DOTALL)
+            linere = re.compile('.*?\n')
         self.lines = linere.findall(raw)

     def line_length(self, percent):
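
The change above only drops a flag that makes no difference here: with the non-greedy '.*?' the pattern stops at the first newline whether or not re.DOTALL is set. A minimal sketch (not part of the commit) of how the 'txt' branch ends up populating self.lines:

import re

raw = "first line\nsecond line\nthird line\n"
linere = re.compile('.*?\n')
lines = linere.findall(raw)
# lines == ['first line\n', 'second line\n', 'third line\n']
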
@@ -177,7 +177,7 @@ class Dehyphenator(object):
     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
         self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
         self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
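
The suffix list above is what the Dehyphenator strips from a rejoined word before looking the root up in the document; the commit generalises 'ment(s)?' to 'm?ents?' and adds 'ian'. A standalone sketch (not part of the commit, and much simpler than the real lookup logic) of what the pattern removes:

import re

removesuffixes = re.compile(
    r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|"
    r"gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|"
    r"(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)

print(removesuffixes.sub('', 'settlements'))  # -> 'settle'
print(removesuffixes.sub('', 'Canadian'))     # -> 'Canad' (covered by the new 'ian' branch)
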
@@ -199,7 +199,7 @@ class Dehyphenator(object):
             searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
-        if self.format == 'html_cleanup':
+        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
@@ -225,10 +225,15 @@ class Dehyphenator(object):
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+        elif format == 'txt':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+        elif format == 'txt_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')

         html = intextmatch.sub(self.dehyphenate, html)
         return html
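
The two new branches handle plain text: 'txt' matches a hyphen at a wrapped line ending in raw text, and 'txt_cleanup' catches leftovers after conversion. A minimal sketch (not part of the commit) of what the 'txt_cleanup' pattern targets; the real dehyphenate callback also consults the rest of the document before joining, here the halves are simply rejoined:

import re

intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')

sample = u'the word is hyphen-\nated across a wrapped line'
fixed = intextmatch.sub(lambda m: m.group('firstpart') + m.group('secondpart'), sample)
# fixed == u'the word is hyphenated across a wrapped line'
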
@@ -190,7 +190,7 @@ class PreProcessor(object):
         line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
         line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
-        txt_line_wrap = u"(\u0020|\u0009)*\n"
+        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"

         unwrap_regex = lookahead+line_ending+blanklines+line_opening
         if format == 'txt':
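
Widening txt_line_wrap from a single newline to '((\u0020|\u0009)*\n){1,4}' lets the txt unwrap pattern bridge up to four consecutive line endings. A minimal sketch (not part of the commit) showing only this wrap component, without the lookahead that the full unwrap_regex places in front of it:

import re

txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
sample = u"the sentence continues\n\nafter a blank line"
print(re.sub(txt_line_wrap, u" ", sample))
# -> "the sentence continues after a blank line"
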
@@ -357,6 +357,6 @@ class PreProcessor(object):
         html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)

         # Center separator lines
-        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)

         return html
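
The broadened pattern centers scene-break separators even when they sit inside inline tags within the paragraph, not only bare '<p>* * *</p>' blocks. A quick sketch (not part of the commit) of the new expression applied to one such paragraph:

import re

html = u'<p class="x"><b>* * *</b></p>'
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>',
              '<p style="text-align:center">' + r'\g<break>' + '</p>', html)
# html == u'<p style="text-align:center">* * *</p>'
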
@@ -7,11 +7,12 @@ __docformat__ = 'restructuredtext en'
 import os

 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
     preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    convert_heuristic
+    convert_heuristic, normalize_line_endings
 from calibre import _ent_pat, xml_entity_to_unicode

 class TXTInput(InputFormatPlugin):
@@ -23,7 +24,7 @@ class TXTInput(InputFormatPlugin):

     options = set([
         OptionRecommendation(name='paragraph_type', recommended_value='auto',
-            choices=['auto', 'block', 'single', 'print'],
+            choices=['auto', 'block', 'single', 'print', 'unformatted'],
             help=_('Paragraph structure.\n'
                    'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
                    '* auto: Try to auto detect paragraph type.\n'
@@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin):
                    '* single: Assume every line is a paragraph.\n'
                    '* print: Assume every line starting with 2+ spaces or a tab '
                    'starts a paragraph.'
-                   '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
+                   '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
         OptionRecommendation(name='formatting_type', recommended_value='auto',
             choices=['auto', 'none', 'heuristic', 'markdown'],
             help=_('Formatting used within the document.'
@@ -72,6 +73,13 @@ class TXTInput(InputFormatPlugin):
         # followed by the entity.
         if options.preserve_spaces:
             txt = preserve_spaces(txt)

+        # Normalize line endings
+        txt = normalize_line_endings(txt)
+
+        # Get length for hyphen removal and punctuation unwrap
+        docanalysis = DocAnalysis('txt', txt)
+        length = docanalysis.line_length(.5)
+
         if options.formatting_type == 'auto':
             options.formatting_type = detect_formatting_type(txt)
@@ -91,10 +99,15 @@ class TXTInput(InputFormatPlugin):
                 log.debug('Could not reliably determine paragraph type using block')
                 options.paragraph_type = 'block'
             else:
                 log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

+        # Dehyphenate
+        dehyphenator = Dehyphenator()
+        txt = dehyphenator(txt,'txt', length)
+
         # We don't check for block because the processor assumes block.
         # single and print at transformed to block for processing.

         if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
             txt = separate_paragraphs_single_line(txt)
         elif options.paragraph_type == 'print':
@@ -102,10 +115,8 @@ class TXTInput(InputFormatPlugin):

         if options.paragraph_type == 'unformatted':
             from calibre.ebooks.conversion.utils import PreProcessor
-            from calibre.ebooks.conversion.preprocess import DocAnalysis
             # get length
-            docanalysis = DocAnalysis('txt', txt)
-            length = docanalysis.line_length(.5)
             # unwrap lines based on punctuation
             preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
             txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
@@ -116,7 +127,11 @@ class TXTInput(InputFormatPlugin):
             html = convert_heuristic(txt, epub_split_size_kb=flow_size)
         else:
             html = convert_basic(txt, epub_split_size_kb=flow_size)

+        # Dehyphenate in cleanup mode for missed txt and markdown conversion
+        dehyphenator = Dehyphenator()
+        html = dehyphenator(html,'txt_cleanup', length)
+        html = dehyphenator(html,'html_cleanup', length)

         from calibre.customize.ui import plugin_for_input_format
         html_input = plugin_for_input_format('html')
@@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
                   safe_mode=False)
     return HTML_TEMPLATE % (title, md.convert(txt))

-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
     txt = txt.replace('\r\n', '\n')
     txt = txt.replace('\r', '\n')
+    return txt
+
+def separate_paragraphs_single_line(txt):
     txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
     return txt

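
The new helper simply collapses DOS and old-Mac line endings to '\n' before any of the newline-based regexes run. A trivial usage sketch (not part of the commit):

def normalize_line_endings(txt):
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    return txt

print(repr(normalize_line_endings('one\r\ntwo\rthree')))  # -> 'one\ntwo\nthree'
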
@@ -117,7 +120,7 @@ def detect_paragraph_type(txt):
     single: Each line is a paragraph.
     print: Each paragraph starts with a 2+ spaces or a tab
            and ends when a new paragraph is reached.
-    unformatted: most lines have hard line breaks, few/no spaces or indents
+    unformatted: most lines have hard line breaks, few/no blank lines or indents

     returns block, single, print, unformatted
     '''
@@ -130,15 +133,21 @@ def detect_paragraph_type(txt):
     hardbreaks = docanalysis.line_histogram(.55)

     if hardbreaks:
-        # Check for print
+        # Determine print percentage
         tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-        if tab_line_count / float(txt_line_count) >= .15:
-            return 'print'
+        print_percent = tab_line_count / float(txt_line_count)

-        # Check for block
+        # Determine block percentage
         empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-        if empty_line_count / float(txt_line_count) >= .15:
-            return 'block'
+        block_percent = empty_line_count / float(txt_line_count)
+
+        # Compare the two types - the type with the larger number of instances wins
+        # in cases where only one or the other represents the vast majority of the document neither wins
+        if print_percent >= block_percent:
+            if .15 <= print_percent <= .75:
+                return 'print'
+        elif .15 <= block_percent <= .75:
+            return 'block'

         # Assume unformatted text with hardbreaks if nothing else matches
         return 'unformatted'
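
The rewritten branch replaces two independent 15% thresholds with a comparison of the two ratios: the larger one decides, but it only wins if it falls in the 0.15-0.75 band (ties go to 'print'); otherwise the text falls through to 'unformatted'. A standalone sketch (not part of the commit) with hypothetical ratios:

def pick_type(print_percent, block_percent):
    if print_percent >= block_percent:
        if .15 <= print_percent <= .75:
            return 'print'
    elif .15 <= block_percent <= .75:
        return 'block'
    return 'unformatted'

print(pick_type(0.30, 0.10))  # -> 'print'
print(pick_type(0.05, 0.40))  # -> 'block'
print(pick_type(0.90, 0.02))  # -> 'unformatted' (print dominates the whole document)
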