This commit is contained in:
Kovid Goyal 2011-01-09 10:15:53 -07:00
commit 26e8ec2fd0
4 changed files with 53 additions and 24 deletions

View File

@ -79,7 +79,7 @@ class DocAnalysis(object):
elif format == 'spanned_html': elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL) linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt': elif format == 'txt':
linere = re.compile('.*?\n', re.DOTALL) linere = re.compile('.*?\n')
self.lines = linere.findall(raw) self.lines = linere.findall(raw)
def line_length(self, percent): def line_length(self, percent):
@ -177,7 +177,7 @@ class Dehyphenator(object):
def __init__(self): def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match - # Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex' # don't add suffixes which are also complete words, such as 'able' or 'sex'
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation # remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE) self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE) self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@ -199,7 +199,7 @@ class Dehyphenator(object):
searchresult = self.html.find(lookupword.lower()) searchresult = self.html.find(lookupword.lower())
except: except:
return hyphenated return hyphenated
if self.format == 'html_cleanup': if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1: if self.html.find(lookupword) != -1 or searchresult != -1:
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated) #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated return dehyphenated
@ -225,10 +225,15 @@ class Dehyphenator(object):
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length) intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
elif format == 'pdf': elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length) intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
elif format == 'txt':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words': elif format == 'individual_words':
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'html_cleanup': elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)') intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
elif format == 'txt_cleanup':
intextmatch = re.compile(u'(?P<firstpart>\w+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html) html = intextmatch.sub(self.dehyphenate, html)
return html return html

View File

@ -190,7 +190,7 @@ class PreProcessor(object):
line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?" line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*" blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
txt_line_wrap = u"(\u0020|\u0009)*\n" txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
unwrap_regex = lookahead+line_ending+blanklines+line_opening unwrap_regex = lookahead+line_ending+blanklines+line_opening
if format == 'txt': if format == 'txt':
@ -357,6 +357,6 @@ class PreProcessor(object):
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html) html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
# Center separator lines # Center separator lines
html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html) html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
return html return html

View File

@ -7,11 +7,12 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
convert_heuristic convert_heuristic, normalize_line_endings
from calibre import _ent_pat, xml_entity_to_unicode from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -23,7 +24,7 @@ class TXTInput(InputFormatPlugin):
options = set([ options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto', OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=['auto', 'block', 'single', 'print'], choices=['auto', 'block', 'single', 'print', 'unformatted'],
help=_('Paragraph structure.\n' help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
'* auto: Try to auto detect paragraph type.\n' '* auto: Try to auto detect paragraph type.\n'
@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin):
'* single: Assume every line is a paragraph.\n' '* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab ' '* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.' 'starts a paragraph.'
'* unformatted: Most lines have hard line breaks, few/no spaces or indents.')), '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
OptionRecommendation(name='formatting_type', recommended_value='auto', OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'heuristic', 'markdown'], choices=['auto', 'none', 'heuristic', 'markdown'],
help=_('Formatting used within the document.' help=_('Formatting used within the document.'
@ -72,6 +73,13 @@ class TXTInput(InputFormatPlugin):
# followed by the &nbsp; entity. # followed by the &nbsp; entity.
if options.preserve_spaces: if options.preserve_spaces:
txt = preserve_spaces(txt) txt = preserve_spaces(txt)
# Normalize line endings
txt = normalize_line_endings(txt)
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
if options.formatting_type == 'auto': if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt) options.formatting_type = detect_formatting_type(txt)
@ -91,10 +99,15 @@ class TXTInput(InputFormatPlugin):
log.debug('Could not reliably determine paragraph type using block') log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block' options.paragraph_type = 'block'
else: else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type) log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# Dehyphenate
dehyphenator = Dehyphenator()
txt = dehyphenator(txt,'txt', length)
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.
# single and print at transformed to block for processing. # single and print at transformed to block for processing.
if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted': if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
txt = separate_paragraphs_single_line(txt) txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print': elif options.paragraph_type == 'print':
@ -102,10 +115,8 @@ class TXTInput(InputFormatPlugin):
if options.paragraph_type == 'unformatted': if options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import PreProcessor from calibre.ebooks.conversion.utils import PreProcessor
from calibre.ebooks.conversion.preprocess import DocAnalysis
# get length # get length
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
# unwrap lines based on punctuation # unwrap lines based on punctuation
preprocessor = PreProcessor(options, log=getattr(self, 'log', None)) preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
@ -116,7 +127,11 @@ class TXTInput(InputFormatPlugin):
html = convert_heuristic(txt, epub_split_size_kb=flow_size) html = convert_heuristic(txt, epub_split_size_kb=flow_size)
else: else:
html = convert_basic(txt, epub_split_size_kb=flow_size) html = convert_basic(txt, epub_split_size_kb=flow_size)
# Dehyphenate in cleanup mode for missed txt and markdown conversion
dehyphenator = Dehyphenator()
html = dehyphenator(html,'txt_cleanup', length)
html = dehyphenator(html,'html_cleanup', length)
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html') html_input = plugin_for_input_format('html')

View File

@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
safe_mode=False) safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt)) return HTML_TEMPLATE % (title, md.convert(txt))
def separate_paragraphs_single_line(txt): def normalize_line_endings(txt):
txt = txt.replace('\r\n', '\n') txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n') txt = txt.replace('\r', '\n')
return txt
def separate_paragraphs_single_line(txt):
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt) txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
return txt return txt
@ -117,7 +120,7 @@ def detect_paragraph_type(txt):
single: Each line is a paragraph. single: Each line is a paragraph.
print: Each paragraph starts with a 2+ spaces or a tab print: Each paragraph starts with a 2+ spaces or a tab
and ends when a new paragraph is reached. and ends when a new paragraph is reached.
unformatted: most lines have hard line breaks, few/no spaces or indents unformatted: most lines have hard line breaks, few/no blank lines or indents
returns block, single, print, unformatted returns block, single, print, unformatted
''' '''
@ -130,15 +133,21 @@ def detect_paragraph_type(txt):
hardbreaks = docanalysis.line_histogram(.55) hardbreaks = docanalysis.line_histogram(.55)
if hardbreaks: if hardbreaks:
# Check for print # Determine print percentage
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
if tab_line_count / float(txt_line_count) >= .15: print_percent = tab_line_count / float(txt_line_count)
return 'print'
# Determine block percentage
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt)) empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .15: block_percent = empty_line_count / float(txt_line_count)
return 'block'
# Compare the two types - the type with the larger number of instances wins
# in cases where only one or the other represents the vast majority of the document neither wins
if print_percent >= block_percent:
if .15 <= print_percent <= .75:
return 'print'
elif .15 <= block_percent <= .75:
return 'block'
# Assume unformatted text with hardbreaks if nothing else matches # Assume unformatted text with hardbreaks if nothing else matches
return 'unformatted' return 'unformatted'