This commit is contained in:
Sengian 2011-01-18 23:24:21 +01:00
commit a827fcb5e1
6 changed files with 18 additions and 27 deletions

View File

@ -54,7 +54,7 @@ class HeuristicProcessor(object):
return '<'+styles+' style="page-break-before:always">'+chap return '<'+styles+' style="page-break-before:always">'+chap
def analyze_title_matches(self, match): def analyze_title_matches(self, match):
chap = match.group('chap') #chap = match.group('chap')
title = match.group('title') title = match.group('title')
if not title: if not title:
self.chapters_no_title = self.chapters_no_title + 1 self.chapters_no_title = self.chapters_no_title + 1
@ -102,8 +102,7 @@ class HeuristicProcessor(object):
min_lns = tot_ln_fds * percent min_lns = tot_ln_fds * percent
#self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup") #self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
if min_lns > tot_htm_ends: return min_lns > tot_htm_ends
return True
def dump(self, raw, where): def dump(self, raw, where):
import os import os
@ -136,7 +135,7 @@ class HeuristicProcessor(object):
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
'Mlle.', 'Mons.', 'PS.', 'PPS.', 'Mlle.', 'Mons.', 'PS.', 'PPS.',
] ]
ITALICIZE_STYLE_PATS = [ ITALICIZE_STYLE_PATS = [
r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)', r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',
r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)', r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',
@ -150,7 +149,7 @@ class HeuristicProcessor(object):
r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)', r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)',
r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)', r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)',
] ]
for word in ITALICIZE_WORDS: for word in ITALICIZE_WORDS:
html = html.replace(word, '<i>%s</i>' % word) html = html.replace(word, '<i>%s</i>' % word)
@ -242,7 +241,7 @@ class HeuristicProcessor(object):
lp_title = default_title lp_title = default_title
else: else:
lp_title = simple_title lp_title = simple_title
if ignorecase: if ignorecase:
arg_ignorecase = r'(?i)' arg_ignorecase = r'(?i)'
else: else:
@ -250,7 +249,7 @@ class HeuristicProcessor(object):
if title_req: if title_req:
lp_opt_title_open = '' lp_opt_title_open = ''
lp_opt_title_close = '' lp_opt_title_close = ''
else: else:
lp_opt_title_open = opt_title_open lp_opt_title_open = opt_title_open
lp_opt_title_close = opt_title_close lp_opt_title_close = opt_title_close
@ -399,7 +398,7 @@ class HeuristicProcessor(object):
if len(lines) > 1: if len(lines) > 1:
self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " + self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank") unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40: if float(len(blanklines)) / float(len(lines)) > 0.40:
return True return True
else: else:
@ -460,7 +459,7 @@ class HeuristicProcessor(object):
if getattr(self.extra_opts, 'markup_chapter_headings', False): if getattr(self.extra_opts, 'markup_chapter_headings', False):
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs) html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
if getattr(self.extra_opts, 'italicize_common_cases', False): if getattr(self.extra_opts, 'italicize_common_cases', False):
html = self.markup_italicis(html) html = self.markup_italicis(html)
# If more than 40% of the lines are empty paragraphs and the user has enabled delete # If more than 40% of the lines are empty paragraphs and the user has enabled delete
@ -487,7 +486,7 @@ class HeuristicProcessor(object):
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4) unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
length = docanalysis.line_length(unwrap_factor) length = docanalysis.line_length(unwrap_factor)
self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format") self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")
###### Unwrap lines ###### ###### Unwrap lines ######
if getattr(self.extra_opts, 'unwrap_lines', False): if getattr(self.extra_opts, 'unwrap_lines', False):
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor

View File

@ -7,8 +7,6 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import HeuristicProcessor
class LITInput(InputFormatPlugin): class LITInput(InputFormatPlugin):

View File

@ -3,7 +3,6 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
class MOBIInput(InputFormatPlugin): class MOBIInput(InputFormatPlugin):

View File

@ -53,6 +53,7 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
self.log = log
log.debug('Reading text from file...') log.debug('Reading text from file...')
txt = stream.read() txt = stream.read()
@ -106,7 +107,7 @@ class TXTInput(InputFormatPlugin):
log.debug('Auto detected paragraph type as %s' % options.paragraph_type) log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# Dehyphenate # Dehyphenate
dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None)) dehyphenator = Dehyphenator(options.verbose, log=self.log)
txt = dehyphenator(txt,'txt', length) txt = dehyphenator(txt,'txt', length)
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.

View File

@ -25,21 +25,21 @@ class HeuristicsWidget(Widget, Ui_Form):
) )
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)
self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics) self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics)
self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap) self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap)
self.enable_heuristics(self.opt_enable_heuristics.checkState()) self.enable_heuristics(self.opt_enable_heuristics.checkState())
def break_cycles(self): def break_cycles(self):
Widget.break_cycles(self) Widget.break_cycles(self)
try: try:
self.opt_enable_heuristics.stateChanged.disconnect() self.opt_enable_heuristics.stateChanged.disconnect()
self.opt_unwrap_lines.stateChanged.disconnect() self.opt_unwrap_lines.stateChanged.disconnect()
except: except:
pass pass
def set_value_handler(self, g, val): def set_value_handler(self, g, val):
if val is None and g is self.opt_html_unwrap_factor: if val is None and g is self.opt_html_unwrap_factor:
g.setValue(0.0) g.setValue(0.0)
@ -57,7 +57,7 @@ class HeuristicsWidget(Widget, Ui_Form):
self.opt_format_scene_breaks.setEnabled(state) self.opt_format_scene_breaks.setEnabled(state)
self.opt_dehyphenate.setEnabled(state) self.opt_dehyphenate.setEnabled(state)
self.opt_renumber_headings.setEnabled(state) self.opt_renumber_headings.setEnabled(state)
self.opt_unwrap_lines.setEnabled(state) self.opt_unwrap_lines.setEnabled(state)
if state and self.opt_unwrap_lines.checkState() == Qt.Checked: if state and self.opt_unwrap_lines.checkState() == Qt.Checked:
self.opt_html_unwrap_factor.setEnabled(True) self.opt_html_unwrap_factor.setEnabled(True)

View File

@ -19,7 +19,7 @@ from calibre.devices.scanner import DeviceScanner
from calibre.gui2 import config, error_dialog, Dispatcher, dynamic, \ from calibre.gui2 import config, error_dialog, Dispatcher, dynamic, \
warning_dialog, info_dialog, choose_dir warning_dialog, info_dialog, choose_dir
from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata import authors_to_string
from calibre import preferred_encoding, prints, force_unicode from calibre import preferred_encoding, prints, force_unicode, as_unicode
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.devices.errors import FreeSpaceError from calibre.devices.errors import FreeSpaceError
from calibre.devices.apple.driver import ITUNES_ASYNC from calibre.devices.apple.driver import ITUNES_ASYNC
@ -68,13 +68,7 @@ class DeviceJob(BaseJob): # {{{
if self._aborted: if self._aborted:
return return
self.failed = True self.failed = True
try: ex = as_unicode(err)
ex = unicode(err)
except:
try:
ex = str(err).decode(preferred_encoding, 'replace')
except:
ex = repr(err)
self._details = ex + '\n\n' + \ self._details = ex + '\n\n' + \
traceback.format_exc() traceback.format_exc()
self.exception = err self.exception = err