From d335bccd67da45f2fd6b69b81e7e8d6db89fa378 Mon Sep 17 00:00:00 2001
From: ldolse \s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[](\s* )+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
@@ -397,6 +395,8 @@ class HTMLPreProcessor(object):
# Un wrap using punctuation
(re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?(i|b|u)>)?\s*( \s*(?=[[a-z\d])' % length), lambda match: ''))
for rule in self.PREPROCESS + start_rules:
html = rule[0].sub(rule[1], html)
From 936451853caa1190eff41bf07a28f39005da5fb3 Mon Sep 17 00:00:00 2001
From: ldolse )', re.DOTALL)
+ linere = re.compile('(?<= ]*>\s* |[iub]>\s* \s*<[iub]>)\s*(?P |[iub]>\s* \s*<[iub]>)\s*(?P \s*(?P \s*(?P )?'), chap_head),)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
- length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+ length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
if length:
# print "The pdf line length returned is " + str(length)
end_rules.append(
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f38d02309a..7e85e24a83 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -153,7 +153,6 @@ class PreProcessor(object):
###### Unwrap lines ######
#
- self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
@@ -168,25 +167,40 @@ class PreProcessor(object):
format = 'html'
else:
format = 'html'
-
+ # Check the line histogram to determine if the document uses hard line breaks. If 50% or
+ # more of the lines break in the same region of the document, then unwrapping is required
+ hardbreaks = line_length(format, html, .50, 'histogram')
+ print "Hard line breaks check returned "+str(hardbreaks)
# Calculate Length
- length = line_length(format, html, getattr(self.extra_opts,
- 'html_unwrap_factor', 0.4))
+ unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+ length = line_length(format, html, unwrap_factor, 'median')
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
- max_length = length * 1.4
- min_max = str("(?<=.{"+str(length)+"})(?\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+ # Dehyphenate
+ self.log("Unwrapping/Removing hyphens")
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html', length)
+ self.log("Done dehyphenating")
+ # Unwrap lines using punctuation and line length
+ unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*((p|span|div)>)?\s*(?P )', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*(?P ]*>)\s*(?P ]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s* \s*", "\n ", html)
# detect chapters/sections to match xpath or splitting logic
+ #
+ # Build the Regular Expressions in pieces
+ lookahead = "(?=<(p|div))"
+ chapter_line_open = "<(?P ]*>\s* ]*>\s* ]*>\s* tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
- # check if content is in pre tags, use txt procesor to mark up if so
+ # check if content is in pre tags, use txt processor to mark up if so
pre = re.compile(r'
', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
From 301af532c6940ec8082dbe6ece4dca351417ac63 Mon Sep 17 00:00:00 2001
From: ldolse
).*?(?=
)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=)', re.DOTALL)
lines = linere.findall(raw)
- lengths = []
- for line in lines:
- if len(line) > 0:
- lengths.append(len(line))
+ if test_type == 'median':
+ lengths = []
+ for line in lines:
+ if len(line) > 0:
+ lengths.append(len(line))
- if not lengths:
- return 0
+ if not lengths:
+ return 0
- lengths = list(set(lengths))
- total = sum(lengths)
- avg = total / len(lengths)
- max_line = avg * 2
+ lengths = list(set(lengths))
+ total = sum(lengths)
+ avg = total / len(lengths)
+ max_line = avg * 2
- lengths = sorted(lengths)
- for i in range(len(lengths) - 1, -1, -1):
- if lengths[i] > max_line:
- del lengths[i]
+ lengths = sorted(lengths)
+ for i in range(len(lengths) - 1, -1, -1):
+ if lengths[i] > max_line:
+ del lengths[i]
- if percent > 1:
- percent = 1
- if percent < 0:
- percent = 0
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
- index = int(len(lengths) * percent) - 1
+ index = int(len(lengths) * percent) - 1
- return lengths[index]
+ return lengths[index]
+
+ if test_type == 'histogram':
+ minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+ maxLineLength=1900 # Discard larger than this to stay in range
+ buckets=20 # Each line is divided into a bucket based on length
+
+ #print "there are "+str(len(lines))+" lines"
+ max = 0
+ for line in lines:
+ l = len(line)
+ if l > max:
+ max = l
+ print "max line found is "+str(max)
+ # Build the line length histogram
+ hRaw = [ 0 for i in range(0,buckets) ]
+ for line in lines:
+ l = len(line)
+ if l > minLineLength and l < maxLineLength:
+ l = int(l/100)
+ #print "adding "+str(l)
+ hRaw[l]+=1
+
+ # Normalize the histogram into percents
+ totalLines = len(lines)
+ h = [ float(count)/totalLines for count in hRaw ]
+ print "\nhRaw histogram lengths are: "+str(hRaw)
+ print " percents are: "+str(h)+"\n"
+
+ # Find the biggest bucket
+ maxValue = 0
+ peakPosition = 0
+ for i in range(0,len(h)):
+ if h[i] > maxValue:
+ maxValue = h[i]
+ peakPosition = i
+
+ if maxValue < percent:
+ #print "Line lengths are too variable. Not unwrapping."
+ return False
+ else:
+ #print str(maxValue)+" of the lines were in one bucket"
+ return True
class Dehyphenator(object):
'''
@@ -117,7 +165,7 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
@@ -125,34 +173,54 @@ class Dehyphenator(object):
def dehyphenate(self, match):
firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart')
+ try:
+ wraptags = match.group('wraptags')
+ except:
+ wraptags = ''
hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
- #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
- match = booklookup.search(self.html)
- if match:
- #print "returned dehyphenated word: " + str(dehyphenated)
- return dehyphenated
+ print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ if self.format == 'html_cleanup':
+ match = booklookup.search(self.html)
+ hyphenmatch = re.search(u'%s' % hyphenated, self.html)
+ if match:
+ print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ elif hyphenmatch:
+ print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ return hyphenated
+ else:
+ print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ return firsthalf+u'\u2014'+wraptags+secondhalf
+
else:
- #print "returned hyphenated word: " + str(hyphenated)
- return hyphenated
+ match = booklookup.search(self.html)
+ if match:
+ print "returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ else:
+ print "returned hyphenated word: " + str(hyphenated)
+ return hyphenated
def __call__(self, html, format, length=1):
self.html = html
+ self.format = format
if format == 'html':
- intextmatch = re.compile(u'(?<=.{%i})(?P
]*>\s*
){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P\s*(?=[[a-z\d])' % length), lambda match: '')) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 555f42702b..f41f6abd08 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -77,6 +77,11 @@ class PreProcessor(object): def __call__(self, html): self.log("********* Preprocessing HTML *********") + + # Arrange line feeds and
tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*", "\n", html) + html = re.sub(r"\s*\s*", "\n
", html) + ###### Check Markup ###### # # some lit files don't have any
tags or equivalent (generally just plain text between @@ -135,9 +140,7 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - # Arrange line feeds and
tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*", "\n", html) - html = re.sub(r"\s*\s*", "\n
", html)
+ #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
@@ -160,11 +163,10 @@ class PreProcessor(object):
default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
- uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
+ uppercase_chapters = r"\s*.?([A-Z#\-]+\s{0,3}){1,5}\s*"
chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print chapter_marker
- #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+ #print chapter_marker
heading = re.compile(' |[iub]>\s* \s*<[iub]>)\s*(?P |[iub]>\s* \s*<[iub]>)\s*(?P ]*>\s*
]*>\s*
).*?(?=)', re.DOTALL) + elif format == 'pdf': + linere = re.compile('(?<=\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*(?=[[a-z\d])' % length), lambda match: ''))
for rule in self.PREPROCESS + start_rules:
html = rule[0].sub(rule[1], html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 9c57756d28..96df37f631 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal ]*>\s*
' +
_('Editing meta information for %d books') %
len(rows))
@@ -170,7 +171,7 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
self.tag_editor_button.clicked.connect(self.tag_editor)
self.autonumber_series.stateChanged[int].connect(self.auto_number_changed)
- if len(db.custom_field_keys(include_composites=False)) == 0:
+ if len(self.db.custom_field_keys(include_composites=False)) == 0:
self.central_widget.removeTab(1)
else:
self.create_custom_column_editors()
@@ -617,8 +618,15 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
self.worker = Worker(args, self.db, self.ids,
getattr(self, 'custom_column_widgets', []),
Dispatcher(bb.accept, parent=bb))
- self.worker.start()
- bb.exec_()
+
+ # The metadata backup thread causes database commits
+ # which can slow down bulk editing of large numbers of books
+ self.model.stop_metadata_backup()
+ try:
+ self.worker.start()
+ bb.exec_()
+ finally:
+ self.model.start_metadata_backup()
if self.worker.error is not None:
return error_dialog(self, _('Failed'),
diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index b2a7f08055..9da5420681 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -159,17 +159,24 @@ class BooksModel(QAbstractTableModel): # {{{
# do something on the GUI thread. Deadlock.
self.cover_cache = CoverCache(db, FunctionDispatcher(self.db.cover))
self.cover_cache.start()
- if self.metadata_backup is not None:
- self.metadata_backup.stop()
- # Would like to to a join here, but the thread might be waiting to
- # do something on the GUI thread. Deadlock.
- self.metadata_backup = MetadataBackup(db)
- self.metadata_backup.start()
+ self.stop_metadata_backup()
+ self.start_metadata_backup()
def refresh_cover(event, ids):
if event == 'cover' and self.cover_cache is not None:
self.cover_cache.refresh(ids)
db.add_listener(refresh_cover)
+ def start_metadata_backup(self):
+ self.metadata_backup = MetadataBackup(self.db)
+ self.metadata_backup.start()
+
+ def stop_metadata_backup(self):
+ if getattr(self, 'metadata_backup', None) is not None:
+ self.metadata_backup.stop()
+ # Would like to do a join here, but the thread might be waiting to
+ # do something on the GUI thread. Deadlock.
+
+
def refresh_ids(self, ids, current_row=-1):
rows = self.db.refresh_ids(ids)
if rows:
diff --git a/src/calibre/gui2/preferences/misc.py b/src/calibre/gui2/preferences/misc.py
index 865115c2ed..582d110c6c 100644
--- a/src/calibre/gui2/preferences/misc.py
+++ b/src/calibre/gui2/preferences/misc.py
@@ -106,14 +106,13 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
d.exec_()
def compact(self, *args):
- from calibre.library.caches import MetadataBackup
m = self.gui.library_view.model()
- if m.metadata_backup is not None:
- m.metadata_backup.stop()
- d = CheckIntegrity(m.db, self)
- d.exec_()
- m.metadata_backup = MetadataBackup(m.db)
- m.metadata_backup.start()
+ m.stop_metadata_backup()
+ try:
+ d = CheckIntegrity(m.db, self)
+ d.exec_()
+ finally:
+ m.start_metadata_backup()
def open_config_dir(self, *args):
from calibre.utils.config import config_dir
From fef738c53b8d5a980423d1930e6a94d4ffc8a6a8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal