<o:p> tags to simplify other processing
- html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
- # Get rid of empty span tags
- html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
-
- # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
- linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
- blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
- blanklines = blankreg.findall(html)
- lines = linereg.findall(html)
- if len(lines) > 1:
- self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
- if float(len(blanklines)) / float(len(lines)) > 0.40:
- self.log("deleting blank lines")
- html = blankreg.sub('', html)
- # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
- html = re.sub(r"\s*</p>", "</p>\n", html)
-
- # some lit files don't have any <p> tags or equivalent, check and
- # mark up line endings if required before proceeding
- if no_markup(html, 0.1):
- self.log("not enough paragraph markers, adding now")
- add_markup = re.compile('(?<!>)(\n)')
- html = add_markup.sub('</p>\n<p>', html)
-
- # detect chapters/sections to match xpath or splitting logic
- #
- # Mark split points based on embedded links
- chaplink = re.compile(r'<a\s+href[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE)
- html = chaplink.sub(chapter_link, html)
- # Continue with alternate patterns, start with most typical chapter headings
- if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
- html = chapdetect.sub(chapter_head, html)
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
- chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
- html = chapdetect2.sub(chapter_head, html)
- #
- # Unwrap lines using punctuation if the median length of all lines is less than 150
- length = line_length('html', html, 0.4)
- self.log("*** Median line length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- self.log("Unwrapping Lines")
- html = unwrap.sub(' ', html)
- # If still no sections after unwrapping lines break on lines with no punctuation
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
- #self.log(html)
- chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
- html = chapdetect3.sub(chapter_head, html)
- # search for places where a first or second level heading is immediately followed by another
- # top level heading. demote the second heading to h3 to prevent splitting between chapter
- # headings and titles, images, etc
- doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
- html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
-
+ preprocessor = PreProcessor(html)
+ html = preprocessor(html)
return html
diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 3ae9f8ccca..c151551866 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -21,7 +21,7 @@ class Reader(FormatReader):
self.options = options
setattr(self.options, 'new_pdf_engine', False)
setattr(self.options, 'no_images', False)
- setattr(self.options, 'unwrap_factor', 0.5)
+ setattr(self.options, 'unwrap_factor', 0.45)
def extract_content(self, output_dir):
self.log.info('Extracting PDF...')
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 113c3d99d8..14b3552b04 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin):
OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.45, this is the median line length.')),
+ 'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine.'))
])
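
[Editor's aside, not part of the patch: the unwrap_factor heuristic above picks a
length from the distribution of distinct line lengths, so 0.45 lands just below
the median. A minimal sketch of the idea behind the line_length() helper this
series reworks; the function name and the pdftohtml-style <br> line splitting
here are assumptions, not the exact calibre API.]

    import re

    def estimate_line_length(html, percent=0.45):
        # text between pdftohtml-style <br> markers, one entry per rendered line
        lines = re.findall(r'(?s)(?<=<br>).*?(?=<br>)', html)
        lengths = sorted(set(len(line) for line in lines if line))
        if not lengths:
            return 0
        percent = min(max(percent, 0.0), 1.0)  # clamp to [0, 1]
        return lengths[max(int(len(lengths) * percent) - 1, 0)]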
From f6de0bef13d7d1001b951d465cff3135aad616ed Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 11 Sep 2010 22:15:09 +1000
Subject: [PATCH 14/43] replaced messed up rtf file
---
src/calibre/ebooks/rtf/preprocess.py | 624 +++++++++++++--------------
1 file changed, 289 insertions(+), 335 deletions(-)
diff --git a/src/calibre/ebooks/rtf/preprocess.py b/src/calibre/ebooks/rtf/preprocess.py
index ee45da697f..a3076651fd 100644
--- a/src/calibre/ebooks/rtf/preprocess.py
+++ b/src/calibre/ebooks/rtf/preprocess.py
@@ -1,390 +1,344 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__copyright__ = '2010, Gerendi Sandor Attila'
__docformat__ = 'restructuredtext en'
-import functools, re
+"""
+RTF tokenizer and token parser. v.1.0 (1/17/2010)
+Author: Gerendi Sandor Attila
-from calibre import entity_to_unicode
+At this point this will tokenize an RTF file and then rebuild it from the tokens.
+In the process the UTF8 tokens are altered so they are supported by RTF2XML while remaining compliant with the RTF specification.
+"""
-XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
-SVG_NS = 'http://www.w3.org/2000/svg'
-XLINK_NS = 'http://www.w3.org/1999/xlink'
+class tokenDelimitatorStart():
+ def __init__(self):
+ pass
+ def toRTF(self):
+ return b'{'
+ def __repr__(self):
+ return '{'
-convert_entities = functools.partial(entity_to_unicode,
- result_exceptions = {
- u'<' : '&lt;',
- u'>' : '&gt;',
- u"'" : '&apos;',
- u'"' : '&quot;',
- u'&' : '&amp;',
- })
-_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+class tokenDelimitatorEnd():
+ def __init__(self):
+ pass
+ def toRTF(self):
+ return b'}'
+ def __repr__(self):
+ return '}'
-LIGATURES = {
-# u'\u00c6': u'AE',
-# u'\u00e6': u'ae',
-# u'\u0152': u'OE',
-# u'\u0153': u'oe',
-# u'\u0132': u'IJ',
-# u'\u0133': u'ij',
-# u'\u1D6B': u'ue',
- u'\uFB00': u'ff',
- u'\uFB01': u'fi',
- u'\uFB02': u'fl',
- u'\uFB03': u'ffi',
- u'\uFB04': u'ffl',
- u'\uFB05': u'ft',
- u'\uFB06': u'st',
- }
+class tokenControlWord():
+ def __init__(self, name, separator = ''):
+ self.name = name
+ self.separator = separator
+ def toRTF(self):
+ return self.name + self.separator
+ def __repr__(self):
+ return self.name + self.separator
-_ligpat = re.compile(u'|'.join(LIGATURES))
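
[Editor's aside: how the removed table above was applied, re-run as a
self-contained sketch with a subset of the entries:]

    import re
    LIGATURES = {u'\ufb01': u'fi', u'\ufb02': u'fl'}
    _ligpat = re.compile(u'|'.join(LIGATURES))
    print(_ligpat.sub(lambda m: LIGATURES[m.group()], u'\ufb01sh \ufb02y'))  # fish fly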
+class tokenControlWordWithNumericArgument():
+ def __init__(self, name, argument, separator = ''):
+ self.name = name
+ self.argument = argument
+ self.separator = separator
+ def toRTF(self):
+ return self.name + repr(self.argument) + self.separator
+ def __repr__(self):
+ return self.name + repr(self.argument) + self.separator
-def sanitize_head(match):
- x = match.group(1)
- x = _span_pat.sub('', x)
- return '<head>\n%s\n</head>' % x
+class tokenControlSymbol():
+ def __init__(self, name):
+ self.name = name
+ def toRTF(self):
+ return self.name
+ def __repr__(self):
+ return self.name
-def chap_head(match):
- chap = match.group('chap')
- title = match.group('title')
- if not title:
- return '<h1>'+chap+'</h1><br/>\n'
- else:
- return '<h1>'+chap+'</h1><br/>\n<h3>'+title+'</h3>\n'
+class tokenData():
+ def __init__(self, data):
+ self.data = data
+ def toRTF(self):
+ return self.data
+ def __repr__(self):
+ return self.data
-def wrap_lines(match):
- ital = match.group('ital')
- if not ital:
- return ' '
- else:
- return ital+' '
+class tokenBinN():
+ def __init__(self, data, separator = ''):
+ self.data = data
+ self.separator = separator
+ def toRTF(self):
+ return "\\bin" + repr(len(self.data)) + self.separator + self.data
+ def __repr__(self):
+ return "\\bin" + repr(len(self.data)) + self.separator + self.data
+
+class token8bitChar():
+ def __init__(self, data):
+ self.data = data
+ def toRTF(self):
+ return "\\'" + self.data
+ def __repr__(self):
+ return "\\'" + self.data
+
+class tokenUnicode():
+ def __init__(self, data, separator = '', current_ucn = 1, eqList = []):
+ self.data = data
+ self.separator = separator
+ self.current_ucn = current_ucn
+ self.eqList = eqList
+ def toRTF(self):
+ result = '\\u' + repr(self.data) + ' '
+ ucn = self.current_ucn
+ if len(self.eqList) < ucn:
+ ucn = len(self.eqList)
+ result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
+ i = 0
+ for eq in self.eqList:
+ if i >= ucn:
+ break
+ result = result + eq.toRTF()
+ return result
+ def __repr__(self):
+ return '\\u' + repr(self.data)
-def line_length(format, raw, percent):
- '''
- raw is the raw text to find the line length to use for wrapping.
- percentage is a decimal number, 0 - 1 which is used to determine
- how far in the list of line lengths to use. The list of line lengths is
- ordered smallest to largest and does not include duplicates. 0.5 is the
- median value.
- '''
- raw = raw.replace(' ', ' ')
- if format == 'html':
- linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
- elif format == 'pdf':
- linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
- lines = linere.findall(raw)
- print "percent is " + str(percent)
+def isAsciiLetter(value):
+ return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))
- lengths = []
- for line in lines:
- if len(line) > 0:
- lengths.append(len(line))
+def isDigit(value):
+ return (value >= '0') and (value <= '9')
- if not lengths:
- return 0
+def isChar(value, char):
+ return value == char
- lengths = list(set(lengths))
- total = sum(lengths)
- avg = total / len(lengths)
- max_line = avg * 2
-
- lengths = sorted(lengths)
- for i in range(len(lengths) - 1, -1, -1):
- if lengths[i] > max_line:
- del lengths[i]
-
- if percent > 1:
- percent = 1
- if percent < 0:
- percent = 0
-
- index = int(len(lengths) * percent) - 1
-
- return lengths[index]
+def isString(buffer, string):
+ return buffer == string
-class CSSPreProcessor(object):
+class RtfTokenParser():
+ def __init__(self, tokens):
+ self.tokens = tokens
+ self.process()
+ self.processUnicode()
- PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
+ def process(self):
+ i = 0
+ newTokens = []
+ while i < len(self.tokens):
+ if isinstance(self.tokens[i], tokenControlSymbol):
+ if isString(self.tokens[i].name, "\\'"):
+ i = i + 1
+ if not isinstance(self.tokens[i], tokenData):
+ raise Exception('Error: token8bitChar without data.')
+ if len(self.tokens[i].data) < 2:
+ raise Exception('Error: token8bitChar without data.')
+ newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
+ if len(self.tokens[i].data) > 2:
+ newTokens.append(tokenData(self.tokens[i].data[2:]))
+ i = i + 1
+ continue
- def __call__(self, data, add_namespace=False):
- from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
- data = self.PAGE_PAT.sub('', data)
- if not add_namespace:
- return data
- ans, namespaced = [], False
- for line in data.splitlines():
- ll = line.lstrip()
- if not (namespaced or ll.startswith('@import') or
- ll.startswith('@charset')):
- ans.append(XHTML_CSS_NAMESPACE.strip())
- namespaced = True
- ans.append(line)
+ newTokens.append(self.tokens[i])
+ i = i + 1
- return u'\n'.join(ans)
+ self.tokens = list(newTokens)
-class HTMLPreProcessor(object):
+ def processUnicode(self):
+ i = 0
+ newTokens = []
+ ucNbStack = [1]
+ while i < len(self.tokens):
+ if isinstance(self.tokens[i], tokenDelimitatorStart):
+ ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], tokenDelimitatorEnd):
+ ucNbStack.pop()
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
+ if isString(self.tokens[i].name, '\\uc'):
+ ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isString(self.tokens[i].name, '\\u'):
+ x = i
+ j = 0
+ i = i + 1
+ replace = []
+ partialData = None
+ ucn = ucNbStack[len(ucNbStack) - 1]
+ while (i < len(self.tokens)) and (j < ucn):
+ if isinstance(self.tokens[i], tokenDelimitatorStart):
+ break
+ if isinstance(self.tokens[i], tokenDelimitatorEnd):
+ break
+ if isinstance(self.tokens[i], tokenData):
+ if len(self.tokens[i].data) >= ucn - j:
+ replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
+ if len(self.tokens[i].data) > ucn - j:
+ partialData = tokenData(self.tokens[i].data[ucn - j:])
+ i = i + 1
+ break
+ else:
+ replace.append(self.tokens[i])
+ j = j + len(self.tokens[i].data)
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
+ replace.append(self.tokens[i])
+ i = i + 1
+ j = j + 1
+ continue
+ raise Exception('Error: incorrect utf replacement.')
- PREPROCESS = [
- # Some idiotic HTML generators (Frontpage I'm looking at you)
- # Put all sorts of crap into <head>. This messes up lxml
- (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
- sanitize_head),
- # Convert all entities, since lxml doesn't handle them well
- (re.compile(r'&(\S+?);'), convert_entities),
- # Remove the <![if/endif tags inserted by everybody's darling, MS Word
- (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
- lambda match: ''),
- ]
+ #calibre rtf2xml does not support utfreplace
+ replace = []
- # Fix pdftohtml markup
- PDFTOHTML = [
- # Fix umlauts
- # ¨
- (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
- (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
- (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
- (re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'),
- (re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'),
- (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
- (re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'),
- (re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'),
- (re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'),
- (re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'),
+ newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
+ if partialData != None:
+ newTokens.append(partialData)
+ continue
- # Fix accents
- # `
- (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
- (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
- (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'),
- (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'),
- (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'),
- (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'),
- (re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'),
- (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
- (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
- (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+ newTokens.append(self.tokens[i])
+ i = i + 1
- # ´
- (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
- (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'),
- (re.compile(u'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ć'),
- (re.compile(u'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ć'),
- (re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'),
- (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
- (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
- (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
- (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
- (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
- (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
- (re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
- (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
- (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
- (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
- (re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'),
- (re.compile(u'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ź'),
- (re.compile(u'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ź'),
+ self.tokens = list(newTokens)
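
[Editor's trace of processUnicode on a small token stream, values assumed:]

    # in : { , \uc2 , \u1055 , tokenData('P?') , }
    # out: { , \uc2 , tokenUnicode(1055) , }
    # The two fallback chars 'P?' are consumed and dropped (the comment above
    # notes calibre's rtf2xml does not support utf replacement), so toRTF()
    # later re-emits the token as '\uc0\u1055 ' with nothing left to skip.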
- # ˆ
- (re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ô'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'),
- # ¸
- (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
- (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
+ def toRTF(self):
+ result = []
+ for token in self.tokens:
+ result.append(token.toRTF())
+ return "".join(result)
- # ˛
- (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'),
- (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'),
- (re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'),
- (re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'),
-
- # ˙
- (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
- (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
-
- # If pdf printed from a browser then the header/footer has a reliable pattern
- (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+class RtfTokenizer():
+ def __init__(self, rtfData):
+ self.rtfData = []
+ self.tokens = []
+ self.rtfData = rtfData
+ self.tokenize()
- # Center separator lines
- (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '\n<p style="text-align:center">' + match.group(1) + '</p>'),
+ def tokenize(self):
+ i = 0
+ lastDataStart = -1
+ while i < len(self.rtfData):
- # Remove page links
- (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
- # Remove <hr> tags
- (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
- # Replace <br><br> with <p>
- # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
+ if isChar(self.rtfData[i], '{'):
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
+ self.tokens.append(tokenDelimitatorStart())
+ i = i + 1
+ continue
- # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
- (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
+ if isChar(self.rtfData[i], '}'):
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
+ self.tokens.append(tokenDelimitatorEnd())
+ i = i + 1
+ continue
- # Remove gray background
- (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
+ if isChar(self.rtfData[i], '\\'):
+ if i + 1 >= len(self.rtfData):
+ raise Exception('Error: Control character found at the end of the document.')
- # Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
- # Have paragraphs show better
- (re.compile(r'<br.*?>'), lambda match : '<p>'),
- # Clean up spaces
- (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
- # Add space before and after italics
- (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
- (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
-
- ]
+ tokenStart = i
+ i = i + 1
- # Fix Book Designer markup
- BOOK_DESIGNER = [
- # HR
- (re.compile('<hr>', re.IGNORECASE),
- lambda match : '<span style="page-break-after:always"> </span>'),
- # Create header tags
- (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
- lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
- (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
- lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
- (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
- lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
- (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
- lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
- ]
- def __init__(self, input_plugin_preprocess, plugin_preprocess,
- extra_opts=None):
- self.input_plugin_preprocess = input_plugin_preprocess
- self.plugin_preprocess = plugin_preprocess
- self.extra_opts = extra_opts
+ #Control Words
+ if isAsciiLetter(self.rtfData[i]):
+ #consume
+ consumed = False
+ while i < len(self.rtfData):
+ if not isAsciiLetter(self.rtfData[i]):
+ tokenEnd = i
+ consumed = True
+ break
+ i = i + 1
- def is_baen(self, src):
- return re.compile(r'<meta\s+name="Publisher"\s+content=".*?BAEN.*?"',
- re.IGNORECASE).search(src) is not None
-
- def is_book_designer(self, raw):
- return re.search('<H2[^><]*id=BookTitle', raw) is not None
+ #we have numeric argument before delimiter
+ if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
+ #consume the numeric argument
+ consumed = False
+ l = 0
+ while i < len(self.rtfData):
+ if not isDigit(self.rtfData[i]):
+ consumed = True
+ break
+ l = l + 1
+ i = i + 1
+ if l > 10 :
+ raise Exception('Error (at:%d): Too many digits in control word numeric argument.' % tokenStart)
- def is_pdftohtml(self, src):
- return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+ if not consumed:
+ raise Exception('Error (at:%d): Control Word without numeric argument end.' % tokenStart)
- def __call__(self, html, remove_special_chars=None,
- get_preprocess_html=False):
- if remove_special_chars is not None:
- html = remove_special_chars.sub('', html)
- html = html.replace('\0', '')
- is_pdftohtml = self.is_pdftohtml(html)
- if self.is_baen(html):
- rules = []
- elif self.is_book_designer(html):
- rules = self.BOOK_DESIGNER
- elif is_pdftohtml:
- rules = self.PDFTOHTML
- else:
- rules = []
+ separator = ''
+ if isChar(self.rtfData[i], ' '):
+ separator = ' '
- start_rules = []
- if is_pdftohtml:
- # Remove non breaking spaces
- start_rules.append((re.compile(ur'\u00a0'), lambda match : ' '))
+ controlWord = self.rtfData[tokenStart: tokenEnd]
+ if tokenEnd < i:
+ value = int(self.rtfData[tokenEnd: i])
+ if isString(controlWord, "\\bin"):
+ i = i + value
+ self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
+ else:
+ self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
+ else:
+ self.tokens.append(tokenControlWord(controlWord, separator))
+ #space delimiter, we should discard it
+ if self.rtfData[i] == ' ':
+ i = i + 1
- if not getattr(self.extra_opts, 'keep_ligatures', False):
- html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
+ #Control Symbol
+ else:
+ self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
+ i = i + 1
+ continue
- end_rules = []
- if getattr(self.extra_opts, 'remove_header', None):
- try:
- rules.insert(0,
- (re.compile(self.extra_opts.header_regex), lambda match : '')
- )
- except:
- import traceback
- print 'Failed to parse remove_header regexp'
- traceback.print_exc()
+ if lastDataStart < 0:
+ lastDataStart = i
+ i = i + 1
- if getattr(self.extra_opts, 'remove_footer', None):
- try:
- rules.insert(0,
- (re.compile(self.extra_opts.footer_regex), lambda match : '')
- )
- except:
- import traceback
- print 'Failed to parse remove_footer regexp'
- traceback.print_exc()
+ def toRTF(self):
+ result = []
+ for token in self.tokens:
+ result.append(token.toRTF())
+ return "".join(result)
- # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
- if getattr(self.extra_opts, 'preprocess_html', None):
- if is_pdftohtml:
- end_rules.append(
- (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
- )
- if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
- length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
- if length:
- print "The pdf line length returned is " + str(length)
- end_rules.append(
- # Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
- )
+if __name__ == "__main__":
+ import sys
+ if len(sys.argv) < 2:
+ print ("Usage %prog rtfFileToConvert")
+ sys.exit()
+ f = open(sys.argv[1], 'rb')
+ data = f.read()
+ f.close()
- for rule in self.PREPROCESS + start_rules:
- html = rule[0].sub(rule[1], html)
+ tokenizer = RtfTokenizer(data)
+ parsedTokens = RtfTokenParser(tokenizer.tokens)
- if get_preprocess_html:
- return html
+ data = parsedTokens.toRTF()
- def dump(raw, where):
- import os
- dp = getattr(self.extra_opts, 'debug_pipeline', None)
- if dp and os.path.exists(dp):
- odir = os.path.join(dp, 'input')
- if os.path.exists(odir):
- odir = os.path.join(odir, where)
- if not os.path.exists(odir):
- os.makedirs(odir)
- name, i = None, 0
- while not name or os.path.exists(os.path.join(odir, name)):
- i += 1
- name = '%04d.html'%i
- with open(os.path.join(odir, name), 'wb') as f:
- f.write(raw.encode('utf-8'))
+ f = open(sys.argv[1], 'w')
+ f.write(data)
+ f.close()
- #dump(html, 'pre-preprocess')
-
- for rule in rules + end_rules:
- html = rule[0].sub(rule[1], html)
-
- #dump(html, 'post-preprocess')
-
- # Handle broken XHTML w/ SVG (ugh)
- if 'svg:' in html and SVG_NS not in html:
- html = html.replace(
- '<html', '<html xmlns:svg="%s"' % SVG_NS)

From: John Schember
Date: Sat, 11 Sep 2010 08:39:40 -0400
Subject: [PATCH 15/43] PDF Input: Fix bug #6734, add additional matching for
unicode characters.
---
src/calibre/ebooks/conversion/preprocess.py | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index f7b803974f..256bcce6fc 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -166,6 +166,17 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+ # ` with letter before
+ (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
+ (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
+ (re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
+ (re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'),
+ (re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'),
+ (re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'),
+ (re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
+ (re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
+ (re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'),
+ (re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'),
# ´
(re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
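
[Editor's aside illustrating the fix: pdftohtml can emit the base letter before
the accent glyph, optionally split by a line break, so both orders need rules.
Input shapes are assumed from the bug report:]

    import re
    # existing rule: accent first, then letter
    print(re.sub(u'`\\s*(<br.*?>)*\\s*e', u'\xe8', u'`<br>e'))  # è
    # new reversed rule: letter first, then accent
    print(re.sub(u'e\\s*(<br.*?>)*\\s*`', u'\xe8', u'e<br>`'))  # è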
From c4071a245d256642568aa8fc827a8e8516f0df98 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sat, 11 Sep 2010 13:40:27 +0100
Subject: [PATCH 16/43] Fix library sorting problem introduced by calling
model.refresh() in the device connection sequence.
---
src/calibre/gui2/library/models.py | 30 ++++++++++++++----------------
1 file changed, 14 insertions(+), 16 deletions(-)
diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index bb47508531..8ad0cd6818 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -121,10 +121,8 @@ class BooksModel(QAbstractTableModel): # {{{
def set_device_connected(self, is_connected):
self.device_connected = is_connected
self.db.refresh_ondevice()
- self.refresh()
+ self.refresh() # does a resort()
self.research()
- if is_connected and self.sorted_on[0] == 'ondevice':
- self.resort()
def set_book_on_device_func(self, func):
self.book_on_device = func
@@ -249,7 +247,7 @@ class BooksModel(QAbstractTableModel): # {{{
# the search and count records for restrictions
self.searched.emit(True)
- def sort(self, col, order, reset=True):
+ def sort(self, col, order, reset=True, update_history=True):
if not self.db:
return
self.about_to_be_sorted.emit(self.db.id)
@@ -260,23 +258,23 @@ class BooksModel(QAbstractTableModel): # {{{
self.clear_caches()
self.reset()
self.sorted_on = (label, order)
- self.sort_history.insert(0, self.sorted_on)
+ if update_history:
+ self.sort_history.insert(0, self.sorted_on)
self.sorting_done.emit(self.db.index)
def refresh(self, reset=True):
- try:
- col = self.column_map.index(self.sorted_on[0])
- except:
- col = 0
self.db.refresh(field=None)
- self.sort(col, self.sorted_on[1], reset=reset)
+ self.resort(reset=reset)
- def resort(self, reset=True):
- try:
- col = self.column_map.index(self.sorted_on[0])
- except ValueError:
- col = 0
- self.sort(col, self.sorted_on[1], reset=reset)
+ def resort(self, reset=True, history=5): # Bug report needed history=4 :)
+ for col,ord in reversed(self.sort_history[:history]):
+ try:
+ col = self.column_map.index(col)
+ except ValueError:
+ col = 0
+ self.sort(col, ord, reset=False, update_history=False)
+ if reset:
+ self.reset()
def research(self, reset=True):
self.search(self.last_search, reset=reset)
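
[Editor's note: the resort loop above relies on Python's sort being stable —
replaying the history from oldest to newest leaves the newest key primary.
A toy model with assumed data, not the calibre classes:]

    rows = [('B', 2), ('A', 2), ('B', 1), ('A', 1)]
    history = [(0, True), (1, False)]  # newest first: col 0 asc, then col 1 desc
    for col, ascending in reversed(history):
        rows.sort(key=lambda r: r[col], reverse=not ascending)
    assert rows == [('A', 2), ('A', 1), ('B', 2), ('B', 1)]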
From 96478da323e642febb94c2c1a2c9826a6b3dddb7 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 11 Sep 2010 08:48:47 -0400
Subject: [PATCH 17/43] PLM Input: Fix cleanup code.
---
src/calibre/ebooks/pml/pmlconverter.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index 166695ff5c..3a4454725a 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -216,7 +216,7 @@ class PML_HTMLizer(object):
html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
else:
html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
- html = re.sub(r'\s*<br \/>', '', html)
+ html = re.sub(r'(?imu)\s*<br \/>', '', html)
return html
def start_line(self):
From dc7bc5dd5d890278d7f43377e9df944675888fc6 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 11 Sep 2010 09:01:34 -0400
Subject: [PATCH 18/43] PML Input: Fix bug #6770, put toc link after header so
toc link goes to correct page.
---
src/calibre/ebooks/pml/pmlconverter.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index 3a4454725a..6e479a71ef 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -556,7 +556,7 @@ class PML_HTMLizer(object):
text = t
else:
self.toc.add_item(os.path.basename(self.file_name), id, value)
- text = '<a id="%s"></a>%s' % (id, t)
+ text = '%s<a id="%s"></a>' % (t, id)
elif c == 'm':
empty = False
src = self.code_value(line)
From c2b3c445e17a38b5599393c943036c6c448886da Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 11 Sep 2010 09:09:08 -0400
Subject: [PATCH 19/43] PML Input: Remove empty lines.
---
src/calibre/ebooks/pml/pmlconverter.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index 6e479a71ef..b0fc15197a 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -207,6 +207,7 @@ class PML_HTMLizer(object):
while html != old:
old = html
html = self.cleanup_html_remove_redundant(html)
+ html = re.sub(r'(?imu)^\s*', '', html)
return html
def cleanup_html_remove_redundant(self, html):
From ef8408869cebac380474deb971c4b6910680c895 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 11 Sep 2010 09:13:23 -0400
Subject: [PATCH 20/43] TXT Output: preserve spaces, handle tab character
 correctly. &nbsp; is reduced to a single space by many renderers.
---
src/calibre/ebooks/txt/processor.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index a12e8a0761..dac1e34df7 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt):
def preserve_spaces(txt):
txt = txt.replace('  ', ' &nbsp;')
- txt = txt.replace('\t', '    ')
+ txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
return txt
def opf_writer(path, opf_name, manifest, spine, mi):
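
[Editor's aside on the rendering assumption behind the fix: runs of plain spaces
are collapsed by HTML-based renderers, while &nbsp; entities survive. Toy input:]

    txt = 'a\tb'
    print(txt.replace('\t', '    '))                      # plain spaces: later collapsed
    print(txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;'))  # entities: indent preserved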
From a58aa5f0e5f455defefe94c10f372d33763e9b75 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sat, 11 Sep 2010 15:37:11 +0100
Subject: [PATCH 21/43] Fix bug reported in forum:
http://www.mobileread.com/forums/showthread.php?t=98242
cache.refresh still used a parameter when calling search that was removed some releases ago.
---
src/calibre/library/caches.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index b9c1211c7f..2096180f3c 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -549,7 +549,7 @@ class ResultCache(SearchQueryParser):
self.sort(field, ascending)
self._map_filtered = list(self._map)
if self.search_restriction:
- self.search('', return_matches=False, ignore_search_restriction=False)
+ self.search('', return_matches=False)
def seriescmp(self, sidx, siidx, x, y, library_order=None):
try:
From 3766f34aab8b6ae8b78570fb51d17bd92edc39a7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 11 Sep 2010 11:54:54 -0600
Subject: [PATCH 22/43] Fix regression in filename shortening that caused loss
of filename extension
---
src/calibre/utils/filenames.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/src/calibre/utils/filenames.py b/src/calibre/utils/filenames.py
index 9fd57ab53c..47ccbe73c2 100644
--- a/src/calibre/utils/filenames.py
+++ b/src/calibre/utils/filenames.py
@@ -54,10 +54,8 @@ def shorten_components_to(length, components):
r = x[0] if x is components[-1] else ''
else:
if x is components[-1]:
- b, _, e = x.rpartition('.')
- if not b and e:
- b = e
- e = ''
+ b, e = os.path.splitext(x)
+ if e == '.': e = ''
r = b[:-delta]+e
if r.startswith('.'): r = x[0]+r
else:
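
[Editor's aside: why the change matters. rpartition('.') drops the separator, so
the old code could rebuild the name without the dot; os.path.splitext keeps the
extension intact. Illustrative values:]

    import os.path
    print('story.epub'.rpartition('.'))    # ('story', '.', 'epub') - dot lost on rebuild
    print(os.path.splitext('story.epub'))  # ('story', '.epub')
    print(os.path.splitext('.bashrc'))     # ('.bashrc', '') - dotfiles handled too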
From 6eaa75527b5754cfbb8df833ad3375b724d51cfd Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sat, 11 Sep 2010 21:01:26 +0100
Subject: [PATCH 23/43] resort maximum_resort_levels tweak implemented
---
resources/default_tweaks.py | 7 +++++++
src/calibre/gui2/library/models.py | 4 ++--
2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py
index 66ee4d1471..9d9bc7651c 100644
--- a/resources/default_tweaks.py
+++ b/resources/default_tweaks.py
@@ -114,3 +114,10 @@ add_new_book_tags_when_importing_books = False
# Set the maximum number of tags to show per book in the content server
max_content_server_tags_shown=5
+
+# Set the maximum number of sort 'levels' that calibre will use to resort the
+# library after certain operations such as searches or device insertion. Each
+# sort level adds a performance penalty. If the database is large (thousands of
+# books) the penalty might be noticeable. If you are not concerned about multi-
+# level sorts, and if you are seeing a slowdown, reduce the value of this tweak.
+maximum_resort_levels = 5
\ No newline at end of file
diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index 8ad0cd6818..d2f38cc0a1 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -266,8 +266,8 @@ class BooksModel(QAbstractTableModel): # {{{
self.db.refresh(field=None)
self.resort(reset=reset)
- def resort(self, reset=True, history=5): # Bug report needed history=4 :)
- for col,ord in reversed(self.sort_history[:history]):
+ def resort(self, reset=True):
+ for col,ord in reversed(self.sort_history[:tweaks['maximum_resort_levels']]):
try:
col = self.column_map.index(col)
except ValueError:
From 721e61ef2a1fd090566e232ff9ca65e37400fe44 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sat, 11 Sep 2010 21:05:05 +0100
Subject: [PATCH 24/43] Clean up tweaks.py formatting (add blank lines)
---
resources/default_tweaks.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py
index 9d9bc7651c..71bf2c6c37 100644
--- a/resources/default_tweaks.py
+++ b/resources/default_tweaks.py
@@ -120,4 +120,5 @@ max_content_server_tags_shown=5
# sort level adds a performance penalty. If the database is large (thousands of
# books) the penalty might be noticeable. If you are not concerned about multi-
# level sorts, and if you are seeing a slowdown, reduce the value of this tweak.
-maximum_resort_levels = 5
\ No newline at end of file
+maximum_resort_levels = 5
+
From e531b517670e90cf99b8255fd47775e50450d7d1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 11 Sep 2010 16:16:57 -0600
Subject: [PATCH 25/43] Code organization
---
src/calibre/library/caches.py | 48 ++++++++++++++++-----------
src/calibre/library/field_metadata.py | 5 ++-
2 files changed, 33 insertions(+), 20 deletions(-)
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index 2096180f3c..eb0ceb3fe4 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -141,6 +141,8 @@ class ResultCache(SearchQueryParser):
for x in self.iterall():
yield x[idx]
+ # Search functions {{{
+
def universal_set(self):
return set([i[0] for i in self._data if i is not None])
@@ -462,6 +464,30 @@ class ResultCache(SearchQueryParser):
continue
return matches
+ def search(self, query, return_matches=False):
+ ans = self.search_getting_ids(query, self.search_restriction)
+ if return_matches:
+ return ans
+ self._map_filtered = ans
+
+ def search_getting_ids(self, query, search_restriction):
+ q = ''
+ if not query or not query.strip():
+ q = search_restriction
+ else:
+ q = query
+ if search_restriction:
+ q = u'%s (%s)' % (search_restriction, query)
+ if not q:
+ return list(self._map)
+ matches = sorted(self.parse(q))
+ return [id for id in self._map if id in matches]
+
+ def set_search_restriction(self, s):
+ self.search_restriction = s
+
+ # }}}
+
def remove(self, id):
self._data[id] = None
if id in self._map:
@@ -551,6 +577,8 @@ class ResultCache(SearchQueryParser):
if self.search_restriction:
self.search('', return_matches=False)
+ # Sorting functions {{{
+
def seriescmp(self, sidx, siidx, x, y, library_order=None):
try:
if library_order:
@@ -615,24 +643,6 @@ class ResultCache(SearchQueryParser):
self._map.sort(cmp=fcmp, reverse=not ascending)
self._map_filtered = [id for id in self._map if id in self._map_filtered]
- def search(self, query, return_matches=False):
- ans = self.search_getting_ids(query, self.search_restriction)
- if return_matches:
- return ans
- self._map_filtered = ans
+ # }}}
- def search_getting_ids(self, query, search_restriction):
- q = ''
- if not query or not query.strip():
- q = search_restriction
- else:
- q = query
- if search_restriction:
- q = u'%s (%s)' % (search_restriction, query)
- if not q:
- return list(self._map)
- matches = sorted(self.parse(q))
- return [id for id in self._map if id in matches]
- def set_search_restriction(self, s):
- self.search_restriction = s
diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py
index 66cdee51f0..096dfa66fe 100644
--- a/src/calibre/library/field_metadata.py
+++ b/src/calibre/library/field_metadata.py
@@ -69,6 +69,8 @@ class FieldMetadata(dict):
VALID_DATA_TYPES = frozenset([None, 'rating', 'text', 'comments', 'datetime',
'int', 'float', 'bool', 'series'])
+ # Builtin metadata {{{
+
_field_metadata = [
('authors', {'table':'authors',
'column':'name',
@@ -287,7 +289,8 @@ class FieldMetadata(dict):
'search_terms':[],
'is_custom':False,
'is_category':False}),
- ]
+ ]
+ # }}}
# search labels that are not db columns
search_items = [ 'all',
From 7382552d18d604dff3b5472195fa9f3c07b0186c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 11 Sep 2010 19:11:30 -0600
Subject: [PATCH 26/43] Much faster sorting code
---
src/calibre/library/caches.py | 178 ++++++++++++++++++++++++++++++++--
1 file changed, 171 insertions(+), 7 deletions(-)
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index eb0ceb3fe4..59d5b45d5f 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -607,16 +607,22 @@ class ResultCache(SearchQueryParser):
y = UNDEFINED_DATE
return cmp(x, y)
if subsort and ans == 0:
- return cmp(self._data[x][11].lower(), self._data[y][11].lower())
+ idx = self.FIELD_MAP['sort']
+ return cmp(self._data[x][idx].lower(), self._data[y][idx].lower())
return ans
- def sort(self, field, ascending, subsort=False):
+ def sanitize_field_name(self, field):
field = field.lower().strip()
- if field in ('author', 'tag', 'comment'):
- field += 's'
- if field == 'date': field = 'timestamp'
- elif field == 'title': field = 'sort'
- elif field == 'authors': field = 'author_sort'
+ if field not in self.field_metadata.iterkeys():
+ if field in ('author', 'tag', 'comment'):
+ field += 's'
+ if field == 'date': field = 'timestamp'
+ elif field == 'title': field = 'sort'
+ elif field == 'authors': field = 'author_sort'
+ return field
+
+ def sort(self, field, ascending, subsort=False):
+ field = self.sanitize_field_name(field)
as_string = field not in ('size', 'rating', 'timestamp')
if self.first_sort:
@@ -643,6 +649,164 @@ class ResultCache(SearchQueryParser):
self._map.sort(cmp=fcmp, reverse=not ascending)
self._map_filtered = [id for id in self._map if id in self._map_filtered]
+ def multisort(self, fields=[], subsort=False):
+ fields = [(self.sanitize_field_name(x), bool(y)) for x, y in fields]
+ if subsort and 'sort' not in [x[0] for x in fields]:
+ fields += [('sort', True)]
+ if not fields:
+ fields = [('timestamp', False)]
+ keys = self.field_metadata.keys()
+ for f, order in fields:
+ if f not in keys:
+ raise ValueError(f + ' not an existing field name')
+
+ keyg = SortKeyGenerator(fields, self.field_metadata, self._data)
+ if len(fields) == 1:
+ self._map.sort(key=keyg, reverse=not fields[0][1])
+ else:
+ self._map.sort(key=keyg)
+ self._map_filtered = [id for id in self._map if id in self._map_filtered]
+
+
+class SortKey(object):
+
+ def __init__(self, orders, values):
+ self.orders, self.values = orders, values
+
+ def __cmp__(self, other):
+ for i, ascending in enumerate(self.orders):
+ ans = cmp(self.values[i], other.values[i])
+ if ans != 0:
+ if not ascending:
+ ans *= -1
+ return ans
+ return 0
+
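
[Editor's aside, toy values rather than calibre data: SortKey lets a single
key= callable express a mixed ascending/descending multi-field sort through
the Python 2 __cmp__ protocol:]

    orders = (True, False)  # field 0 ascending, field 1 descending
    rows = [('b', 1), ('a', 1), ('a', 2)]
    print(sorted(rows, key=lambda r: SortKey(orders, r)))
    # [('a', 2), ('a', 1), ('b', 1)]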
+class SortKeyGenerator(object):
+
+ def __init__(self, fields, field_metadata, data):
+ self.field_metadata = field_metadata
+ self.orders = [x[1] for x in fields]
+ self.entries = [(x[0], field_metadata[x[0]]) for x in fields]
+ self.library_order = tweaks['title_series_sorting'] == 'library_order'
+ self.data = data
+
+ def __call__(self, record):
+ values = tuple(self.itervals(self.data[record]))
+ if len(values) == 1:
+ return values[0]
+ return SortKey(self.orders, values)
+
+ def itervals(self, record):
+ for name, fm in self.entries:
+ dt = fm['datatype']
+ val = record[fm['rec_index']]
+
+ if dt == 'datetime':
+ if val is None:
+ val = UNDEFINED_DATE
+
+ elif dt == 'series':
+ if val is None:
+ val = ('', 1)
+ else:
+ val = val.lower()
+ if self.library_order:
+ val = title_sort(val)
+ sidx_fm = self.field_metadata[name + '_index']
+ sidx = record[sidx_fm['rec_index']]
+ val = (val, sidx)
+
+ elif dt in ('text', 'comments'):
+ if val is None:
+ val = ''
+ val = val.lower()
+ yield val
+
# }}}
+if __name__ == '__main__':
+ # Testing.timing for new multi-sort {{{
+ import time
+
+ from calibre.library import db
+ db = db()
+
+ db.refresh()
+
+ fields = db.field_metadata.keys()
+
+ print fields
+
+
+ def do_single_sort(meth, field, order):
+ if meth == 'old':
+ db.data.sort(field, order)
+ else:
+ db.data.multisort([(field, order)])
+
+ def test_single_sort(field):
+ for meth in ('old', 'new'):
+ ttime = 0
+ NUM = 10
+ asc = desc = None
+ for i in range(NUM):
+ db.data.sort('id', False)
+ st = time.time()
+ do_single_sort(meth, field, True)
+ asc = db.data._map
+ do_single_sort(meth, field, False)
+ desc = db.data._map
+ ttime += time.time() - st
+ yield (ttime/NUM, asc, desc)
+
+
+ print 'Running single sort differentials'
+ for field in fields:
+ if field in ('search', 'id', 'news', 'flags'): continue
+ print '\t', field
+ old, new = test_single_sort(field)
+ if old[1] != new[1] or old[2] != new[2]:
+ print '\t\t', 'Sort failure!'
+ raise SystemExit(1)
+ print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0])
+
+ def do_multi_sort(meth, ms):
+ if meth == 'new':
+ db.data.multisort(ms)
+ else:
+ for s in reversed(ms):
+ db.data.sort(*s)
+
+ def test_multi_sort(ms):
+ for meth in ('old', 'new'):
+ ttime = 0
+ NUM = 10
+ for i in range(NUM):
+ db.data.sort('id', False)
+ st = time.time()
+ do_multi_sort(meth, ms)
+ ttime += time.time() - st
+ yield (ttime/NUM, db.data._map)
+
+ print 'Running multi-sort differentials'
+
+ for ms in [
+ [('timestamp', False), ('author', True), ('title', False)],
+ [('size', True), ('tags', True), ('author', False)],
+ [('series', False), ('title', True)],
+ [('size', True), ('tags', True), ('author', False), ('pubdate',
+ True), ('tags', False), ('formats', False), ('uuid', True)],
+
+ ]:
+ print '\t', ms
+ db.data.sort('id', False)
+ old, new = test_multi_sort(ms)
+ if old[1] != new[1]:
+ print '\t\t', 'Sort failure!'
+ raise SystemExit()
+ print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0])
+
+ # }}}
+
From 9a06996b16486a3511e4055535a6be48f484a90a Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 12 Sep 2010 11:17:49 +1000
Subject: [PATCH 27/43] minor tweaks to preprocessing, backed out reflow change
---
src/calibre/ebooks/conversion/preprocess.py | 4 +--
src/calibre/ebooks/conversion/utils.py | 36 +++++++++++----------
src/calibre/ebooks/pdf/reflow.py | 4 ---
3 files changed, 21 insertions(+), 23 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 6123577191..46308b2ea0 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -319,8 +319,8 @@ class HTMLPreProcessor(object):
# unwrap hyphenation - moved here so it's executed after header/footer removal
if is_pdftohtml:
- # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these
- # hyphens are for compound words, formatting, etc
+ # unwrap visible dashes and hyphens - don't delete them; they are often hyphens
+ # for compound words, formatting, etc
end_rules.append((re.compile(u'(?<=[-–—])\s*<br.*?>\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[\xad](\s*<br.*?>)+\s*(?=[[a-z\d])'), lambda match: ''))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 68cebb3a11..fb683bdb12 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -29,16 +29,12 @@ class PreProcessor(object):
self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
- def chapter_link(self, match):
- chap = match.group('sectionlink')
- if not chap:
- self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
- return '<br style="page-break-before:always">'
- else:
- self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
- return '<br style="page-break-before:always">\n<h2>'+chap+'</h2>'
+ def chapter_break(self, match):
+ chap = match.group('section')
+ styles = match.group('styles')
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
+ return '<'+styles+' style="page-break-before:always">'+chap
def no_markup(self, raw, percent):
'''
@@ -74,7 +70,7 @@ class PreProcessor(object):
html = re.sub(r"\s*]*>\s*", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
- linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
+ linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
blanklines = blankreg.findall(html)
lines = linereg.findall(html)
@@ -100,8 +96,13 @@ class PreProcessor(object):
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
- chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
+ chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(self.chapter_head, html)
+
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
+ chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
#
# Unwrap lines using punctation if the median length of all lines is less than 200
@@ -110,13 +111,14 @@ class PreProcessor(object):
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
if length < 200:
self.log("Unwrapping Lines")
- html = unwrap.sub(' ', html)
+ html = unwrap.sub(' ', html)
+
# If still no sections after unwrapping lines break on lines with no punctuation
if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+ self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation")
#self.log(html)
- chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
- html = chapdetect3.sub(self.chapter_head, html)
+ chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+ html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 36848ddb8b..584d631d0b 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -408,10 +408,6 @@ class Page(object):
# Fraction of text height that two strings' bottoms can differ by
# for them to be considered to be part of the same text fragment
LINE_FACTOR = 0.4
-
- # Percentage of the page height which should be considered header
- # or footer to be discarded from reflow considerations
- HEAD_FOOTER_MARGIN
# Multiplies the average line height when determining row height
# of a particular element to detect columns.
From bcd0430791f44ec926910eeb8bb18d7cbbff5fc9 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 12 Sep 2010 13:37:28 +0100
Subject: [PATCH 28/43] Starting from Kovid's multisort: 1) change
_map_filtered to an ordered dict to make 'in' operations much faster 2) add a
method to field_metadata to return a dict of database fields. 3) fix a couple
of places where field_metadata needed to be used. 4) make changes so
gui2.library.models.resort uses multisort
---
src/calibre/gui2/library/models.py | 14 +++----
src/calibre/library/caches.py | 59 ++++++++++++++++-----------
src/calibre/library/database2.py | 1 +
src/calibre/library/field_metadata.py | 3 ++
4 files changed, 45 insertions(+), 32 deletions(-)
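
[Editor's aside on the motivation: membership tests drive the post-sort filter
rebuild, and 'id in list' is O(n) per lookup while 'id in dict' is O(1) on
average. Toy micro-benchmark, not calibre code:]

    from timeit import timeit
    ids = list(range(20000))
    as_list, as_dict = ids, dict.fromkeys(ids, True)
    print(timeit(lambda: 19999 in as_list, number=1000))  # linear scan
    print(timeit(lambda: 19999 in as_dict, number=1000))  # hash lookup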
diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index d2f38cc0a1..d18516493a 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -247,7 +247,7 @@ class BooksModel(QAbstractTableModel): # {{{
# the search and count records for restrictions
self.searched.emit(True)
- def sort(self, col, order, reset=True, update_history=True):
+ def sort(self, col, order, reset=True):
if not self.db:
return
self.about_to_be_sorted.emit(self.db.id)
@@ -258,8 +258,7 @@ class BooksModel(QAbstractTableModel): # {{{
self.clear_caches()
self.reset()
self.sorted_on = (label, order)
- if update_history:
- self.sort_history.insert(0, self.sorted_on)
+ self.sort_history.insert(0, self.sorted_on)
self.sorting_done.emit(self.db.index)
def refresh(self, reset=True):
@@ -267,12 +266,9 @@ class BooksModel(QAbstractTableModel): # {{{
self.resort(reset=reset)
def resort(self, reset=True):
- for col,ord in reversed(self.sort_history[:tweaks['maximum_resort_levels']]):
- try:
- col = self.column_map.index(col)
- except ValueError:
- col = 0
- self.sort(col, ord, reset=False, update_history=False)
+ if not self.db:
+ return
+ self.db.multisort(self.sort_history[:tweaks['maximum_resort_levels']])
if reset:
self.reset()
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index 59d5b45d5f..c342d5ff15 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -20,6 +20,7 @@ from calibre.utils.search_query_parser import SearchQueryParser
from calibre.utils.pyparsing import ParseException
from calibre.ebooks.metadata import title_sort
from calibre import fit_image
+from calibre.utils.ordered_dict import OrderedDict
class CoverCache(Thread):
@@ -112,7 +113,8 @@ class ResultCache(SearchQueryParser):
'''
def __init__(self, FIELD_MAP, field_metadata):
self.FIELD_MAP = FIELD_MAP
- self._map = self._map_filtered = self._data = []
+ self._map = self._data = []
+ self._map_filtered = OrderedDict()
self.first_sort = True
self.search_restriction = ''
self.field_metadata = field_metadata
@@ -122,14 +124,14 @@ class ResultCache(SearchQueryParser):
self.build_numeric_relop_dict()
def __getitem__(self, row):
- return self._data[self._map_filtered[row]]
+ return self._data[self._map_filtered.keys()[row]]
def __len__(self):
return len(self._map_filtered)
def __iter__(self):
for id in self._map_filtered:
- yield self._data[id]
+ yield id
def iterall(self):
for x in self._data:
@@ -468,7 +470,7 @@ class ResultCache(SearchQueryParser):
ans = self.search_getting_ids(query, self.search_restriction)
if return_matches:
return ans
- self._map_filtered = ans
+ self._map_filtered = OrderedDict.fromkeys(ans, True)
def search_getting_ids(self, query, search_restriction):
q = ''
@@ -480,7 +482,7 @@ class ResultCache(SearchQueryParser):
q = u'%s (%s)' % (search_restriction, query)
if not q:
return list(self._map)
- matches = sorted(self.parse(q))
+ matches = self.parse(q)
return [id for id in self._map if id in matches]
def set_search_restriction(self, s):
@@ -493,18 +495,18 @@ class ResultCache(SearchQueryParser):
if id in self._map:
self._map.remove(id)
if id in self._map_filtered:
- self._map_filtered.remove(id)
+ del self._map_filtered[id]
def set(self, row, col, val, row_is_id=False):
- id = row if row_is_id else self._map_filtered[row]
+ id = row if row_is_id else self._map_filtered.keys()[row]
self._data[id][col] = val
def get(self, row, col, row_is_id=False):
- id = row if row_is_id else self._map_filtered[row]
+ id = row if row_is_id else self._map_filtered.keys()[row]
return self._data[id][col]
def index(self, id, cache=False):
- x = self._map if cache else self._map_filtered
+ x = self._map if cache else self._map_filtered.keys()
return x.index(id)
def row(self, id):
@@ -544,13 +546,18 @@ class ResultCache(SearchQueryParser):
self._data[id].append(db.has_cover(id, index_is_id=True))
self._data[id].append(db.book_on_device_string(id))
self._map[0:0] = ids
- self._map_filtered[0:0] = ids
+ mf = OrderedDict()
+ for id in ids:
+ mf[id] = True
+ for id in self._map_filtered:
+ mf[id] = True
+ self._map_filtered = mf
def books_deleted(self, ids):
for id in ids:
self._data[id] = None
if id in self._map: self._map.remove(id)
- if id in self._map_filtered: self._map_filtered.remove(id)
+ if id in self._map_filtered: del self._map_filtered[id]
def count(self):
return len(self._map)
@@ -573,7 +580,7 @@ class ResultCache(SearchQueryParser):
self._map = [i[0] for i in self._data if i is not None]
if field is not None:
self.sort(field, ascending)
- self._map_filtered = list(self._map)
+ self._map_filtered = OrderedDict.fromkeys(self._map, True)
if self.search_restriction:
self.search('', return_matches=False)
@@ -644,10 +651,14 @@ class ResultCache(SearchQueryParser):
self.FIELD_MAP['series_index'],
library_order=tweaks['title_series_sorting'] == 'library_order')
else:
- fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
+ fcmp = functools.partial(self.cmp, self.field_metadata[field]['rec_index'],
subsort=subsort, asstr=as_string)
self._map.sort(cmp=fcmp, reverse=not ascending)
- self._map_filtered = [id for id in self._map if id in self._map_filtered]
+ mf = OrderedDict()
+ for id in self._map:
+ if id in self._map_filtered:
+ mf[id] = True
+ self._map_filtered = mf
def multisort(self, fields=[], subsort=False):
fields = [(self.sanitize_field_name(x), bool(y)) for x, y in fields]
@@ -655,7 +666,7 @@ class ResultCache(SearchQueryParser):
fields += [('sort', True)]
if not fields:
fields = [('timestamp', False)]
- keys = self.field_metadata.keys()
+ keys = self.field_metadata.field_keys()
for f, order in fields:
if f not in keys:
raise ValueError(f + ' not an existing field name')
@@ -665,7 +676,11 @@ class ResultCache(SearchQueryParser):
self._map.sort(key=keyg, reverse=not fields[0][1])
else:
self._map.sort(key=keyg)
- self._map_filtered = [id for id in self._map if id in self._map_filtered]
+ mf = OrderedDict()
+ for id in self._map:
+ if id in self._map_filtered:
+ mf[id] = id
+ self._map_filtered = mf
class SortKey(object):
@@ -677,16 +692,14 @@ class SortKey(object):
for i, ascending in enumerate(self.orders):
ans = cmp(self.values[i], other.values[i])
if ans != 0:
- if not ascending:
- ans *= -1
- return ans
+ return ans * ascending
return 0
class SortKeyGenerator(object):
def __init__(self, fields, field_metadata, data):
self.field_metadata = field_metadata
- self.orders = [x[1] for x in fields]
+ self.orders = [-1 if x[1] else 1 for x in fields]
self.entries = [(x[0], field_metadata[x[0]]) for x in fields]
self.library_order = tweaks['title_series_sorting'] == 'library_order'
self.data = data
@@ -735,7 +748,7 @@ if __name__ == '__main__':
db.refresh()
- fields = db.field_metadata.keys()
+ fields = db.field_metadata.field_keys()
print fields
@@ -765,7 +778,7 @@ if __name__ == '__main__':
print 'Running single sort differentials'
for field in fields:
if field in ('search', 'id', 'news', 'flags'): continue
- print '\t', field
+ print '\t', field, db.field_metadata[field]['datatype']
old, new = test_single_sort(field)
if old[1] != new[1] or old[2] != new[2]:
print '\t\t', 'Sort failure!'
@@ -797,7 +810,7 @@ if __name__ == '__main__':
[('size', True), ('tags', True), ('author', False)],
[('series', False), ('title', True)],
[('size', True), ('tags', True), ('author', False), ('pubdate',
- True), ('tags', False), ('formats', False), ('uuid', True)],
+ True), ('series', False), ('formats', False), ('uuid', True)],
]:
print '\t', ms
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 4106f8c965..8a5ab75c3c 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -311,6 +311,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.search_getting_ids = self.data.search_getting_ids
self.refresh = functools.partial(self.data.refresh, self)
self.sort = self.data.sort
+ self.multisort = self.data.multisort
self.index = self.data.index
self.refresh_ids = functools.partial(self.data.refresh_ids, self)
self.row = self.data.row
diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py
index 096dfa66fe..276a6ba971 100644
--- a/src/calibre/library/field_metadata.py
+++ b/src/calibre/library/field_metadata.py
@@ -335,6 +335,9 @@ class FieldMetadata(dict):
def keys(self):
return self._tb_cats.keys()
+ def field_keys(self):
+ return [k for k in self._tb_cats.keys() if self._tb_cats[k]['kind']=='field']
+
def iterkeys(self):
for key in self._tb_cats:
yield key
From 8b09f4c293e82ff797635320c42487d9be190831 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 12 Sep 2010 13:42:37 +0100
Subject: [PATCH 29/43] Restore the second 'tags' to the tests
---
src/calibre/library/caches.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index c342d5ff15..882de975db 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -810,7 +810,7 @@ if __name__ == '__main__':
[('size', True), ('tags', True), ('author', False)],
[('series', False), ('title', True)],
[('size', True), ('tags', True), ('author', False), ('pubdate',
- True), ('series', False), ('formats', False), ('uuid', True)],
+ True), ('tags', False), ('formats', False), ('uuid', True)],
]:
print '\t', ms
From 5626418d1a6993b16f3d6a83c22a761a7490b7ee Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 12 Sep 2010 14:51:21 +0100
Subject: [PATCH 30/43] Correct regression in device handling -- sorting after
sending a book.
---
src/calibre/gui2/library/models.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index d18516493a..c746a5aa56 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -1024,6 +1024,11 @@ class DeviceBooksModel(BooksModel): # {{{
if reset:
self.reset()
+ def resort(self, reset=True):
+ if self.sorted_on:
+ self.sort(self.column_map.index(self.sorted_on[0]),
+ self.sorted_on[1], reset=reset)
+
def columnCount(self, parent):
if parent and parent.isValid():
return 0
From cdb696f63bc39b9327abe809fa71e94baa6e0b86 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 13 Sep 2010 00:12:21 +1000
Subject: [PATCH 31/43] enhanced preprocessing class - looking pretty good
---
src/calibre/ebooks/conversion/preprocess.py | 18 ++--
src/calibre/ebooks/conversion/utils.py | 98 +++++++++++++++------
2 files changed, 82 insertions(+), 34 deletions(-)
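The rewritten unwrapping below keys off a median line length: when most marked-up lines are short, the source was probably hard-wrapped (PDF/OCR output) and can be unwrapped at punctuation. A rough sketch of the measurement, using a simplified regex (not the exact line_length implementation):

    import re

    def median_line_length(html, percent=0.4):
        # Text content of each <p>-delimited line, with tags removed
        lines = [re.sub(r'<[^>]*>', '', x) for x in
                 re.findall(r'(?s)<p[^>]*>.*?</p>', html)]
        lengths = sorted(set(len(l.strip()) for l in lines if l.strip()))
        return lengths[int(len(lengths) * percent)] if lengths else 0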
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 46308b2ea0..f6277956c8 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,7 +62,6 @@ def wrap_lines(match):
else:
return ital+' '
-
def line_length(format, raw, percent):
'''
raw is the raw text to find the line length to use for wrapping.
@@ -76,6 +75,8 @@ def line_length(format, raw, percent):
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
+ elif format == 'spanned_html':
+ linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
@@ -223,14 +224,15 @@ class HTMLPreProcessor(object):
# Remove page links
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
# Remove <hr> tags
- (re.compile(r'<hr>', re.IGNORECASE), lambda match: '<br>'),
+ (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'),
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
- (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?'), chap_head),
+ (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+ # Cover the case where every letter in a chapter title is separated by a space
+ (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
# Have paragraphs show better
(re.compile(r'<br.*?>'), lambda match : '<p>'),
@@ -238,8 +240,7 @@ class HTMLPreProcessor(object):
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
- (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
-
+ (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
]
# Fix Book Designer markup
@@ -327,10 +328,11 @@ class HTMLPreProcessor(object):
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
- # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+ # Make the more aggressive chapter marking regex optional with the preprocess option to
+ # reduce false positives and move after header/footer removal
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
- end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head))
+ end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*</p>\s*(?P<title><p>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*</p>)?'), chap_head),)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index fb683bdb12..abfa43e7ed 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -8,10 +8,10 @@ __docformat__ = 'restructuredtext en'
import re
from calibre.ebooks.conversion.preprocess import line_length
from calibre.utils.logging import default_log
-from lxml import etree
class PreProcessor(object):
html_preprocess_sections = 0
+ found_indents = 0
def __init__(self, args):
self.args = args
@@ -22,11 +22,11 @@ class PreProcessor(object):
title = match.group('title')
if not title:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+ self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
return '<h2>'+chap+'</h2>\n'
else:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+ self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
def chapter_break(self, match):
@@ -35,7 +35,22 @@ class PreProcessor(object):
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
return '<'+styles+' style="page-break-before:always">'+chap
-
+
+ def insert_indent(self, match):
+ pstyle = match.group('formatting')
+ span = match.group('span')
+ self.found_indents = self.found_indents + 1
+ if pstyle:
+ if not span:
+ return '<p '+pstyle+' style="text-indent:3%">'
+ else:
+ return '<p '+pstyle+' style="text-indent:3%">'+span
+ else:
+ if not span:
+ return '<p style="text-indent:3%">'
+ else:
+ return '<p style="text-indent:3%">'+span
+
def no_markup(self, raw, percent):
'''
Detects total marked up line endings in the file. raw is the text to
@@ -48,7 +63,7 @@ class PreProcessor(object):
line_end = line_end_ere.findall(raw)
tot_htm_ends = len(htm_end)
tot_ln_fds = len(line_end)
- self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+ self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
if percent > 1:
percent = 1
@@ -56,13 +71,18 @@ class PreProcessor(object):
percent = 0
min_lns = tot_ln_fds * percent
- self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+ self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
if min_lns > tot_htm_ends:
return True
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
- # remove non-breaking spaces
+ # Replace series of non-breaking spaces with text-indent
+ txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
+ html = txtindent.sub(self.insert_indent, html)
+ if self.found_indents > 1:
+ self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+ # remove remaining non-breaking spaces
html = re.sub(ur'\u00a0', ' ', html)
# Get rid of empty <o:p> tags to simplify other processing
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
@@ -83,41 +103,67 @@ class PreProcessor(object):
html = re.sub(r"\s*
", "
\n", html)
html = re.sub(r"\s*\s*", "\n
", html)
- # some lit files don't have any <p> tags or equivalent, check and
- # mark up line endings if required before proceeding
+ # some lit files don't have any <p> tags or equivalent (generally just plain text between
+ # <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
# detect chapters/sections to match xpath or splitting logic
+ heading = re.compile(']*>', re.IGNORECASE)
+ self.html_preprocess_sections = len(heading.findall(html))
+ self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
#
- # Start with most typical chapter headings
- chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}s*(]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*((i|b|u)>){0,2})\s*()?s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
- html = chapdetect.sub(self.chapter_head, html)
+ # Start with most typical chapter headings, get more aggressive until one works
+ if self.html_preprocess_sections < 10:
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}s*(]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*([ibu]>){0,2})\s*()?s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
- html = chapdetect2.sub(self.chapter_head, html)
- #
- # Unwrap lines using punctation if the median length of all lines is less than 200
- length = line_length('html', html, 0.4)
- self.log("*** Median line length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 200:
- self.log("Unwrapping Lines")
- html = unwrap.sub(' ', html)
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(self.chapter_head, html)
- # If still no sections after unwrapping lines break on lines with no punctuation
+ # Unwrap lines
+ #
+ self.log("Unwrapping Lines")
+ # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+ # span are used for hard line breaks, p for new paragraphs. Determine which is used so
+ # that lines can be wrapped across page boundaries
+ paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
+ spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+ paras = len(paras_reg.findall(html))
+ spans = len(spans_reg.findall(html))
+ if spans > 1:
+ if float(paras) / float(spans) < 0.75:
+ format = 'spanned_html'
+ else:
+ format = 'html'
+ else:
+ format = 'html'
+
+ # Calculate Length
+ length = line_length(format, html, 0.4)
+ self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
+ #
+ # Unwrap and/or delete soft-hyphens, hyphens
+ html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+ html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+
+ # Unwrap lines using punctation if the median length of all lines is less than 200
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ html = unwrap.sub(' ', html)
+
+ # If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10:
- self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation")
+ self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
#self.log(html)
- chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<a[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</a>)?\s*</(p|div)>)', re.IGNORECASE)
+ chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<a[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</a>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
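The span-vs-p detection added in this patch reads, in isolation, roughly like this (same 0.75 threshold as the patch; simplified standalone sketch):

    import re

    def detect_line_markup(html):
        paras = len(re.findall(r'<p[^>]*>', html, re.IGNORECASE))
        spans = len(re.findall(r'<span[^>]*>', html, re.IGNORECASE))
        # OCR conversions often mark hard line breaks with spans and new
        # paragraphs with p; when spans dominate, measure lines on spans.
        if spans > 1 and float(paras) / float(spans) < 0.75:
            return 'spanned_html'
        return 'html'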
From 6cc332089a421e6100fa4937c5126309c483e132 Mon Sep 17 00:00:00 2001
From: Starson17
Date: Sun, 12 Sep 2010 11:28:24 -0400
Subject: [PATCH 32/43] Change Merge and Safe Merge warnings re ISBN
---
src/calibre/gui2/actions/edit_metadata.py | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py
index f0232d9859..878ba77a43 100644
--- a/src/calibre/gui2/actions/edit_metadata.py
+++ b/src/calibre/gui2/actions/edit_metadata.py
@@ -209,8 +209,9 @@ class EditMetadataAction(InterfaceAction):
dest_id, src_books, src_ids = self.books_to_merge(rows)
if safe_merge:
if not confirm('<p>'+_(
- 'All book formats and metadata from the selected books '
- 'will be added to the first selected book.<br><br>'
+ 'Book formats and metadata from the selected books '
+ 'will be added to the first selected book. '
+ 'ISBN will not be merged.<br><br>'
'The second and subsequently selected books will not '
'be deleted or changed.<br><br>'
'Please confirm you want to proceed.')
@@ -220,8 +221,9 @@ class EditMetadataAction(InterfaceAction):
self.merge_metadata(dest_id, src_ids)
else:
if not confirm('<p>'+_(
- 'All book formats and metadata from the selected books will be merged '
- 'into the first selected book.<br><br>'
+ 'Book formats and metadata from the selected books will be merged '
+ 'into the first selected book. '
+ 'ISBN will not be merged.<br><br>'
'After merger the second and '
'subsequently selected books will be deleted.<br><br>'
'All book formats of the first selected book will be kept '
From 78874a9117941de749f3b09934be8588181dd4b7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 12 Sep 2010 09:32:16 -0600
Subject: [PATCH 33/43] Use the new sorting code in the content server as well.
---
src/calibre/library/caches.py | 153 +-------------------------
src/calibre/library/server/content.py | 38 +++----
2 files changed, 18 insertions(+), 173 deletions(-)
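The key-based replacement for the old cmp sorts relies on Python's stable sort: applying single-key sorts from the least significant field to the most significant yields the combined order. A minimal sketch (illustrative, not the SortKeyGenerator implementation):

    def multisort(rows, fields):
        # fields: [(key_function, ascending), ...], most significant first
        for key, ascending in reversed(fields):
            rows.sort(key=key, reverse=not ascending)
        return rows

    # e.g. multisort(books, [(lambda b: b['series'], True),
    #                        (lambda b: b['series_index'], True)])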
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index dfd7086076..4f795ab733 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import re, itertools, functools
+import re, itertools
from itertools import repeat
from datetime import timedelta
from threading import Thread, RLock
@@ -584,39 +584,7 @@ class ResultCache(SearchQueryParser):
# Sorting functions {{{
- def seriescmp(self, sidx, siidx, x, y, library_order=None):
- try:
- if library_order:
- ans = cmp(title_sort(self._data[x][sidx].lower()),
- title_sort(self._data[y][sidx].lower()))
- else:
- ans = cmp(self._data[x][sidx].lower(),
- self._data[y][sidx].lower())
- except AttributeError: # Some entries may be None
- ans = cmp(self._data[x][sidx], self._data[y][sidx])
- if ans != 0: return ans
- return cmp(self._data[x][siidx], self._data[y][siidx])
-
- def cmp(self, loc, x, y, asstr=True, subsort=False):
- try:
- ans = cmp(self._data[x][loc].lower(), self._data[y][loc].lower()) if \
- asstr else cmp(self._data[x][loc], self._data[y][loc])
- except AttributeError: # Some entries may be None
- ans = cmp(self._data[x][loc], self._data[y][loc])
- except TypeError: ## raised when a datetime is None
- x = self._data[x][loc]
- if x is None:
- x = UNDEFINED_DATE
- y = self._data[y][loc]
- if y is None:
- y = UNDEFINED_DATE
- return cmp(x, y)
- if subsort and ans == 0:
- idx = self.FIELD_MAP['sort']
- return cmp(self._data[x][idx].lower(), self._data[y][idx].lower())
- return ans
-
- def sanitize_field_name(self, field):
+ def sanitize_sort_field_name(self, field):
field = field.lower().strip()
if field not in self.field_metadata.iterkeys():
if field in ('author', 'tag', 'comment'):
@@ -627,38 +595,10 @@ class ResultCache(SearchQueryParser):
return field
def sort(self, field, ascending, subsort=False):
- field = self.sanitize_field_name(field)
- as_string = field not in ('size', 'rating', 'timestamp')
-
- if self.first_sort:
- subsort = True
- self.first_sort = False
- if self.field_metadata[field]['is_custom']:
- if self.field_metadata[field]['datatype'] == 'series':
- fcmp = functools.partial(self.seriescmp,
- self.field_metadata[field]['rec_index'],
- self.field_metadata.cc_series_index_column_for(field),
- library_order=tweaks['title_series_sorting'] == 'library_order')
- else:
- as_string = self.field_metadata[field]['datatype'] in ('comments', 'text')
- field = self.field_metadata[field]['colnum']
- fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
- subsort=subsort, asstr=as_string)
- elif field == 'series':
- fcmp = functools.partial(self.seriescmp, self.FIELD_MAP['series'],
- self.FIELD_MAP['series_index'],
- library_order=tweaks['title_series_sorting'] == 'library_order')
- else:
- fcmp = functools.partial(self.cmp, self.field_metadata[field]['rec_index'],
- subsort=subsort, asstr=as_string)
- self._map.sort(cmp=fcmp, reverse=not ascending)
- tmap = list(itertools.repeat(False, len(self._data)))
- for x in self._map_filtered:
- tmap[x] = True
- self._map_filtered = [x for x in self._map if tmap[x]]
+ self.multisort([(field, ascending)])
def multisort(self, fields=[], subsort=False):
- fields = [(self.sanitize_field_name(x), bool(y)) for x, y in fields]
+ fields = [(self.sanitize_sort_field_name(x), bool(y)) for x, y in fields]
keys = self.field_metadata.field_keys()
fields = [x for x in fields if x[0] in keys]
if subsort and 'sort' not in [x[0] for x in fields]:
@@ -671,6 +611,7 @@ class ResultCache(SearchQueryParser):
self._map.sort(key=keyg, reverse=not fields[0][1])
else:
self._map.sort(key=keyg)
+
tmap = list(itertools.repeat(False, len(self._data)))
for x in self._map_filtered:
tmap[x] = True
@@ -733,87 +674,3 @@ class SortKeyGenerator(object):
# }}}
-if __name__ == '__main__':
- # Testing.timing for new multi-sort {{{
- import time
-
- from calibre.library import db
- db = db()
-
- db.refresh()
-
- fields = db.field_metadata.field_keys()
-
- print fields
-
-
- def do_single_sort(meth, field, order):
- if meth == 'old':
- db.data.sort(field, order)
- else:
- db.data.multisort([(field, order)])
-
- def test_single_sort(field):
- for meth in ('old', 'new'):
- ttime = 0
- NUM = 10
- asc = desc = None
- for i in range(NUM):
- db.data.sort('id', False)
- st = time.time()
- do_single_sort(meth, field, True)
- asc = db.data._map
- do_single_sort(meth, field, False)
- desc = db.data._map
- ttime += time.time() - st
- yield (ttime/NUM, asc, desc)
-
-
- print 'Running single sort differentials'
- for field in fields:
- if field in ('search', 'id', 'news', 'flags'): continue
- print '\t', field, db.field_metadata[field]['datatype']
- old, new = test_single_sort(field)
- if old[1] != new[1] or old[2] != new[2]:
- print '\t\t', 'Sort failure!'
- raise SystemExit(1)
- print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0])
-
- def do_multi_sort(meth, ms):
- if meth == 'new':
- db.data.multisort(ms)
- else:
- for s in reversed(ms):
- db.data.sort(*s)
-
- def test_multi_sort(ms):
- for meth in ('old', 'new'):
- ttime = 0
- NUM = 10
- for i in range(NUM):
- db.data.sort('id', False)
- st = time.time()
- do_multi_sort(meth, ms)
- ttime += time.time() - st
- yield (ttime/NUM, db.data._map)
-
- print 'Running multi-sort differentials'
-
- for ms in [
- [('timestamp', False), ('author', True), ('title', False)],
- [('size', True), ('tags', True), ('author', False)],
- [('series', False), ('title', True)],
- [('size', True), ('tags', True), ('author', False), ('pubdate',
- True), ('tags', False), ('formats', False), ('uuid', True)],
-
- ]:
- print '\t', ms
- db.data.sort('id', False)
- old, new = test_multi_sort(ms)
- if old[1] != new[1]:
- print '\t\t', 'Sort failure!'
- raise SystemExit()
- print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0])
-
- # }}}
-
diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py
index 6784abd8f4..ecb467b4c2 100644
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import re, os, cStringIO, operator
+import re, os, cStringIO
import cherrypy
try:
@@ -16,7 +16,15 @@ except ImportError:
from calibre import fit_image, guess_type
from calibre.utils.date import fromtimestamp
-from calibre.ebooks.metadata import title_sort
+from calibre.library.caches import SortKeyGenerator
+
+class CSSortKeyGenerator(SortKeyGenerator):
+
+ def __init__(self, fields, fm):
+ SortKeyGenerator.__init__(self, fields, fm, None)
+
+ def __call__(self, record):
+ return self.itervals(record).next()
class ContentServer(object):
@@ -47,32 +55,12 @@ class ContentServer(object):
def sort(self, items, field, order):
- field = field.lower().strip()
- if field == 'author':
- field = 'authors'
- if field == 'date':
- field = 'timestamp'
+ field = self.db.data.sanitize_sort_field_name(field)
if field not in ('title', 'authors', 'rating', 'timestamp', 'tags', 'size', 'series'):
raise cherrypy.HTTPError(400, '%s is not a valid sort field'%field)
- cmpf = cmp if field in ('rating', 'size', 'timestamp') else \
- lambda x, y: cmp(x.lower() if x else '', y.lower() if y else '')
- if field == 'series':
- items.sort(cmp=self.seriescmp, reverse=not order)
- else:
- lookup = 'sort' if field == 'title' else field
- lookup = 'author_sort' if field == 'authors' else field
- field = self.db.FIELD_MAP[lookup]
- getter = operator.itemgetter(field)
- items.sort(cmp=lambda x, y: cmpf(getter(x), getter(y)), reverse=not order)
+ keyg = CSSortKeyGenerator([(field, order)], self.db.field_metadata)
+ items.sort(key=keyg, reverse=not order)
- def seriescmp(self, x, y):
- si = self.db.FIELD_MAP['series']
- try:
- ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower()))
- except AttributeError: # Some entries may be None
- ans = cmp(x[si], y[si])
- if ans != 0: return ans
- return cmp(x[self.db.FIELD_MAP['series_index']], y[self.db.FIELD_MAP['series_index']])
# }}}
From 80c976e0f24f05a5ee7a9bfce50bf7745215e339 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 12 Sep 2010 11:11:00 -0600
Subject: [PATCH 34/43] Fix #6794 (Updated recipes for Infobae and NSPM)
---
resources/recipes/infobae.recipe | 82 ++++++++------------------------
resources/recipes/nspm.recipe | 11 ++++-
2 files changed, 30 insertions(+), 63 deletions(-)
diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe
index cda9bf83d2..b7f9cd3c6c 100644
--- a/resources/recipes/infobae.recipe
+++ b/resources/recipes/infobae.recipe
@@ -1,12 +1,8 @@
-#!/usr/bin/env python
-
__license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
infobae.com
'''
-import re
-import urllib, urlparse
from calibre.web.feeds.news import BasicNewsRecipe
@@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
- language = 'es'
- lang = 'es-AR'
-
+ language = 'es'
encoding = 'cp1252'
- cover_url = 'http://www.infobae.com/imgs/header/header.gif'
+ masthead_url = 'http://www.infobae.com/imgs/header/header.gif'
remove_javascript = True
- preprocess_regexps = [(re.compile(
- r''), lambda m:'')]
-
-
- html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-
- extra_css = '''
- .col-center{font-family:Arial,Helvetica,sans-serif;}
- h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
- .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
- '''
-
- keep_only_tags = [dict(name='div', attrs={'class':['content']})]
-
-
- remove_tags = [
- dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
- dict(name='a', attrs={'name' : 'comentario',}),
- dict(name='iframe'),
- dict(name='img', alt = "Ver galerias de imagenes"),
-
- ]
-
+ remove_empty_feeds = True
+ extra_css = '''
+ body{font-family:Arial,Helvetica,sans-serif;}
+ .popUpTitulo{color:#0D4261; font-size: xx-large}
+ '''
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ , 'linearize_tables' : True
+ }
+
feeds = [
(u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
@@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe):
,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' )
]
-# def print_version(self, url):
-# main, sep, article_part = url.partition('contenidos/')
-# article_id, rsep, rrest = article_part.partition('-')
-# return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
-
- def get_article_url(self, article):
- ans = article.get('link').encode('utf-8')
- parts = list(urlparse.urlparse(ans))
- parts[2] = urllib.quote(parts[2])
- ans = urlparse.urlunparse(parts)
- return ans.decode('utf-8')
-
-
- def preprocess_html(self, soup):
-
- for tag in soup.head.findAll('strong'):
- tag.extract()
- for tag in soup.findAll('meta'):
- del tag['content']
- tag.extract()
-
- mtag = '\n\n'
- soup.head.insert(0,mtag)
- for item in soup.findAll(style=True):
- del item['style']
-
- return soup
+ def print_version(self, url):
+ article_part = url.rpartition('/')[2]
+ article_id= article_part.partition('-')[0]
+ return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
def postprocess_html(self, soup, first):
-
for tag in soup.findAll(name='strong'):
tag.name = 'b'
-
return soup
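The new print_version derives the article id purely from the URL tail; with an assumed URL shape (illustrative id and slug) it behaves like this:

    url = 'http://www.infobae.com/notas/539123-titulo-de-la-nota'
    article_part = url.rpartition('/')[2]        # '539123-titulo-de-la-nota'
    article_id = article_part.partition('-')[0]  # '539123'
    # -> http://www.infobae.com/notas/nota_imprimir.php?Idx=539123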
diff --git a/resources/recipes/nspm.recipe b/resources/recipes/nspm.recipe
index 13ff42b277..29f2cfc5e3 100644
--- a/resources/recipes/nspm.recipe
+++ b/resources/recipes/nspm.recipe
@@ -6,6 +6,7 @@ nspm.rs
import re
from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag, NavigableString
class Nspm(BasicNewsRecipe):
title = 'Nova srpska politicka misao'
@@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe):
encoding = 'utf-8'
language = 'sr'
delay = 2
+ remove_empty_feeds = True
publication_type = 'magazine'
masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe):
dict(name=['link','object','embed','script','meta','base','iframe'])
,dict(attrs={'class':'buttonheading'})
]
- remove_tags_after = dict(attrs={'class':'article_separator'})
- remove_attributes = ['width','height']
+ remove_tags_before = dict(attrs={'class':'contentheading'})
+ remove_tags_after = dict(attrs={'class':'article_separator'})
+ remove_attributes = ['width','height']
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe):
def preprocess_html(self, soup):
for item in soup.body.findAll(style=True):
del item['style']
+ for item in soup.body.findAll('h1'):
+ nh = NavigableString(item.a.string)
+ item.a.extract()
+ item.insert(0,nh)
return self.adeify_images(soup)
From 548417ea6b6157faf1688b3b082f3eac5476636f Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 13 Sep 2010 09:18:45 +1000
Subject: [PATCH 35/43] comments and minor tweak
---
src/calibre/ebooks/conversion/utils.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index abfa43e7ed..ecf030b27d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -111,7 +111,7 @@ class PreProcessor(object):
html = add_markup.sub('</p>\n<p>', html)
# detect chapters/sections to match xpath or splitting logic
- heading = re.compile(']*>', re.IGNORECASE)
+ heading = re.compile(']*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
#
@@ -134,7 +134,7 @@ class PreProcessor(object):
self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
- # that lines can be wrapped across page boundaries
+ # that lines can be un-wrapped across page boundaries
paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
paras = len(paras_reg.findall(html))
From de6aadee76d4dafe9b84133dc3af43ddef22fd0a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 10:15:35 -0600
Subject: [PATCH 36/43] News download: Fix bug that could break some downloads
in non ASCII locales
---
resources/recipes/xkcd.recipe | 6 +++---
src/calibre/web/feeds/__init__.py | 4 +++-
2 files changed, 6 insertions(+), 4 deletions(-)
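The underlying failure mode: on Python 2, time.strftime() returns a byte string in the locale's encoding, so localized day/month names can break unicode interpolation. An illustrative reproduction (assumes the locale is installed):

    import locale, time
    locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')
    raw = time.strftime('%a, %d %b, %Y %H:%M')  # bytes, may contain 'ao\xc3\xbbt'
    # u'%s' % raw can raise UnicodeDecodeError; decode explicitly instead:
    safe = raw.decode(locale.getpreferredencoding(), 'replace')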
diff --git a/resources/recipes/xkcd.recipe b/resources/recipes/xkcd.recipe
index 312027004e..ad0d420deb 100644
--- a/resources/recipes/xkcd.recipe
+++ b/resources/recipes/xkcd.recipe
@@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe):
(re.compile(r'(<img.*title=")([^"]+)(".*>)'),
lambda m: '%s%s%s<br/>' % (m.group(1), m.group(3), m.group(2)))
]
-
+
def parse_index(self):
INDEX = 'http://xkcd.com/archive/'
- soup = self.index_to_soup(INDEX)
+ soup = self.index_to_soup(INDEX)
articles = []
for item in soup.findAll('a', title=True):
articles.append({
'date': item['title'],
'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1,
'url': 'http://xkcd.com' + item['href'],
- 'title': self.tag_to_string(item).encode('UTF-8'),
+ 'title': self.tag_to_string(item),
'description': '',
'content': '',
})
diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index a70cf8b664..8aef350498 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -165,7 +165,9 @@ class Feed(object):
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
self.articles.append(article)
else:
- self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+ t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple())
+ self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
+ (title, t, self.title))
d = item.get('date', '')
article.formatted_date = d
From b73e1b3da50810e151d10a5d62251754a077e605 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Tue, 14 Sep 2010 02:56:56 +1000
Subject: [PATCH 37/43] tweaked preprocess for $, added rtf to new preprocess
logic, changed last pdf default
---
src/calibre/ebooks/conversion/preprocess.py | 2 +-
src/calibre/ebooks/rtf/input.py | 13 +++----------
src/calibre/gui2/convert/pdf_input.ui | 2 +-
3 files changed, 5 insertions(+), 12 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index f6277956c8..9464be1210 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -340,7 +340,7 @@ class HTMLPreProcessor(object):
# print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 216ccf591d..d229b80c16 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -8,6 +8,7 @@ from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class InlineClass(etree.XSLTExtension):
@@ -229,16 +230,8 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html:
- self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(<(/i|b)>)?)?)\s*</span>\s*</p>', re.IGNORECASE)
- res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- length = line_length('html', res, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*
\s*(?P]*>\s*(]*>\s*\s*)
\s*){0,3}\s*]*>\s*]*>\s*" % length, re.UNICODE)
- if length < 150:
- res = unwrap.sub(' ', res)
+ preprocessor = PreProcessor(res)
+ res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class)
stream.seek(0)
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 626c68ea63..b2ee421922 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -46,7 +46,7 @@
<double>0.010000000000000</double>
- <double>0.500000000000000</double>
+ <double>0.450000000000000</double>
From 8b73bb52e8d551538d0c0e55e7b91b6b16f69977 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 16:42:22 -0600
Subject: [PATCH 38/43] Fix #6802 (Sovos E Reader Not Recognised / Floppy Drive
Activation)
---
src/calibre/customize/builtins.py | 3 ++-
src/calibre/devices/teclast/driver.py | 11 +++++++++++
2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 4c87236e71..68df832048 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -459,7 +459,7 @@ from calibre.devices.iriver.driver import IRIVER_STORY
from calibre.devices.binatone.driver import README
from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK
from calibre.devices.edge.driver import EDGE
-from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS
+from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, SOVOS
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, GEMEI
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
@@ -557,6 +557,7 @@ plugins += [
TECLAST_K3,
NEWSMY,
IPAPYRUS,
+ SOVOS,
EDGE,
SNE,
ALEX,
diff --git a/src/calibre/devices/teclast/driver.py b/src/calibre/devices/teclast/driver.py
index 0c60a367cf..2055ff9306 100644
--- a/src/calibre/devices/teclast/driver.py
+++ b/src/calibre/devices/teclast/driver.py
@@ -52,3 +52,14 @@ class IPAPYRUS(TECLAST_K3):
VENDOR_NAME = 'E_READER'
WINDOWS_MAIN_MEM = ''
+class SOVOS(TECLAST_K3):
+
+ name = 'Sovos device interface'
+ gui_name = 'Sovos'
+ description = _('Communicate with the Sovos reader.')
+
+ FORMATS = ['epub', 'fb2', 'pdf', 'txt']
+
+ VENDOR_NAME = 'RK28XX'
+ WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC'
+
From fb053fe3f37d531a170bb2a1d67ccf70ea030351 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 16:58:09 -0600
Subject: [PATCH 39/43] Fix #6773 (Slightly broken CHM file)
---
src/calibre/ebooks/chm/reader.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index 67a2d36607..831c16bf6a 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -132,7 +132,11 @@ class CHMReader(CHMFile):
for path in self.Contents():
lpath = os.path.join(output_dir, path)
self._ensure_dir(lpath)
- data = self.GetFile(path)
+ try:
+ data = self.GetFile(path)
+ except:
+ self.log.exception('Failed to extract %s from CHM, ignoring'%path)
+ continue
if lpath.find(';') != -1:
# fix file names with ";" at the end, see _reformat()
lpath = lpath.split(';')[0]
From ba5de1c92d797abc1f82782c7e15bd61dfa387c5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 18:18:32 -0600
Subject: [PATCH 40/43] Conversion pipeline: When setting margins on <body>
explicitly set padding to 0 to override any existing padding in the input
document
---
src/calibre/ebooks/oeb/transforms/flatcss.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py
index f48bdb9934..ffdc641d1e 100644
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@@ -138,6 +138,7 @@ class CSSFlattener(object):
float(self.context.margin_left))
bs.append('margin-right : %fpt'%\
float(self.context.margin_right))
+ bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
if self.context.change_justification != 'original':
bs.append('text-align: '+ self.context.change_justification)
body.set('style', '; '.join(bs))
From c5063b8633506f3b661d3e3dcc84d7ec68e74345 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 18:26:51 -0600
Subject: [PATCH 41/43] Fix #6804 (Timeout error when browsing content server
via browser)
---
resources/content_server/gui.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/resources/content_server/gui.js b/resources/content_server/gui.js
index 631fb8b617..d0fb49cc8e 100644
--- a/resources/content_server/gui.js
+++ b/resources/content_server/gui.js
@@ -26,7 +26,7 @@ var current_library_request = null;
////////////////////////////// GET BOOK LIST //////////////////////////////
-var LIBRARY_FETCH_TIMEOUT = 30000; // milliseconds
+var LIBRARY_FETCH_TIMEOUT = 5*60000; // milliseconds
function create_table_headers() {
var thead = $('table#book_list thead tr');
From c5415bbe8012179b405f2c3ca3b5258e83a863b3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 19:11:38 -0600
Subject: [PATCH 42/43] Fix #6806 (--start-in-tray switch displays hidden
windows in metacity, xfwm4 and compiz)
---
src/calibre/gui2/cover_flow.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/calibre/gui2/cover_flow.py b/src/calibre/gui2/cover_flow.py
index 88bbae6c41..cb951b09be 100644
--- a/src/calibre/gui2/cover_flow.py
+++ b/src/calibre/gui2/cover_flow.py
@@ -155,6 +155,7 @@ class CoverFlowMixin(object):
self.cb_splitter.action_toggle.triggered.connect(self.toggle_cover_browser)
if CoverFlow is not None:
self.cover_flow.stop.connect(self.hide_cover_browser)
+ self.cover_flow.setVisible(False)
else:
self.cb_splitter.insertWidget(self.cb_splitter.side_index, self.cover_flow)
if CoverFlow is not None:
From 6a3609f031bb9400630cd6418b278903a4883c8a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 19:58:22 -0600
Subject: [PATCH 43/43] Implement #6808 (Feature request: ability to convert
all single/double quotes to "smart quotes")
---
src/calibre/ebooks/conversion/cli.py | 2 +-
src/calibre/ebooks/conversion/plumber.py | 8 +
src/calibre/ebooks/conversion/preprocess.py | 23 +-
src/calibre/gui2/convert/look_and_feel.py | 2 +-
src/calibre/gui2/convert/look_and_feel.ui | 9 +-
src/calibre/utils/smartypants.py | 899 ++++++++++++++++++++
6 files changed, 933 insertions(+), 10 deletions(-)
create mode 100755 src/calibre/utils/smartypants.py
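The new option simply routes the converted HTML through the bundled SmartyPants port (see smarten_punctuation() below). Its effect, roughly (exact entities depend on the substitute_entites pass that follows):

    from calibre.utils.smartypants import smartyPants

    text = '"Why," she asked -- "why now... of all times?"'
    print smartyPants(text)
    # straight quotes become curly-quote entities, "--" an em-dash
    # entity, and "..." an ellipsis entity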
diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 7439718cf6..2ef633d0bb 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -122,7 +122,7 @@ def add_pipeline_options(parser, plumber):
'font_size_mapping',
'line_height',
'linearize_tables',
- 'extra_css',
+ 'extra_css', 'smarten_punctuation',
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'change_justification',
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 24b35f804f..16282dd28d 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -362,6 +362,14 @@ OptionRecommendation(name='preprocess_html',
)
),
+OptionRecommendation(name='smarten_punctuation',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Convert plain quotes, dashes and ellipsis to their '
+ 'typographically correct equivalents. For details, see '
+ 'http://daringfireball.net/projects/smartypants'
+ )
+ ),
+
OptionRecommendation(name='remove_header',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Use a regular expression to try and remove the header.'
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 7742a20a21..4538af96c4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -221,7 +221,7 @@ class HTMLPreProcessor(object):
(re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'),
(re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'),
(re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'),
-
+
# ˙
(re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
@@ -244,14 +244,14 @@ class HTMLPreProcessor(object):
(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
# Cover the case where every letter in a chapter title is separated by a space
(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
-
+
# Have paragraphs show better
(re.compile(r'<br.*?>'), lambda match : '<p>'),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
- (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
+ (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
]
# Fix Book Designer markup
@@ -328,7 +328,7 @@ class HTMLPreProcessor(object):
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
-
+
# unwrap hyphenation - moved here so it's executed after header/footer removal
if is_pdftohtml:
# unwrap visible dashes and hyphens - don't delete they are often hyphens for
@@ -338,13 +338,13 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
-
- # Make the more aggressive chapter marking regex optional with the preprocess option to
+
+ # Make the more aggressive chapter marking regex optional with the preprocess option to
# reduce false positives and move after header/footer removal
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*</p>\s*(?P<title><p>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*</p>)?'), chap_head),)
-
+
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
@@ -401,5 +401,14 @@ class HTMLPreProcessor(object):
if self.plugin_preprocess:
html = self.input_plugin_preprocess(html)
+ if getattr(self.extra_opts, 'smarten_punctuation', False):
+ html = self.smarten_punctuation(html)
+
return html
+ def smarten_punctuation(self, html):
+ from calibre.utils.smartypants import smartyPants
+ from calibre.ebooks.chardet import substitute_entites
+ html = smartyPants(html)
+ return substitute_entites(html)
+
diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py
index b0403bf1dd..ec3f0b944d 100644
--- a/src/calibre/gui2/convert/look_and_feel.py
+++ b/src/calibre/gui2/convert/look_and_feel.py
@@ -22,7 +22,7 @@ class LookAndFeelWidget(Widget, Ui_Form):
Widget.__init__(self, parent,
['change_justification', 'extra_css', 'base_font_size',
'font_size_mapping', 'line_height',
- 'linearize_tables',
+ 'linearize_tables', 'smarten_punctuation',
'disable_font_rescaling', 'insert_blank_line',
'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding',
'asciiize', 'keep_ligatures']
diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui
index de48e7caf9..c683300854 100644
--- a/src/calibre/gui2/convert/look_and_feel.ui
+++ b/src/calibre/gui2/convert/look_and_feel.ui
@@ -178,7 +178,7 @@
- -
+
-
Extra &CSS
@@ -214,6 +214,13 @@
+ -
+
+
+ Smarten &punctuation
+
+
+
diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py
new file mode 100755
index 0000000000..44aac4de8c
--- /dev/null
+++ b/src/calibre/utils/smartypants.py
@@ -0,0 +1,899 @@
+#!/usr/bin/python
+
+r"""
+==============
+smartypants.py
+==============
+
+----------------------------
+SmartyPants ported to Python
+----------------------------
+
+Ported by `Chad Miller`_
+Copyright (c) 2004, 2007 Chad Miller
+
+original `SmartyPants`_ by `John Gruber`_
+Copyright (c) 2003 John Gruber
+
+
+Synopsis
+========
+
+A smart-quotes plugin for Pyblosxom_.
+
+The priginal "SmartyPants" is a free web publishing plug-in for Movable Type,
+Blosxom, and BBEdit that easily translates plain ASCII punctuation characters
+into "smart" typographic punctuation HTML entities.
+
+This software, *smartypants.py*, endeavours to be a functional port of
+SmartyPants to Python, for use with Pyblosxom_.
+
+
+Description
+===========
+
+SmartyPants can perform the following transformations:
+
+- Straight quotes ( " and ' ) into "curly" quote HTML entities
+- Backticks-style quotes (\`\`like this'') into "curly" quote HTML entities
+- Dashes (``--`` and ``---``) into en- and em-dash entities
+- Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity
+
+This means you can write, edit, and save your posts using plain old
+ASCII straight quotes, plain dashes, and plain dots, but your published
+posts (and final HTML output) will appear with smart quotes, em-dashes,
+and proper ellipses.
+
+SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``,
+``<math>`` or ``<script>`` tag blocks.