diff --git a/setup/test.py b/setup/test.py
index e3a8ab4142..1c0ac4dfca 100644
--- a/setup/test.py
+++ b/setup/test.py
@@ -125,6 +125,9 @@ def find_tests(which_tests=None):
a(find_tests())
from calibre.library.comments import find_tests
a(find_tests())
+ from calibre.ebooks.compression.palmdoc import find_tests
+ a(find_tests())
+
a(unittest.defaultTestLoader.loadTestsFromTestCase(TestImports))
if ok('dbcli'):
from calibre.db.cli.tests import find_tests
diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index a8be20c360..3759310fd7 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -106,7 +106,7 @@ class CHMReader(CHMFile):
return data
def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
- html_files = set([])
+ html_files = set()
try:
x = self.get_encoding()
codecs.lookup(x)
diff --git a/src/calibre/ebooks/compression/palmdoc.py b/src/calibre/ebooks/compression/palmdoc.py
index 931a2feda4..5bcd20a653 100644
--- a/src/calibre/ebooks/compression/palmdoc.py
+++ b/src/calibre/ebooks/compression/palmdoc.py
@@ -21,32 +21,7 @@ def decompress_doc(data):
def compress_doc(data):
- if not data:
- return ''
- return cPalmdoc.compress(data)
-
-
-def test():
- TESTS = [
- 'abc\x03\x04\x05\x06ms', # Test binary writing
- 'a b c \xfed ', # Test encoding of spaces
- '0123456789axyz2bxyz2cdfgfo9iuyerh',
- '0123456789asd0123456789asd|yyzzxxffhhjjkk',
- ('ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
- 'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
- ]
- for test in TESTS:
- print('Test:', repr(test))
- print('\tTesting compression...')
- good = py_compress_doc(test)
- x = compress_doc(test)
- print('\t\tgood:', repr(good))
- print('\t\tx :', repr(x))
- assert x == good
- print('\tTesting decompression...')
- print('\t\t', repr(decompress_doc(x)))
- assert decompress_doc(x) == test
- print()
+ return cPalmdoc.compress(data) if data else b''
def py_compress_doc(data):
@@ -55,7 +30,7 @@ def py_compress_doc(data):
ldata = len(data)
while i < ldata:
if i > 10 and (ldata - i) > 10:
- chunk = ''
+ chunk = b''
match = -1
for j in range(10, 2, -1):
chunk = data[i:i+j]
@@ -76,14 +51,14 @@ def py_compress_doc(data):
ch = data[i]
och = ord(ch)
i += 1
- if ch == ' ' and (i + 1) < ldata:
+ if ch == b' ' and (i + 1) < ldata:
onch = ord(data[i])
if onch >= 0x40 and onch < 0x80:
out.write(pack('>B', onch ^ 0x80))
i += 1
continue
if och == 0 or (och > 8 and och < 0x80):
- out.write(ch.encode('utf-8'))
+ out.write(ch)
else:
j = i
binseq = [ch]
@@ -95,6 +70,27 @@ def py_compress_doc(data):
binseq.append(ch)
j += 1
out.write(pack('>B', len(binseq)))
- out.write(''.join(binseq).encode('utf-8'))
+ out.write(b''.join(binseq))
i += len(binseq) - 1
return out.getvalue()
+
+
+def find_tests():
+ import unittest
+
+ class Test(unittest.TestCase):
+
+ def test_palmdoc_compression(self):
+ for test in [
+ b'abc\x03\x04\x05\x06ms', # Test binary writing
+ b'a b c \xfed ', # Test encoding of spaces
+ b'0123456789axyz2bxyz2cdfgfo9iuyerh',
+ b'0123456789asd0123456789asd|yyzzxxffhhjjkk',
+ (b'ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
+ b'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
+ ]:
+ x = compress_doc(test)
+ self.assertEqual(py_compress_doc(test), x)
+ self.assertEqual(decompress_doc(x), test)
+
+ return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 85a42f13aa..6d509094ab 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -33,12 +33,14 @@ class HeuristicProcessor(object):
self.anyblank = re.compile(r'\s*(?P ]*>)\s*(?P ]*>\s*
'
- self.common_in_text_endings = u'[\"\'—’”,\\.!\\?\\…\\)„\\w]'
- self.common_in_text_beginnings = u'[\\w\'\"“‘‛]'
+ self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
+ self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
def is_pdftohtml(self, src):
return '' in src[:1000]
@@ -215,7 +217,8 @@ class HeuristicProcessor(object):
# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
chapter_line_open = self.line_open
- title_line_open = "<(?P ]*>.*? ]*>.*? ]*>\s* ]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W]((span|[ibu]|em|strong|font)>\s*)* ]*>\s* ]*>\s* ]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
+ r'.{1,100}?[^\W]((span|[ibu]|em|strong|font)>\s*)* ]*>\s*