From b842fe758a56e1b855ff4d3276e63ddaf2d53897 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 27 May 2019 14:25:52 +0530
Subject: [PATCH] Fixes for f2e0181926

---
 setup/test.py                             |  3 ++
 src/calibre/ebooks/chm/reader.py          |  2 +-
 src/calibre/ebooks/compression/palmdoc.py | 56 +++++++++++------------
 src/calibre/ebooks/conversion/utils.py    | 29 ++++++++----
 4 files changed, 50 insertions(+), 40 deletions(-)

diff --git a/setup/test.py b/setup/test.py
index e3a8ab4142..1c0ac4dfca 100644
--- a/setup/test.py
+++ b/setup/test.py
@@ -125,6 +125,9 @@ def find_tests(which_tests=None):
         a(find_tests())
         from calibre.library.comments import find_tests
         a(find_tests())
+        from calibre.ebooks.compression.palmdoc import find_tests
+        a(find_tests())
+
         a(unittest.defaultTestLoader.loadTestsFromTestCase(TestImports))
     if ok('dbcli'):
         from calibre.db.cli.tests import find_tests
diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index a8be20c360..3759310fd7 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -106,7 +106,7 @@ class CHMReader(CHMFile):
         return data
 
     def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
-        html_files = set([])
+        html_files = set()
         try:
             x = self.get_encoding()
             codecs.lookup(x)
diff --git a/src/calibre/ebooks/compression/palmdoc.py b/src/calibre/ebooks/compression/palmdoc.py
index 931a2feda4..5bcd20a653 100644
--- a/src/calibre/ebooks/compression/palmdoc.py
+++ b/src/calibre/ebooks/compression/palmdoc.py
@@ -21,32 +21,7 @@ def decompress_doc(data):
 
 
 def compress_doc(data):
-    if not data:
-        return ''
-    return cPalmdoc.compress(data)
-
-
-def test():
-    TESTS = [
-        'abc\x03\x04\x05\x06ms',  # Test binary writing
-        'a b c \xfed ',  # Test encoding of spaces
-        '0123456789axyz2bxyz2cdfgfo9iuyerh',
-        '0123456789asd0123456789asd|yyzzxxffhhjjkk',
-        ('ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
-         'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
-    ]
-    for test in TESTS:
-        print('Test:', repr(test))
-        print('\tTesting compression...')
-        good = py_compress_doc(test)
-        x = compress_doc(test)
-        print('\t\tgood:', repr(good))
-        print('\t\tx   :', repr(x))
-        assert x == good
-        print('\tTesting decompression...')
-        print('\t\t', repr(decompress_doc(x)))
-        assert decompress_doc(x) == test
-        print()
+    return cPalmdoc.compress(data) if data else b''
 
 
 def py_compress_doc(data):
@@ -55,7 +30,7 @@ def py_compress_doc(data):
     ldata = len(data)
     while i < ldata:
         if i > 10 and (ldata - i) > 10:
-            chunk = ''
+            chunk = b''
             match = -1
             for j in range(10, 2, -1):
                 chunk = data[i:i+j]
@@ -76,14 +51,14 @@ def py_compress_doc(data):
         ch = data[i]
         och = ord(ch)
         i += 1
-        if ch == ' ' and (i + 1) < ldata:
+        if ch == b' ' and (i + 1) < ldata:
             onch = ord(data[i])
             if onch >= 0x40 and onch < 0x80:
                 out.write(pack('>B', onch ^ 0x80))
                 i += 1
                 continue
         if och == 0 or (och > 8 and och < 0x80):
-            out.write(ch.encode('utf-8'))
+            out.write(ch)
         else:
             j = i
             binseq = [ch]
@@ -95,6 +70,27 @@ def py_compress_doc(data):
                 binseq.append(ch)
                 j += 1
             out.write(pack('>B', len(binseq)))
-            out.write(''.join(binseq).encode('utf-8'))
+            out.write(b''.join(binseq))
             i += len(binseq) - 1
     return out.getvalue()
+
+
+def find_tests():
+    import unittest
+
+    class Test(unittest.TestCase):
+
+        def test_palmdoc_compression(self):
+            for test in [
+                b'abc\x03\x04\x05\x06ms',  # Test binary writing
+                b'a b c \xfed ',  # Test encoding of spaces
+                b'0123456789axyz2bxyz2cdfgfo9iuyerh',
+                b'0123456789asd0123456789asd|yyzzxxffhhjjkk',
+                (b'ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
+                 b'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
+            ]:
+                x = compress_doc(test)
+                self.assertEqual(py_compress_doc(test), x)
+                self.assertEqual(decompress_doc(x), test)
+
+    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 85a42f13aa..6d509094ab 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -33,12 +33,14 @@ class HeuristicProcessor(object):
         self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
         self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
-        self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"  # noqa
+        self.line_open = (
+            r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
+            r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
         self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
         self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
         self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent: 0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
-        self.common_in_text_endings = u'[\"\'—’”,\\.!\\?\\…\\)„\\w]'
-        self.common_in_text_beginnings = u'[\\w\'\"“‘‛]'
+        self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
+        self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -215,7 +217,8 @@ class HeuristicProcessor(object):
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
         chapter_line_open = self.line_open
-        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"  # noqa
+        title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
+                           r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
         chapter_header_open = r"(?P<chap>"
         title_header_open = r"(?P<title>"
         chapter_header_close = ")\\s*"
@@ -243,7 +246,9 @@ class HeuristicProcessor(object):
         analysis_result = []
 
         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],  # noqa
+            [(
+                r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
+                r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
             # Highest frequency headings which include titles
             [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
@@ -509,11 +514,14 @@ class HeuristicProcessor(object):
 
     def detect_whitespace(self, html):
         blanks_around_headings = re.compile(
-            r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)  # noqa
+            r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
+            r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
         blanks_around_scene_breaks = re.compile(
-            r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)  # noqa
+            r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
+            r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
         blanks_n_nopunct = re.compile(
-            r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)  # noqa
+            r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
+            r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
 
         def merge_header_whitespace(match):
             initblanks = match.group('initparas')
@@ -826,7 +834,10 @@ class HeuristicProcessor(object):
             self.log.debug("Looking for more split points based on punctuation,"
                            " currently have " + unicode_type(self.html_preprocess_sections))
             chapdetect3 = re.compile(
-                r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)  # noqa
+                r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
+                r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
+                r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
+                r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
         if getattr(self.extra_opts, 'renumber_headings', False):
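
Note on exercising the new tests: the patch replaces the old ad-hoc test() in palmdoc.py with a unittest-based find_tests() and registers it in setup/test.py, so it should run as part of calibre's normal test command (python setup.py test). For quick iteration, a minimal sketch of running just this suite with the stdlib runner is shown below; the script name is hypothetical and not part of the patch, and it assumes a working calibre development environment in which the calibre package and the cPalmdoc extension are importable.

    # run_palmdoc_tests.py -- hypothetical helper, not part of the patch above.
    # Runs only the PalmDoc round-trip suite added in
    # src/calibre/ebooks/compression/palmdoc.py using the stdlib test runner.
    import unittest

    from calibre.ebooks.compression.palmdoc import find_tests

    if __name__ == '__main__':
        # find_tests() returns a unittest.TestSuite; each sample is compressed
        # with the cPalmdoc extension, compared against the pure-Python
        # py_compress_doc(), and decompressed back to the original bytes.
        unittest.TextTestRunner(verbosity=2).run(find_tests())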