Fixes for f2e0181926

This commit is contained in:
Kovid Goyal 2019-05-27 14:25:52 +05:30
parent fe5aac9d97
commit b842fe758a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 50 additions and 40 deletions

View File

@@ -125,6 +125,9 @@ def find_tests(which_tests=None):
a(find_tests())
from calibre.library.comments import find_tests
a(find_tests())
from calibre.ebooks.compression.palmdoc import find_tests
a(find_tests())
a(unittest.defaultTestLoader.loadTestsFromTestCase(TestImports))
if ok('dbcli'):
from calibre.db.cli.tests import find_tests

View File

@@ -106,7 +106,7 @@ class CHMReader(CHMFile):
return data
def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
html_files = set([])
html_files = set()
try:
x = self.get_encoding()
codecs.lookup(x)

View File

@@ -21,32 +21,7 @@ def decompress_doc(data):
def compress_doc(data):
if not data:
return ''
return cPalmdoc.compress(data)
def test():
TESTS = [
'abc\x03\x04\x05\x06ms', # Test binary writing
'a b c \xfed ', # Test encoding of spaces
'0123456789axyz2bxyz2cdfgfo9iuyerh',
'0123456789asd0123456789asd|yyzzxxffhhjjkk',
('ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
]
for test in TESTS:
print('Test:', repr(test))
print('\tTesting compression...')
good = py_compress_doc(test)
x = compress_doc(test)
print('\t\tgood:', repr(good))
print('\t\tx :', repr(x))
assert x == good
print('\tTesting decompression...')
print('\t\t', repr(decompress_doc(x)))
assert decompress_doc(x) == test
print()
return cPalmdoc.compress(data) if data else b''
def py_compress_doc(data):
@@ -55,7 +30,7 @@ def py_compress_doc(data):
ldata = len(data)
while i < ldata:
if i > 10 and (ldata - i) > 10:
chunk = ''
chunk = b''
match = -1
for j in range(10, 2, -1):
chunk = data[i:i+j]
@@ -76,14 +51,14 @@ def py_compress_doc(data):
ch = data[i]
och = ord(ch)
i += 1
if ch == ' ' and (i + 1) < ldata:
if ch == b' ' and (i + 1) < ldata:
onch = ord(data[i])
if onch >= 0x40 and onch < 0x80:
out.write(pack('>B', onch ^ 0x80))
i += 1
continue
if och == 0 or (och > 8 and och < 0x80):
out.write(ch.encode('utf-8'))
out.write(ch)
else:
j = i
binseq = [ch]
@@ -95,6 +70,27 @@ def py_compress_doc(data):
binseq.append(ch)
j += 1
out.write(pack('>B', len(binseq)))
out.write(''.join(binseq).encode('utf-8'))
out.write(b''.join(binseq))
i += len(binseq) - 1
return out.getvalue()
def find_tests():
    """Return a unittest suite that cross-checks palmdoc compression.

    The suite verifies that the C implementation (compress_doc) agrees with
    the pure-python reference (py_compress_doc) and that compressed output
    round-trips through decompress_doc.
    """
    import unittest

    class Test(unittest.TestCase):

        def test_palmdoc_compression(self):
            cases = (
                b'abc\x03\x04\x05\x06ms',  # exercises binary (escaped) byte writing
                b'a b c \xfed ',           # exercises the space-encoding fast path
                b'0123456789axyz2bxyz2cdfgfo9iuyerh',
                b'0123456789asd0123456789asd|yyzzxxffhhjjkk',
                (b'ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
                 b'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
            )
            for raw in cases:
                compressed = compress_doc(raw)
                # C and python implementations must produce identical output
                self.assertEqual(py_compress_doc(raw), compressed)
                # and decompression must restore the original bytes exactly
                self.assertEqual(decompress_doc(compressed), raw)

    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)

View File

@@ -33,12 +33,14 @@ class HeuristicProcessor(object):
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" # noqa
self.line_open = (
r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
self.common_in_text_endings = u'[\"\'—’”,\\.!\\?\\\\)„\\w]'
self.common_in_text_beginnings = u'[\\w\'\"“‘‛]'
self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\\\)„\\w]'
self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -215,7 +217,8 @@ class HeuristicProcessor(object):
# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
chapter_line_open = self.line_open
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" # noqa
title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
chapter_header_open = r"(?P<chap>"
title_header_open = r"(?P<title>"
chapter_header_close = ")\\s*"
@@ -243,7 +246,9 @@ class HeuristicProcessor(object):
analysis_result = []
chapter_types = [
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], # noqa
[(
r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
# Highest frequency headings which include titles
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
@@ -509,11 +514,14 @@ class HeuristicProcessor(object):
def detect_whitespace(self, html):
blanks_around_headings = re.compile(
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_around_scene_breaks = re.compile(
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_n_nopunct = re.compile(
r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
def merge_header_whitespace(match):
initblanks = match.group('initparas')
@@ -826,7 +834,10 @@ class HeuristicProcessor(object):
self.log.debug("Looking for more split points based on punctuation,"
" currently have " + unicode_type(self.html_preprocess_sections))
chapdetect3 = re.compile(
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) # noqa
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
if getattr(self.extra_opts, 'renumber_headings', False):