mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fixes for f2e0181926
This commit is contained in:
parent
fe5aac9d97
commit
b842fe758a
@ -125,6 +125,9 @@ def find_tests(which_tests=None):
|
||||
a(find_tests())
|
||||
from calibre.library.comments import find_tests
|
||||
a(find_tests())
|
||||
from calibre.ebooks.compression.palmdoc import find_tests
|
||||
a(find_tests())
|
||||
|
||||
a(unittest.defaultTestLoader.loadTestsFromTestCase(TestImports))
|
||||
if ok('dbcli'):
|
||||
from calibre.db.cli.tests import find_tests
|
||||
|
@ -106,7 +106,7 @@ class CHMReader(CHMFile):
|
||||
return data
|
||||
|
||||
def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
|
||||
html_files = set([])
|
||||
html_files = set()
|
||||
try:
|
||||
x = self.get_encoding()
|
||||
codecs.lookup(x)
|
||||
|
@ -21,32 +21,7 @@ def decompress_doc(data):
|
||||
|
||||
|
||||
def compress_doc(data):
|
||||
if not data:
|
||||
return ''
|
||||
return cPalmdoc.compress(data)
|
||||
|
||||
|
||||
def test():
|
||||
TESTS = [
|
||||
'abc\x03\x04\x05\x06ms', # Test binary writing
|
||||
'a b c \xfed ', # Test encoding of spaces
|
||||
'0123456789axyz2bxyz2cdfgfo9iuyerh',
|
||||
'0123456789asd0123456789asd|yyzzxxffhhjjkk',
|
||||
('ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
|
||||
'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
|
||||
]
|
||||
for test in TESTS:
|
||||
print('Test:', repr(test))
|
||||
print('\tTesting compression...')
|
||||
good = py_compress_doc(test)
|
||||
x = compress_doc(test)
|
||||
print('\t\tgood:', repr(good))
|
||||
print('\t\tx :', repr(x))
|
||||
assert x == good
|
||||
print('\tTesting decompression...')
|
||||
print('\t\t', repr(decompress_doc(x)))
|
||||
assert decompress_doc(x) == test
|
||||
print()
|
||||
return cPalmdoc.compress(data) if data else b''
|
||||
|
||||
|
||||
def py_compress_doc(data):
|
||||
@ -55,7 +30,7 @@ def py_compress_doc(data):
|
||||
ldata = len(data)
|
||||
while i < ldata:
|
||||
if i > 10 and (ldata - i) > 10:
|
||||
chunk = ''
|
||||
chunk = b''
|
||||
match = -1
|
||||
for j in range(10, 2, -1):
|
||||
chunk = data[i:i+j]
|
||||
@ -76,14 +51,14 @@ def py_compress_doc(data):
|
||||
ch = data[i]
|
||||
och = ord(ch)
|
||||
i += 1
|
||||
if ch == ' ' and (i + 1) < ldata:
|
||||
if ch == b' ' and (i + 1) < ldata:
|
||||
onch = ord(data[i])
|
||||
if onch >= 0x40 and onch < 0x80:
|
||||
out.write(pack('>B', onch ^ 0x80))
|
||||
i += 1
|
||||
continue
|
||||
if och == 0 or (och > 8 and och < 0x80):
|
||||
out.write(ch.encode('utf-8'))
|
||||
out.write(ch)
|
||||
else:
|
||||
j = i
|
||||
binseq = [ch]
|
||||
@ -95,6 +70,27 @@ def py_compress_doc(data):
|
||||
binseq.append(ch)
|
||||
j += 1
|
||||
out.write(pack('>B', len(binseq)))
|
||||
out.write(''.join(binseq).encode('utf-8'))
|
||||
out.write(b''.join(binseq))
|
||||
i += len(binseq) - 1
|
||||
return out.getvalue()
|
||||
|
||||
|
||||
def find_tests():
    '''
    Build and return the unittest suite for PalmDoc compression.

    The suite contains a single test case which, for each sample byte
    string, checks that:

      * compress_doc() produces the same output as py_compress_doc()
      * decompress_doc() applied to that output gives back the sample

    unittest is imported locally, so importing this module does not pull
    it in until the tests are actually requested.
    '''
    import unittest

    class Test(unittest.TestCase):

        def test_palmdoc_compression(self):
            for test in [
                b'abc\x03\x04\x05\x06ms',  # Test binary writing
                b'a b c \xfed ',  # Test encoding of spaces
                b'0123456789axyz2bxyz2cdfgfo9iuyerh',
                b'0123456789asd0123456789asd|yyzzxxffhhjjkk',
                (b'ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
                 b'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
            ]:
                x = compress_doc(test)
                # Both compressor implementations must agree byte for byte
                self.assertEqual(py_compress_doc(test), x)
                # Compression followed by decompression must round-trip
                self.assertEqual(decompress_doc(x), test)

    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
|
||||
|
@ -33,12 +33,14 @@ class HeuristicProcessor(object):
|
||||
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
|
||||
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
|
||||
self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" # noqa
|
||||
self.line_open = (
|
||||
r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
|
||||
r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
|
||||
self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
|
||||
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
|
||||
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
|
||||
self.common_in_text_endings = u'[\"\'—’”,\\.!\\?\\…\\)„\\w]'
|
||||
self.common_in_text_beginnings = u'[\\w\'\"“‘‛]'
|
||||
self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
|
||||
self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
|
||||
|
||||
def is_pdftohtml(self, src):
|
||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||
@ -215,7 +217,8 @@ class HeuristicProcessor(object):
|
||||
# Build the Regular Expressions in pieces
|
||||
init_lookahead = "(?=<(p|div))"
|
||||
chapter_line_open = self.line_open
|
||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" # noqa
|
||||
title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
|
||||
r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
|
||||
chapter_header_open = r"(?P<chap>"
|
||||
title_header_open = r"(?P<title>"
|
||||
chapter_header_close = ")\\s*"
|
||||
@ -243,7 +246,9 @@ class HeuristicProcessor(object):
|
||||
analysis_result = []
|
||||
|
||||
chapter_types = [
|
||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], # noqa
|
||||
[(
|
||||
r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
|
||||
r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
|
||||
# Highest frequency headings which include titles
|
||||
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
|
||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
|
||||
@ -509,11 +514,14 @@ class HeuristicProcessor(object):
|
||||
|
||||
def detect_whitespace(self, html):
|
||||
blanks_around_headings = re.compile(
|
||||
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
|
||||
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
|
||||
r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||
blanks_around_scene_breaks = re.compile(
|
||||
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
|
||||
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
|
||||
r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||
blanks_n_nopunct = re.compile(
|
||||
r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
|
||||
r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
|
||||
r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||
|
||||
def merge_header_whitespace(match):
|
||||
initblanks = match.group('initparas')
|
||||
@ -826,7 +834,10 @@ class HeuristicProcessor(object):
|
||||
self.log.debug("Looking for more split points based on punctuation,"
|
||||
" currently have " + unicode_type(self.html_preprocess_sections))
|
||||
chapdetect3 = re.compile(
|
||||
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) # noqa
|
||||
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
|
||||
r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
|
||||
r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
|
||||
r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
html = chapdetect3.sub(self.chapter_break, html)
|
||||
|
||||
if getattr(self.extra_opts, 'renumber_headings', False):
|
||||
|
Loading…
x
Reference in New Issue
Block a user