mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fixes for f2e0181926
This commit is contained in:
parent
fe5aac9d97
commit
b842fe758a
@ -125,6 +125,9 @@ def find_tests(which_tests=None):
|
|||||||
a(find_tests())
|
a(find_tests())
|
||||||
from calibre.library.comments import find_tests
|
from calibre.library.comments import find_tests
|
||||||
a(find_tests())
|
a(find_tests())
|
||||||
|
from calibre.ebooks.compression.palmdoc import find_tests
|
||||||
|
a(find_tests())
|
||||||
|
|
||||||
a(unittest.defaultTestLoader.loadTestsFromTestCase(TestImports))
|
a(unittest.defaultTestLoader.loadTestsFromTestCase(TestImports))
|
||||||
if ok('dbcli'):
|
if ok('dbcli'):
|
||||||
from calibre.db.cli.tests import find_tests
|
from calibre.db.cli.tests import find_tests
|
||||||
|
@ -106,7 +106,7 @@ class CHMReader(CHMFile):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
|
def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
|
||||||
html_files = set([])
|
html_files = set()
|
||||||
try:
|
try:
|
||||||
x = self.get_encoding()
|
x = self.get_encoding()
|
||||||
codecs.lookup(x)
|
codecs.lookup(x)
|
||||||
|
@ -21,32 +21,7 @@ def decompress_doc(data):
|
|||||||
|
|
||||||
|
|
||||||
def compress_doc(data):
|
def compress_doc(data):
|
||||||
if not data:
|
return cPalmdoc.compress(data) if data else b''
|
||||||
return ''
|
|
||||||
return cPalmdoc.compress(data)
|
|
||||||
|
|
||||||
|
|
||||||
def test():
|
|
||||||
TESTS = [
|
|
||||||
'abc\x03\x04\x05\x06ms', # Test binary writing
|
|
||||||
'a b c \xfed ', # Test encoding of spaces
|
|
||||||
'0123456789axyz2bxyz2cdfgfo9iuyerh',
|
|
||||||
'0123456789asd0123456789asd|yyzzxxffhhjjkk',
|
|
||||||
('ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
|
|
||||||
'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
|
|
||||||
]
|
|
||||||
for test in TESTS:
|
|
||||||
print('Test:', repr(test))
|
|
||||||
print('\tTesting compression...')
|
|
||||||
good = py_compress_doc(test)
|
|
||||||
x = compress_doc(test)
|
|
||||||
print('\t\tgood:', repr(good))
|
|
||||||
print('\t\tx :', repr(x))
|
|
||||||
assert x == good
|
|
||||||
print('\tTesting decompression...')
|
|
||||||
print('\t\t', repr(decompress_doc(x)))
|
|
||||||
assert decompress_doc(x) == test
|
|
||||||
print()
|
|
||||||
|
|
||||||
|
|
||||||
def py_compress_doc(data):
|
def py_compress_doc(data):
|
||||||
@ -55,7 +30,7 @@ def py_compress_doc(data):
|
|||||||
ldata = len(data)
|
ldata = len(data)
|
||||||
while i < ldata:
|
while i < ldata:
|
||||||
if i > 10 and (ldata - i) > 10:
|
if i > 10 and (ldata - i) > 10:
|
||||||
chunk = ''
|
chunk = b''
|
||||||
match = -1
|
match = -1
|
||||||
for j in range(10, 2, -1):
|
for j in range(10, 2, -1):
|
||||||
chunk = data[i:i+j]
|
chunk = data[i:i+j]
|
||||||
@ -76,14 +51,14 @@ def py_compress_doc(data):
|
|||||||
ch = data[i]
|
ch = data[i]
|
||||||
och = ord(ch)
|
och = ord(ch)
|
||||||
i += 1
|
i += 1
|
||||||
if ch == ' ' and (i + 1) < ldata:
|
if ch == b' ' and (i + 1) < ldata:
|
||||||
onch = ord(data[i])
|
onch = ord(data[i])
|
||||||
if onch >= 0x40 and onch < 0x80:
|
if onch >= 0x40 and onch < 0x80:
|
||||||
out.write(pack('>B', onch ^ 0x80))
|
out.write(pack('>B', onch ^ 0x80))
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
if och == 0 or (och > 8 and och < 0x80):
|
if och == 0 or (och > 8 and och < 0x80):
|
||||||
out.write(ch.encode('utf-8'))
|
out.write(ch)
|
||||||
else:
|
else:
|
||||||
j = i
|
j = i
|
||||||
binseq = [ch]
|
binseq = [ch]
|
||||||
@ -95,6 +70,27 @@ def py_compress_doc(data):
|
|||||||
binseq.append(ch)
|
binseq.append(ch)
|
||||||
j += 1
|
j += 1
|
||||||
out.write(pack('>B', len(binseq)))
|
out.write(pack('>B', len(binseq)))
|
||||||
out.write(''.join(binseq).encode('utf-8'))
|
out.write(b''.join(binseq))
|
||||||
i += len(binseq) - 1
|
i += len(binseq) - 1
|
||||||
return out.getvalue()
|
return out.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def find_tests():
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
class Test(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_palmdoc_compression(self):
|
||||||
|
for test in [
|
||||||
|
b'abc\x03\x04\x05\x06ms', # Test binary writing
|
||||||
|
b'a b c \xfed ', # Test encoding of spaces
|
||||||
|
b'0123456789axyz2bxyz2cdfgfo9iuyerh',
|
||||||
|
b'0123456789asd0123456789asd|yyzzxxffhhjjkk',
|
||||||
|
(b'ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
|
||||||
|
b'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
|
||||||
|
]:
|
||||||
|
x = compress_doc(test)
|
||||||
|
self.assertEqual(py_compress_doc(test), x)
|
||||||
|
self.assertEqual(decompress_doc(x), test)
|
||||||
|
|
||||||
|
return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
|
||||||
|
@ -33,12 +33,14 @@ class HeuristicProcessor(object):
|
|||||||
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
|
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
|
||||||
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
|
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
|
||||||
self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" # noqa
|
self.line_open = (
|
||||||
|
r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
|
||||||
|
r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
|
||||||
self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
|
self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
|
||||||
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
|
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
|
||||||
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
|
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
|
||||||
self.common_in_text_endings = u'[\"\'—’”,\\.!\\?\\…\\)„\\w]'
|
self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
|
||||||
self.common_in_text_beginnings = u'[\\w\'\"“‘‛]'
|
self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
|
||||||
|
|
||||||
def is_pdftohtml(self, src):
|
def is_pdftohtml(self, src):
|
||||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||||
@ -215,7 +217,8 @@ class HeuristicProcessor(object):
|
|||||||
# Build the Regular Expressions in pieces
|
# Build the Regular Expressions in pieces
|
||||||
init_lookahead = "(?=<(p|div))"
|
init_lookahead = "(?=<(p|div))"
|
||||||
chapter_line_open = self.line_open
|
chapter_line_open = self.line_open
|
||||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" # noqa
|
title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
|
||||||
|
r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
|
||||||
chapter_header_open = r"(?P<chap>"
|
chapter_header_open = r"(?P<chap>"
|
||||||
title_header_open = r"(?P<title>"
|
title_header_open = r"(?P<title>"
|
||||||
chapter_header_close = ")\\s*"
|
chapter_header_close = ")\\s*"
|
||||||
@ -243,7 +246,9 @@ class HeuristicProcessor(object):
|
|||||||
analysis_result = []
|
analysis_result = []
|
||||||
|
|
||||||
chapter_types = [
|
chapter_types = [
|
||||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], # noqa
|
[(
|
||||||
|
r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
|
||||||
|
r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
|
||||||
# Highest frequency headings which include titles
|
# Highest frequency headings which include titles
|
||||||
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
|
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
|
||||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
|
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
|
||||||
@ -509,11 +514,14 @@ class HeuristicProcessor(object):
|
|||||||
|
|
||||||
def detect_whitespace(self, html):
|
def detect_whitespace(self, html):
|
||||||
blanks_around_headings = re.compile(
|
blanks_around_headings = re.compile(
|
||||||
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
|
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
|
||||||
|
r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||||
blanks_around_scene_breaks = re.compile(
|
blanks_around_scene_breaks = re.compile(
|
||||||
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
|
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
|
||||||
|
r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||||
blanks_n_nopunct = re.compile(
|
blanks_n_nopunct = re.compile(
|
||||||
r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
|
r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
|
||||||
|
r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||||
|
|
||||||
def merge_header_whitespace(match):
|
def merge_header_whitespace(match):
|
||||||
initblanks = match.group('initparas')
|
initblanks = match.group('initparas')
|
||||||
@ -826,7 +834,10 @@ class HeuristicProcessor(object):
|
|||||||
self.log.debug("Looking for more split points based on punctuation,"
|
self.log.debug("Looking for more split points based on punctuation,"
|
||||||
" currently have " + unicode_type(self.html_preprocess_sections))
|
" currently have " + unicode_type(self.html_preprocess_sections))
|
||||||
chapdetect3 = re.compile(
|
chapdetect3 = re.compile(
|
||||||
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) # noqa
|
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
|
||||||
|
r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
|
||||||
|
r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
|
||||||
|
r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||||
html = chapdetect3.sub(self.chapter_break, html)
|
html = chapdetect3.sub(self.chapter_break, html)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'renumber_headings', False):
|
if getattr(self.extra_opts, 'renumber_headings', False):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user