DJVU Input: When extracting embedded text from TXTz sections in DJVU files, fix incorrect extraction of the text when the external djvutxt program is not present. Fixes #1286771 [converting DJVU file containing text fails](https://bugs.launchpad.net/calibre/+bug/1286771)

This commit is contained in:
Kovid Goyal 2014-03-03 11:02:32 +05:30
parent cc26b6e27f
commit 90aa34a473
3 changed files with 27 additions and 54 deletions

View File

@ -15,7 +15,7 @@ import sys
import struct import struct
from cStringIO import StringIO from cStringIO import StringIO
from .djvubzzdec import BZZDecoder from calibre.ebooks.djvu.djvubzzdec import BZZDecoder
class DjvuChunk(object): class DjvuChunk(object):
def __init__(self, buf, start, end, align=True, bigendian=True, def __init__(self, buf, start, end, align=True, bigendian=True,
@ -73,14 +73,16 @@ class DjvuChunk(object):
if not xxres: if not xxres:
break break
res = outbuf.getvalue() res = outbuf.getvalue()
if not res.strip(b'\0'):
raise ValueError('TXTz block is completely null')
l = 0 l = 0
for x in res[:3]: for x in res[:3]:
l <<= 8 l <<= 8
l += ord(x) l += ord(x)
if verbose > 0 and out: if verbose > 0 and out:
print >> out, l print (l, file=out)
txtout.write(res[3:3+l]) txtout.write(res[3:3+l])
txtout.write(b'\n\f') txtout.write(b'\n')
if txtout and self.type == b'TXTa': if txtout and self.type == b'TXTa':
res = self.buf[self.datastart: self.dataend] res = self.buf[self.datastart: self.dataend]
l = 0 l = 0
@ -88,9 +90,9 @@ class DjvuChunk(object):
l <<= 8 l <<= 8
l += ord(x) l += ord(x)
if verbose > 0 and out: if verbose > 0 and out:
print >> out, l print (l, file=out)
txtout.write(res[3:3+l]) txtout.write(res[3:3+l])
txtout.write(b'\n\f') txtout.write(b'\n')
if indent >= maxlevel: if indent >= maxlevel:
return return
for schunk in self._subchunks: for schunk in self._subchunks:
@ -111,36 +113,8 @@ class DJVUFile(object):
self.dc.dump(out=outfile, maxlevel=maxlevel) self.dc.dump(out=outfile, maxlevel=maxlevel)
def main(): def main():
from ruamel.util.program import Program f = DJVUFile(open(sys.argv[-1], 'rb'))
class DJVUDecoder(Program): print (f.get_text(sys.stdout))
def __init__(self):
Program.__init__(self)
def parser_setup(self):
Program.parser_setup(self)
#self._argparser.add_argument('--combine', '-c', action=CountAction, const=1, nargs=0)
#self._argparser.add_argument('--combine', '-c', type=int, default=1)
#self._argparser.add_argument('--segments', '-s', action='append', nargs='+')
#self._argparser.add_argument('--force', '-f', action='store_true')
#self._argparser.add_argument('classname')
self._argparser.add_argument('--text', '-t', action='store_true')
self._argparser.add_argument('--dump', type=int, default=0)
self._argparser.add_argument('file', nargs='+')
def run(self):
if self._args.verbose > 1: # can be negative with --quiet
print (self._args.file)
x = DJVUFile(file(self._args.file[0], 'rb'), verbose=self._args.verbose)
if self._args.text:
print (x.get_text(sys.stdout))
if self._args.dump:
x.dump(sys.stdout, maxlevel=self._args.dump)
return 0
tt = DJVUDecoder()
res = tt.result
if res != 0:
print (res)
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -80,6 +80,7 @@ MAXLEN = 1024 ** 2
# Exception classes used by this module. # Exception classes used by this module.
class BZZDecoderError(Exception): class BZZDecoderError(Exception):
"""This exception is raised when BZZDecode runs into trouble """This exception is raised when BZZDecode runs into trouble
""" """
def __init__(self, msg): def __init__(self, msg):
@ -391,6 +392,7 @@ def chr3(l):
return bytes(bytearray(l)) return bytes(bytearray(l))
class BZZDecoder(): class BZZDecoder():
def __init__(self, infile, outfile): def __init__(self, infile, outfile):
self.instream = infile self.instream = infile
self.outf = outfile self.outf = outfile
@ -450,17 +452,15 @@ class BZZDecoder():
self.xsize -= 1 self.xsize -= 1
# Compute remaining # Compute remaining
bytes = self.xsize remaining = min(sz, self.xsize)
if bytes > sz:
bytes = sz
# Transfer # Transfer
if bytes: if remaining > 0:
for i in range(bytes): raw = bytes(bytearray(self.outbuf[self.bptr:self.bptr + remaining]))
self.outf.write(chr3(self.outbuf[self.bptr + i])) self.outf.write(raw)
self.xsize -= bytes self.xsize -= remaining
self.bptr += bytes self.bptr += remaining
sz -= bytes sz -= remaining
copied += bytes copied += remaining
# offset += bytes; // for tell() # offset += bytes; // for tell()
return copied return copied
@ -468,7 +468,8 @@ class BZZDecoder():
while self.scount <= 24: while self.scount <= 24:
if self.read_byte() < 1: if self.read_byte() < 1:
self.byte = 0xff self.byte = 0xff
if --self.delay < 1: self.delay -= 1
if self.delay < 1:
raise BZZDecoderError("BiteStream EOF") raise BZZDecoderError("BiteStream EOF")
self.bufint = (self.bufint << 8) | self.byte self.bufint = (self.bufint << 8) | self.byte
self.scount += 8 self.scount += 8
@ -524,10 +525,10 @@ class BZZDecoder():
elif self.zpcodec_decode(cx, 2*CTXIDS + 14): elif self.zpcodec_decode(cx, 2*CTXIDS + 14):
mtfno = 16 + self.decode_binary(cx, 2*CTXIDS + 14 + 1, 4) mtfno = 16 + self.decode_binary(cx, 2*CTXIDS + 14 + 1, 4)
outbuf[i] = mtf[mtfno] outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 30 ): elif self.zpcodec_decode(cx, 2*CTXIDS + 30):
mtfno = 32 + self.decode_binary(cx, 2*CTXIDS + 30 + 1, 5) mtfno = 32 + self.decode_binary(cx, 2*CTXIDS + 30 + 1, 5)
outbuf[i] = mtf[mtfno] outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 62 ): elif self.zpcodec_decode(cx, 2*CTXIDS + 62):
mtfno = 64 + self.decode_binary(cx, 2*CTXIDS + 62 + 1, 6) mtfno = 64 + self.decode_binary(cx, 2*CTXIDS + 62 + 1, 6)
outbuf[i] = mtf[mtfno] outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 126): elif self.zpcodec_decode(cx, 2*CTXIDS + 126):
@ -729,9 +730,7 @@ class BZZDecoder():
return (self.ffzt[(x >> 8) & 0xff]) return (self.ffzt[(x >> 8) & 0xff])
# for testing
### for testing
def main(): def main():
import sys import sys
infile = file(sys.argv[1], "rb") infile = file(sys.argv[1], "rb")