From 90aa34a473a551742ff516f953eca0c1884d4e42 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 3 Mar 2014 11:02:32 +0530 Subject: [PATCH] DJVU Input: When extracting embedded txt from TXTz sections in DJVU files, fix incorrect extraction of txt when the djvutxt external program is not present. Fixes #1286771 [converting DJVU file containing text fails](https://bugs.launchpad.net/calibre/+bug/1286771) --- .../ebooks/conversion/plugins/djvu_input.py | 2 +- src/calibre/ebooks/djvu/djvu.py | 44 ++++--------------- src/calibre/ebooks/djvu/djvubzzdec.py | 35 +++++++-------- 3 files changed, 27 insertions(+), 54 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/djvu_input.py b/src/calibre/ebooks/conversion/plugins/djvu_input.py index a5aa258d87..783931f4f3 100644 --- a/src/calibre/ebooks/conversion/plugins/djvu_input.py +++ b/src/calibre/ebooks/conversion/plugins/djvu_input.py @@ -44,7 +44,7 @@ class DJVUInput(InputFormatPlugin): os.remove(filename) ppdjvu = False except: - stream.seek(0) # retry with the pure python converter + stream.seek(0) # retry with the pure python converter if ppdjvu: from calibre.ebooks.djvu.djvu import DJVUFile x = DJVUFile(stream) diff --git a/src/calibre/ebooks/djvu/djvu.py b/src/calibre/ebooks/djvu/djvu.py index ca71e97220..7eebbca52b 100644 --- a/src/calibre/ebooks/djvu/djvu.py +++ b/src/calibre/ebooks/djvu/djvu.py @@ -15,7 +15,7 @@ import sys import struct from cStringIO import StringIO -from .djvubzzdec import BZZDecoder +from calibre.ebooks.djvu.djvubzzdec import BZZDecoder class DjvuChunk(object): def __init__(self, buf, start, end, align=True, bigendian=True, @@ -73,14 +73,16 @@ class DjvuChunk(object): if not xxres: break res = outbuf.getvalue() + if not res.strip(b'\0'): + raise ValueError('TXTz block is completely null') l = 0 for x in res[:3]: l <<= 8 l += ord(x) if verbose > 0 and out: - print >> out, l + print (l, file=out) txtout.write(res[3:3+l]) - txtout.write(b'\n\f') + txtout.write(b'\n') if txtout and self.type == b'TXTa': res = self.buf[self.datastart: self.dataend] l = 0 @@ -88,9 +90,9 @@ class DjvuChunk(object): l <<= 8 l += ord(x) if verbose > 0 and out: - print >> out, l + print (l, file=out) txtout.write(res[3:3+l]) - txtout.write(b'\n\f') + txtout.write(b'\n') if indent >= maxlevel: return for schunk in self._subchunks: @@ -111,36 +113,8 @@ class DJVUFile(object): self.dc.dump(out=outfile, maxlevel=maxlevel) def main(): - from ruamel.util.program import Program - class DJVUDecoder(Program): - def __init__(self): - Program.__init__(self) - - def parser_setup(self): - Program.parser_setup(self) - #self._argparser.add_argument('--combine', '-c', action=CountAction, const=1, nargs=0) - #self._argparser.add_argument('--combine', '-c', type=int, default=1) - #self._argparser.add_argument('--segments', '-s', action='append', nargs='+') - #self._argparser.add_argument('--force', '-f', action='store_true') - #self._argparser.add_argument('classname') - self._argparser.add_argument('--text', '-t', action='store_true') - self._argparser.add_argument('--dump', type=int, default=0) - self._argparser.add_argument('file', nargs='+') - - def run(self): - if self._args.verbose > 1: # can be negative with --quiet - print (self._args.file) - x = DJVUFile(file(self._args.file[0], 'rb'), verbose=self._args.verbose) - if self._args.text: - print (x.get_text(sys.stdout)) - if self._args.dump: - x.dump(sys.stdout, maxlevel=self._args.dump) - return 0 - - tt = DJVUDecoder() - res = tt.result - if res != 0: - print (res) + f = DJVUFile(open(sys.argv[-1], 'rb')) + print (f.get_text(sys.stdout)) if __name__ == '__main__': main() diff --git a/src/calibre/ebooks/djvu/djvubzzdec.py b/src/calibre/ebooks/djvu/djvubzzdec.py index 3eb8baa9a4..6e8ebe54ae 100644 --- a/src/calibre/ebooks/djvu/djvubzzdec.py +++ b/src/calibre/ebooks/djvu/djvubzzdec.py @@ -80,6 +80,7 @@ MAXLEN = 1024 ** 2 # Exception classes used by this module. class BZZDecoderError(Exception): + """This exception is raised when BZZDecode runs into trouble """ def __init__(self, msg): @@ -91,7 +92,7 @@ class BZZDecoderError(Exception): # This table has been designed for the ZPCoder # * by running the following command in file 'zptable.sn': # * (fast-crude (steady-mat 0.0035 0.0002) 260))) -default_ztable = [ # {{{ +default_ztable = [ # {{{ (0x8000, 0x0000, 84, 145), # 000: p=0.500000 ( 0, 0) (0x8000, 0x0000, 3, 4), # 001: p=0.500000 ( 0, 0) (0x8000, 0x0000, 4, 3), # 002: p=0.500000 ( 0, 0) @@ -391,6 +392,7 @@ def chr3(l): return bytes(bytearray(l)) class BZZDecoder(): + def __init__(self, infile, outfile): self.instream = infile self.outf = outfile @@ -450,17 +452,15 @@ class BZZDecoder(): self.xsize -= 1 # Compute remaining - bytes = self.xsize - if bytes > sz: - bytes = sz + remaining = min(sz, self.xsize) # Transfer - if bytes: - for i in range(bytes): - self.outf.write(chr3(self.outbuf[self.bptr + i])) - self.xsize -= bytes - self.bptr += bytes - sz -= bytes - copied += bytes + if remaining > 0: + raw = bytes(bytearray(self.outbuf[self.bptr:self.bptr + remaining])) + self.outf.write(raw) + self.xsize -= remaining + self.bptr += remaining + sz -= remaining + copied += remaining # offset += bytes; // for tell() return copied @@ -468,7 +468,8 @@ class BZZDecoder(): while self.scount <= 24: if self.read_byte() < 1: self.byte = 0xff - if --self.delay < 1: + self.delay -= 1 + if self.delay < 1: raise BZZDecoderError("BiteStream EOF") self.bufint = (self.bufint << 8) | self.byte self.scount += 8 @@ -495,7 +496,7 @@ class BZZDecoder(): if self.zpcodec_decoder(): fshift += 1 # Prepare Quasi MTF - mtf = list(xmtf) # unsigned chars + mtf = list(xmtf) # unsigned chars freq = [0] * FREQMAX fadd = 4 # Decode @@ -524,10 +525,10 @@ class BZZDecoder(): elif self.zpcodec_decode(cx, 2*CTXIDS + 14): mtfno = 16 + self.decode_binary(cx, 2*CTXIDS + 14 + 1, 4) outbuf[i] = mtf[mtfno] - elif self.zpcodec_decode(cx, 2*CTXIDS + 30 ): + elif self.zpcodec_decode(cx, 2*CTXIDS + 30): mtfno = 32 + self.decode_binary(cx, 2*CTXIDS + 30 + 1, 5) outbuf[i] = mtf[mtfno] - elif self.zpcodec_decode(cx, 2*CTXIDS + 62 ): + elif self.zpcodec_decode(cx, 2*CTXIDS + 62): mtfno = 64 + self.decode_binary(cx, 2*CTXIDS + 62 + 1, 6) outbuf[i] = mtf[mtfno] elif self.zpcodec_decode(cx, 2*CTXIDS + 126): @@ -729,9 +730,7 @@ class BZZDecoder(): return (self.ffzt[(x >> 8) & 0xff]) - -### for testing - +# for testing def main(): import sys infile = file(sys.argv[1], "rb")