DJVU Input: When extracting embedded text from TXTz sections in DJVU files, fix incorrect extraction of the text when the external djvutxt program is not present. Fixes #1286771 [converting DJVU file containing text fails](https://bugs.launchpad.net/calibre/+bug/1286771)

2025-07-09 03:04:10 -04:00 · 2014-03-03 11:02:32 +05:30 · 2014-03-03 11:02:32 +05:30 · 90aa34a473
commit 90aa34a473
parent cc26b6e27f
3 changed files with 27 additions and 54 deletions
--- a/src/calibre/ebooks/djvu/djvu.py
+++ b/src/calibre/ebooks/djvu/djvu.py
@ -15,7 +15,7 @@ import sys
 import struct
 from cStringIO import StringIO
-from .djvubzzdec import BZZDecoder
+from calibre.ebooks.djvu.djvubzzdec import BZZDecoder
 class DjvuChunk(object):
    def __init__(self, buf, start, end, align=True, bigendian=True,
@ -73,14 +73,16 @@ class DjvuChunk(object):
                if not xxres:
                    break
            res = outbuf.getvalue()
            if not res.strip(b'\0'):
                raise ValueError('TXTz block is completely null')
            l = 0
            for x in res[:3]:
                l <<= 8
                l += ord(x)
            if verbose > 0 and out:
-                print >> out, l
+                print (l, file=out)
            txtout.write(res[3:3+l])
-            txtout.write(b'\n\f')
+            txtout.write(b'\n')
        if txtout and self.type == b'TXTa':
            res = self.buf[self.datastart: self.dataend]
            l = 0
@ -88,9 +90,9 @@ class DjvuChunk(object):
                l <<= 8
                l += ord(x)
            if verbose > 0 and out:
-                print >> out, l
+                print (l, file=out)
            txtout.write(res[3:3+l])
-            txtout.write(b'\n\f')
+            txtout.write(b'\n')
        if indent >= maxlevel:
            return
        for schunk in self._subchunks:
@ -111,36 +113,8 @@ class DJVUFile(object):
        self.dc.dump(out=outfile, maxlevel=maxlevel)
 def main():
-    from ruamel.util.program import Program
+    f = DJVUFile(open(sys.argv[-1], 'rb'))
-    class DJVUDecoder(Program):
+    print (f.get_text(sys.stdout))
        def __init__(self):
            Program.__init__(self)
        def parser_setup(self):
            Program.parser_setup(self)
            #self._argparser.add_argument('--combine', '-c', action=CountAction, const=1, nargs=0)
            #self._argparser.add_argument('--combine', '-c', type=int, default=1)
            #self._argparser.add_argument('--segments', '-s', action='append', nargs='+')
            #self._argparser.add_argument('--force', '-f', action='store_true')
            #self._argparser.add_argument('classname')
            self._argparser.add_argument('--text', '-t', action='store_true')
            self._argparser.add_argument('--dump', type=int, default=0)
            self._argparser.add_argument('file', nargs='+')
        def run(self):
            if self._args.verbose > 1: # can be negative with --quiet
                print (self._args.file)
            x = DJVUFile(file(self._args.file[0], 'rb'), verbose=self._args.verbose)
            if self._args.text:
                print (x.get_text(sys.stdout))
            if self._args.dump:
                x.dump(sys.stdout, maxlevel=self._args.dump)
            return 0
    tt = DJVUDecoder()
    res = tt.result
    if res != 0:
        print (res)
 if __name__ == '__main__':
    main()
--- a/src/calibre/ebooks/djvu/djvubzzdec.py
+++ b/src/calibre/ebooks/djvu/djvubzzdec.py
@ -80,6 +80,7 @@ MAXLEN = 1024 ** 2
 # Exception classes used by this module.
 class BZZDecoderError(Exception):
    """This exception is raised when BZZDecode runs into trouble
    """
    def __init__(self, msg):
@ -391,6 +392,7 @@ def chr3(l):
    return bytes(bytearray(l))
 class BZZDecoder():
    def __init__(self, infile, outfile):
        self.instream = infile
        self.outf = outfile
@ -450,17 +452,15 @@ class BZZDecoder():
                self.xsize -= 1
            # Compute remaining
-            bytes = self.xsize
+            remaining = min(sz, self.xsize)
            if bytes > sz:
                bytes = sz
            # Transfer
-            if bytes:
+            if remaining > 0:
-                for i in range(bytes):
+                raw = bytes(bytearray(self.outbuf[self.bptr:self.bptr + remaining]))
-                    self.outf.write(chr3(self.outbuf[self.bptr + i]))
+                self.outf.write(raw)
-            self.xsize -= bytes
+            self.xsize -= remaining
-            self.bptr += bytes
+            self.bptr += remaining
-            sz -= bytes
+            sz -= remaining
-            copied += bytes
+            copied += remaining
            # offset += bytes; // for tell()
        return copied
@ -468,7 +468,8 @@ class BZZDecoder():
        while self.scount <= 24:
            if self.read_byte() < 1:
                self.byte = 0xff
-                if --self.delay < 1:
+                self.delay -= 1
                if self.delay < 1:
                    raise BZZDecoderError("BiteStream EOF")
            self.bufint = (self.bufint << 8) | self.byte
            self.scount += 8
@ -524,10 +525,10 @@ class BZZDecoder():
            elif self.zpcodec_decode(cx, 2*CTXIDS + 14):
                mtfno = 16 + self.decode_binary(cx, 2*CTXIDS + 14 + 1, 4)
                outbuf[i] = mtf[mtfno]
-            elif self.zpcodec_decode(cx, 2*CTXIDS + 30 ):
+            elif self.zpcodec_decode(cx, 2*CTXIDS + 30):
                mtfno = 32 + self.decode_binary(cx, 2*CTXIDS + 30 + 1, 5)
                outbuf[i] = mtf[mtfno]
-            elif self.zpcodec_decode(cx, 2*CTXIDS + 62 ):
+            elif self.zpcodec_decode(cx, 2*CTXIDS + 62):
                mtfno = 64 + self.decode_binary(cx, 2*CTXIDS + 62 + 1, 6)
                outbuf[i] = mtf[mtfno]
            elif self.zpcodec_decode(cx, 2*CTXIDS + 126):
@ -729,9 +730,7 @@ class BZZDecoder():
            return (self.ffzt[(x >> 8) & 0xff])
-
+# for testing
 ### for testing
 def main():
    import sys
    infile = file(sys.argv[1], "rb")