Input plugin for OCR-ed DJVU files (i.e. .djvu files that contain text. The text is converted.

This commit is contained in:
Kovid Goyal 2011-10-12 08:12:10 +05:30
commit 65d42ab1dc
10 changed files with 1070 additions and 14 deletions

View File

@ -4,9 +4,7 @@ __copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
www.defensenews.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class DefenseNews(BasicNewsRecipe):
title = 'Defense News'
@ -42,7 +40,7 @@ class DefenseNews(BasicNewsRecipe):
]
remove_tags_before = attrs={'class':'storyWrp'}
remove_tags_after = attrs={'class':'middle'}
remove_attributes=['lang']
feeds = [

View File

@ -336,7 +336,7 @@ class Build(Command):
oinc = ['/Fo'+obj] if iswindows else ['-o', obj]
cmd = [compiler] + cflags + ext.cflags + einc + sinc + oinc
self.info(' '.join(cmd))
subprocess.check_call(cmd)
self.check_call(cmd)
dest = self.dest(ext)
elib = self.lib_dirs_to_ldflags(ext.lib_dirs)
@ -350,18 +350,32 @@ class Build(Command):
else:
cmd += objects + ext.extra_objs + ['-o', dest] + ldflags + ext.ldflags + elib + xlib
self.info('\n\n', ' '.join(cmd), '\n\n')
subprocess.check_call(cmd)
self.check_call(cmd)
if iswindows:
#manifest = dest+'.manifest'
#cmd = [MT, '-manifest', manifest, '-outputresource:%s;2'%dest]
#self.info(*cmd)
#subprocess.check_call(cmd)
#self.check_call(cmd)
#os.remove(manifest)
for x in ('.exp', '.lib'):
x = os.path.splitext(dest)[0]+x
if os.path.exists(x):
os.remove(x)
def check_call(self, *args, **kwargs):
"""print cmdline if an error occured
If something is missing (qmake e.g.) you get a non-informative error
self.check_call(qmc + [ext.name+'.pro'])
so you would have to look a the source to see the actual command.
"""
try:
subprocess.check_call(*args, **kwargs)
except:
cmdline = ' '.join(['"%s"' % (arg) if ' ' in arg else arg for arg in args[0]])
print "Error while executing: %s\n" % (cmdline)
raise
def build_qt_objects(self, ext):
obj_pat = 'release\\*.obj' if iswindows else '*.o'
objects = glob.glob(obj_pat)
@ -380,8 +394,8 @@ class Build(Command):
qmc = [QMAKE, '-o', 'Makefile']
if iswindows:
qmc += ['-spec', 'win32-msvc2008']
subprocess.check_call(qmc + [ext.name+'.pro'])
subprocess.check_call([make, '-f', 'Makefile'])
self.check_call(qmc + [ext.name+'.pro'])
self.check_call([make, '-f', 'Makefile'])
objects = glob.glob(obj_pat)
return list(map(self.a, objects))
@ -407,7 +421,7 @@ class Build(Command):
cmd = [pyqt.sip_bin+exe, '-w', '-c', src_dir, '-b', sbf, '-I'+\
pyqt.pyqt_sip_dir] + shlex.split(pyqt.pyqt_sip_flags) + [sipf]
self.info(' '.join(cmd))
subprocess.check_call(cmd)
self.check_call(cmd)
module = self.j(src_dir, self.b(dest))
if self.newer(dest, [sbf]+qt_objects):
mf = self.j(src_dir, 'Makefile')
@ -417,7 +431,7 @@ class Build(Command):
makefile.extra_include_dirs = ext.inc_dirs
makefile.generate()
subprocess.check_call([make, '-f', mf], cwd=src_dir)
self.check_call([make, '-f', mf], cwd=src_dir)
shutil.copy2(module, dest)
def clean(self):
@ -457,7 +471,7 @@ class BuildPDF2XML(Command):
cmd += ['-I'+x for x in poppler_inc_dirs+magick_inc_dirs]
cmd += ['/Fo'+obj, src]
self.info(*cmd)
subprocess.check_call(cmd)
self.check_call(cmd)
objects.append(obj)
if self.newer(dest, objects):
@ -470,7 +484,7 @@ class BuildPDF2XML(Command):
png_libs+magick_libs+poppler_libs+ft_libs+jpg_libs+pdfreflow_libs]
cmd += ['/OUT:'+dest] + objects
self.info(*cmd)
subprocess.check_call(cmd)
self.check_call(cmd)
self.info('Binary installed as', dest)

View File

@ -502,6 +502,7 @@ class TXTZMetadataWriter(MetadataWriterPlugin):
# }}}
from calibre.ebooks.comic.input import ComicInput
from calibre.ebooks.djvu.input import DJVUInput
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.fb2.input import FB2Input
from calibre.ebooks.html.input import HTMLInput
@ -600,6 +601,7 @@ plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
plugins += [
ComicInput,
DJVUInput,
EPUBInput,
FB2Input,
HTMLInput,

View File

@ -4,7 +4,6 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys
from itertools import izip
from xml.sax.saxutils import escape
@ -742,7 +741,7 @@ class PocketBook900Output(OutputProfile):
screen_size = (810, 1180)
dpi = 150.0
comic_screen_size = screen_size
output_profiles = [OutputProfile, SonyReaderOutput, SonyReader300Output,
SonyReader900Output, MSReaderOutput, MobipocketOutput, HanlinV3Output,
HanlinV5Output, CybookG3Output, CybookOpusOutput, KindleOutput,

View File

@ -0,0 +1,12 @@
#!/usr/bin/env python
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
__docformat__ = 'restructuredtext en'
'''
Used for DJVU input
'''

View File

@ -0,0 +1,146 @@
#! /usr/bin/env python
# coding: utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Anthon van der Neut <A.van.der.Neut@ruamel.eu>'
# this code is based on:
# Lizardtech DjVu Reference
# DjVu v3
# November 2005
import sys
import struct
from cStringIO import StringIO
from .djvubzzdec import BZZDecoder
class DjvuChunk(object):
def __init__(self, buf, start, end, align=True, bigendian=True,
inclheader=False, verbose=0):
self.subtype = None
self._subchunks = []
self.buf = buf
pos = start + 4
self.type = buf[start:pos]
self.align = align # whether to align to word (2-byte) boundaries
self.headersize = 0 if inclheader else 8
if bigendian:
self.strflag = b'>'
else:
self.strflag = b'<'
oldpos, pos = pos, pos+4
self.size = struct.unpack(self.strflag+b'L', buf[oldpos:pos])[0]
self.dataend = pos + self.size - (8 if inclheader else 0)
if self.type == b'FORM':
oldpos, pos = pos, pos+4
#print oldpos, pos
self.subtype = buf[oldpos:pos]
#self.headersize += 4
self.datastart = pos
if verbose > 0:
print ('found', self.type, self.subtype, pos, self.size)
if self.type in b'FORM'.split():
if verbose > 0:
print ('processing substuff %d %d (%x)' % (pos, self.dataend,
self.dataend))
numchunks = 0
while pos < self.dataend:
x = DjvuChunk(buf, pos, start+self.size, verbose=verbose)
numchunks += 1
self._subchunks.append(x)
newpos = pos + x.size + x.headersize + (1 if (x.size % 2) else 0)
if verbose > 0:
print ('newpos %d %d (%x, %x) %d' % (newpos, self.dataend,
newpos, self.dataend, x.headersize))
pos = newpos
if verbose > 0:
print (' end of chunk %d (%x)' % (pos, pos))
def dump(self, verbose=0, indent=1, out=None, txtout=None, maxlevel=100):
if out:
out.write(b' ' * indent)
out.write(b'%s%s [%d]\n' % (self.type,
b':' + self.subtype if self.subtype else b'', self.size))
if txtout and self.type == b'TXTz':
inbuf = StringIO(self.buf[self.datastart: self.dataend])
outbuf = StringIO()
decoder = BZZDecoder(inbuf, outbuf)
while True:
xxres = decoder.convert(1024 * 1024)
if not xxres:
break
res = outbuf.getvalue()
l = 0
for x in res[:3]:
l <<= 8
l += ord(x)
if verbose > 0 and out:
print >> out, l
txtout.write(res[3:3+l])
txtout.write(b'\n\f')
if txtout and self.type == b'TXTa':
res = self.buf[self.datastart: self.dataend]
l = 0
for x in res[:3]:
l <<= 8
l += ord(x)
if verbose > 0 and out:
print >> out, l
txtout.write(res[3:3+l])
txtout.write(b'\n\f')
if indent >= maxlevel:
return
for schunk in self._subchunks:
schunk.dump(verbose=verbose, indent=indent+1, out=out, txtout=txtout)
class DJVUFile(object):
def __init__(self, instream, verbose=0):
self.instream = instream
buf = self.instream.read(4)
assert(buf == b'AT&T')
buf = self.instream.read()
self.dc = DjvuChunk(buf, 0, len(buf), verbose=verbose)
def get_text(self, outfile=None):
self.dc.dump(txtout=outfile)
def dump(self, outfile=None, maxlevel=0):
self.dc.dump(out=outfile, maxlevel=maxlevel)
def main():
from ruamel.util.program import Program
class DJVUDecoder(Program):
def __init__(self):
Program.__init__(self)
def parser_setup(self):
Program.parser_setup(self)
#self._argparser.add_argument('--combine', '-c', action=CountAction, const=1, nargs=0)
#self._argparser.add_argument('--combine', '-c', type=int, default=1)
#self._argparser.add_argument('--segments', '-s', action='append', nargs='+')
#self._argparser.add_argument('--force', '-f', action='store_true')
#self._argparser.add_argument('classname')
self._argparser.add_argument('--text', '-t', action='store_true')
self._argparser.add_argument('--dump', type=int, default=0)
self._argparser.add_argument('file', nargs='+')
def run(self):
if self._args.verbose > 1: # can be negative with --quiet
print (self._args.file)
x = DJVUFile(file(self._args.file[0], 'rb'), verbose=self._args.verbose)
if self._args.text:
print (x.get_text(sys.stdout))
if self._args.dump:
x.dump(sys.stdout, maxlevel=self._args.dump)
return 0
tt = DJVUDecoder()
res = tt.result
if res != 0:
print (res)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,746 @@
#! /usr/bin/env python
# coding: utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Anthon van der Neut <A.van.der.Neut@ruamel.eu>'
#__docformat__ = 'restructuredtext en'
# Copyright (C) 2011 Anthon van der Neut, Ruamel bvba
# Adapted from Leon Bottou's djvulibre C++ code,
# ( ZPCodec.{cpp,h} and BSByteStream.{cpp,h} )
# that code was first converted to C removing any dependencies on the DJVU libre
# framework for ByteStream, making it into a ctypes callable shared object
# then to python, and remade into a class
original_copyright_notice = '''
//C- -------------------------------------------------------------------
//C- DjVuLibre-3.5
//C- Copyright (c) 2002 Leon Bottou and Yann Le Cun.
//C- Copyright (c) 2001 AT&T
//C-
//C- This software is subject to, and may be distributed under, the
//C- GNU General Public License, either Version 2 of the license,
//C- or (at your option) any later version. The license should have
//C- accompanied the software or you may obtain a copy of the license
//C- from the Free Software Foundation at http://www.fsf.org .
//C-
//C- This program is distributed in the hope that it will be useful,
//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//C- GNU General Public License for more details.
//C-
//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
//C- Lizardtech Software. Lizardtech Software has authorized us to
//C- replace the original DjVu(r) Reference Library notice by the following
//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
//C-
//C- ------------------------------------------------------------------
//C- | DjVu (r) Reference Library (v. 3.5)
//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
//C- | The DjVu Reference Library is protected by U.S. Pat. No.
//C- | 6,058,214 and patents pending.
//C- |
//C- | This software is subject to, and may be distributed under, the
//C- | GNU General Public License, either Version 2 of the license,
//C- | or (at your option) any later version. The license should have
//C- | accompanied the software or you may obtain a copy of the license
//C- | from the Free Software Foundation at http://www.fsf.org .
//C- |
//C- | The computer code originally released by LizardTech under this
//C- | license and unmodified by other parties is deemed "the LIZARDTECH
//C- | ORIGINAL CODE." Subject to any third party intellectual property
//C- | claims, LizardTech grants recipient a worldwide, royalty-free,
//C- | non-exclusive license to make, use, sell, or otherwise dispose of
//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
//C- | General Public License. This grant only confers the right to
//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
//C- | the extent such infringement is reasonably necessary to enable
//C- | recipient to make, have made, practice, sell, or otherwise dispose
//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
//C- | any greater extent that may be necessary to utilize further
//C- | modifications or combinations.
//C- |
//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
//C- +------------------------------------------------------------------
//
// $Id: BSByteStream.cpp,v 1.9 2007/03/25 20:48:29 leonb Exp $
// $Name: release_3_5_23 $
'''
MAXBLOCK = 4096
FREQMAX = 4
CTXIDS = 3
MAXLEN = 1024 ** 2
# Exception classes used by this module.
class BZZDecoderError(Exception):
"""This exception is raised when BZZDecode runs into trouble
"""
def __init__(self, msg):
self.msg = msg
def __str__(self):
return "BZZDecoderError: %s" % (self.msg)
# This table has been designed for the ZPCoder
# * by running the following command in file 'zptable.sn':
# * (fast-crude (steady-mat 0.0035 0.0002) 260)))
default_ztable = [ # {{{
(0x8000, 0x0000, 84, 145), # 000: p=0.500000 ( 0, 0)
(0x8000, 0x0000, 3, 4), # 001: p=0.500000 ( 0, 0)
(0x8000, 0x0000, 4, 3), # 002: p=0.500000 ( 0, 0)
(0x6bbd, 0x10a5, 5, 1), # 003: p=0.465226 ( 0, 0)
(0x6bbd, 0x10a5, 6, 2), # 004: p=0.465226 ( 0, 0)
(0x5d45, 0x1f28, 7, 3), # 005: p=0.430708 ( 0, 0)
(0x5d45, 0x1f28, 8, 4), # 006: p=0.430708 ( 0, 0)
(0x51b9, 0x2bd3, 9, 5), # 007: p=0.396718 ( 0, 0)
(0x51b9, 0x2bd3, 10, 6), # 008: p=0.396718 ( 0, 0)
(0x4813, 0x36e3, 11, 7), # 009: p=0.363535 ( 0, 0)
(0x4813, 0x36e3, 12, 8), # 010: p=0.363535 ( 0, 0)
(0x3fd5, 0x408c, 13, 9), # 011: p=0.331418 ( 0, 0)
(0x3fd5, 0x408c, 14, 10), # 012: p=0.331418 ( 0, 0)
(0x38b1, 0x48fd, 15, 11), # 013: p=0.300585 ( 0, 0)
(0x38b1, 0x48fd, 16, 12), # 014: p=0.300585 ( 0, 0)
(0x3275, 0x505d, 17, 13), # 015: p=0.271213 ( 0, 0)
(0x3275, 0x505d, 18, 14), # 016: p=0.271213 ( 0, 0)
(0x2cfd, 0x56d0, 19, 15), # 017: p=0.243438 ( 0, 0)
(0x2cfd, 0x56d0, 20, 16), # 018: p=0.243438 ( 0, 0)
(0x2825, 0x5c71, 21, 17), # 019: p=0.217391 ( 0, 0)
(0x2825, 0x5c71, 22, 18), # 020: p=0.217391 ( 0, 0)
(0x23ab, 0x615b, 23, 19), # 021: p=0.193150 ( 0, 0)
(0x23ab, 0x615b, 24, 20), # 022: p=0.193150 ( 0, 0)
(0x1f87, 0x65a5, 25, 21), # 023: p=0.170728 ( 0, 0)
(0x1f87, 0x65a5, 26, 22), # 024: p=0.170728 ( 0, 0)
(0x1bbb, 0x6962, 27, 23), # 025: p=0.150158 ( 0, 0)
(0x1bbb, 0x6962, 28, 24), # 026: p=0.150158 ( 0, 0)
(0x1845, 0x6ca2, 29, 25), # 027: p=0.131418 ( 0, 0)
(0x1845, 0x6ca2, 30, 26), # 028: p=0.131418 ( 0, 0)
(0x1523, 0x6f74, 31, 27), # 029: p=0.114460 ( 0, 0)
(0x1523, 0x6f74, 32, 28), # 030: p=0.114460 ( 0, 0)
(0x1253, 0x71e6, 33, 29), # 031: p=0.099230 ( 0, 0)
(0x1253, 0x71e6, 34, 30), # 032: p=0.099230 ( 0, 0)
(0x0fcf, 0x7404, 35, 31), # 033: p=0.085611 ( 0, 0)
(0x0fcf, 0x7404, 36, 32), # 034: p=0.085611 ( 0, 0)
(0x0d95, 0x75d6, 37, 33), # 035: p=0.073550 ( 0, 0)
(0x0d95, 0x75d6, 38, 34), # 036: p=0.073550 ( 0, 0)
(0x0b9d, 0x7768, 39, 35), # 037: p=0.062888 ( 0, 0)
(0x0b9d, 0x7768, 40, 36), # 038: p=0.062888 ( 0, 0)
(0x09e3, 0x78c2, 41, 37), # 039: p=0.053539 ( 0, 0)
(0x09e3, 0x78c2, 42, 38), # 040: p=0.053539 ( 0, 0)
(0x0861, 0x79ea, 43, 39), # 041: p=0.045365 ( 0, 0)
(0x0861, 0x79ea, 44, 40), # 042: p=0.045365 ( 0, 0)
(0x0711, 0x7ae7, 45, 41), # 043: p=0.038272 ( 0, 0)
(0x0711, 0x7ae7, 46, 42), # 044: p=0.038272 ( 0, 0)
(0x05f1, 0x7bbe, 47, 43), # 045: p=0.032174 ( 0, 0)
(0x05f1, 0x7bbe, 48, 44), # 046: p=0.032174 ( 0, 0)
(0x04f9, 0x7c75, 49, 45), # 047: p=0.026928 ( 0, 0)
(0x04f9, 0x7c75, 50, 46), # 048: p=0.026928 ( 0, 0)
(0x0425, 0x7d0f, 51, 47), # 049: p=0.022444 ( 0, 0)
(0x0425, 0x7d0f, 52, 48), # 050: p=0.022444 ( 0, 0)
(0x0371, 0x7d91, 53, 49), # 051: p=0.018636 ( 0, 0)
(0x0371, 0x7d91, 54, 50), # 052: p=0.018636 ( 0, 0)
(0x02d9, 0x7dfe, 55, 51), # 053: p=0.015421 ( 0, 0)
(0x02d9, 0x7dfe, 56, 52), # 054: p=0.015421 ( 0, 0)
(0x0259, 0x7e5a, 57, 53), # 055: p=0.012713 ( 0, 0)
(0x0259, 0x7e5a, 58, 54), # 056: p=0.012713 ( 0, 0)
(0x01ed, 0x7ea6, 59, 55), # 057: p=0.010419 ( 0, 0)
(0x01ed, 0x7ea6, 60, 56), # 058: p=0.010419 ( 0, 0)
(0x0193, 0x7ee6, 61, 57), # 059: p=0.008525 ( 0, 0)
(0x0193, 0x7ee6, 62, 58), # 060: p=0.008525 ( 0, 0)
(0x0149, 0x7f1a, 63, 59), # 061: p=0.006959 ( 0, 0)
(0x0149, 0x7f1a, 64, 60), # 062: p=0.006959 ( 0, 0)
(0x010b, 0x7f45, 65, 61), # 063: p=0.005648 ( 0, 0)
(0x010b, 0x7f45, 66, 62), # 064: p=0.005648 ( 0, 0)
(0x00d5, 0x7f6b, 67, 63), # 065: p=0.004506 ( 0, 0)
(0x00d5, 0x7f6b, 68, 64), # 066: p=0.004506 ( 0, 0)
(0x00a5, 0x7f8d, 69, 65), # 067: p=0.003480 ( 0, 0)
(0x00a5, 0x7f8d, 70, 66), # 068: p=0.003480 ( 0, 0)
(0x007b, 0x7faa, 71, 67), # 069: p=0.002602 ( 0, 0)
(0x007b, 0x7faa, 72, 68), # 070: p=0.002602 ( 0, 0)
(0x0057, 0x7fc3, 73, 69), # 071: p=0.001843 ( 0, 0)
(0x0057, 0x7fc3, 74, 70), # 072: p=0.001843 ( 0, 0)
(0x003b, 0x7fd7, 75, 71), # 073: p=0.001248 ( 0, 0)
(0x003b, 0x7fd7, 76, 72), # 074: p=0.001248 ( 0, 0)
(0x0023, 0x7fe7, 77, 73), # 075: p=0.000749 ( 0, 0)
(0x0023, 0x7fe7, 78, 74), # 076: p=0.000749 ( 0, 0)
(0x0013, 0x7ff2, 79, 75), # 077: p=0.000402 ( 0, 0)
(0x0013, 0x7ff2, 80, 76), # 078: p=0.000402 ( 0, 0)
(0x0007, 0x7ffa, 81, 77), # 079: p=0.000153 ( 0, 0)
(0x0007, 0x7ffa, 82, 78), # 080: p=0.000153 ( 0, 0)
(0x0001, 0x7fff, 81, 79), # 081: p=0.000027 ( 0, 0)
(0x0001, 0x7fff, 82, 80), # 082: p=0.000027 ( 0, 0)
(0x5695, 0x0000, 9, 85), # 083: p=0.411764 ( 2, 3)
(0x24ee, 0x0000, 86, 226), # 084: p=0.199988 ( 1, 0)
(0x8000, 0x0000, 5, 6), # 085: p=0.500000 ( 3, 3)
(0x0d30, 0x0000, 88, 176), # 086: p=0.071422 ( 4, 0)
(0x481a, 0x0000, 89, 143), # 087: p=0.363634 ( 1, 2)
(0x0481, 0x0000, 90, 138), # 088: p=0.024388 ( 13, 0)
(0x3579, 0x0000, 91, 141), # 089: p=0.285711 ( 1, 3)
(0x017a, 0x0000, 92, 112), # 090: p=0.007999 ( 41, 0)
(0x24ef, 0x0000, 93, 135), # 091: p=0.199997 ( 1, 5)
(0x007b, 0x0000, 94, 104), # 092: p=0.002611 ( 127, 0)
(0x1978, 0x0000, 95, 133), # 093: p=0.137929 ( 1, 8)
(0x0028, 0x0000, 96, 100), # 094: p=0.000849 ( 392, 0)
(0x10ca, 0x0000, 97, 129), # 095: p=0.090907 ( 1, 13)
(0x000d, 0x0000, 82, 98), # 096: p=0.000276 ( 1208, 0)
(0x0b5d, 0x0000, 99, 127), # 097: p=0.061537 ( 1, 20)
(0x0034, 0x0000, 76, 72), # 098: p=0.001102 ( 1208, 1)
(0x078a, 0x0000, 101, 125), # 099: p=0.040815 ( 1, 31)
(0x00a0, 0x0000, 70, 102), # 100: p=0.003387 ( 392, 1)
(0x050f, 0x0000, 103, 123), # 101: p=0.027397 ( 1, 47)
(0x0117, 0x0000, 66, 60), # 102: p=0.005912 ( 392, 2)
(0x0358, 0x0000, 105, 121), # 103: p=0.018099 ( 1, 72)
(0x01ea, 0x0000, 106, 110), # 104: p=0.010362 ( 127, 1)
(0x0234, 0x0000, 107, 119), # 105: p=0.011940 ( 1, 110)
(0x0144, 0x0000, 66, 108), # 106: p=0.006849 ( 193, 1)
(0x0173, 0x0000, 109, 117), # 107: p=0.007858 ( 1, 168)
(0x0234, 0x0000, 60, 54), # 108: p=0.011925 ( 193, 2)
(0x00f5, 0x0000, 111, 115), # 109: p=0.005175 ( 1, 256)
(0x0353, 0x0000, 56, 48), # 110: p=0.017995 ( 127, 2)
(0x00a1, 0x0000, 69, 113), # 111: p=0.003413 ( 1, 389)
(0x05c5, 0x0000, 114, 134), # 112: p=0.031249 ( 41, 1)
(0x011a, 0x0000, 65, 59), # 113: p=0.005957 ( 2, 389)
(0x03cf, 0x0000, 116, 132), # 114: p=0.020618 ( 63, 1)
(0x01aa, 0x0000, 61, 55), # 115: p=0.009020 ( 2, 256)
(0x0285, 0x0000, 118, 130), # 116: p=0.013652 ( 96, 1)
(0x0286, 0x0000, 57, 51), # 117: p=0.013672 ( 2, 168)
(0x01ab, 0x0000, 120, 128), # 118: p=0.009029 ( 146, 1)
(0x03d3, 0x0000, 53, 47), # 119: p=0.020710 ( 2, 110)
(0x011a, 0x0000, 122, 126), # 120: p=0.005961 ( 222, 1)
(0x05c5, 0x0000, 49, 41), # 121: p=0.031250 ( 2, 72)
(0x00ba, 0x0000, 124, 62), # 122: p=0.003925 ( 338, 1)
(0x08ad, 0x0000, 43, 37), # 123: p=0.046979 ( 2, 47)
(0x007a, 0x0000, 72, 66), # 124: p=0.002586 ( 514, 1)
(0x0ccc, 0x0000, 39, 31), # 125: p=0.069306 ( 2, 31)
(0x01eb, 0x0000, 60, 54), # 126: p=0.010386 ( 222, 2)
(0x1302, 0x0000, 33, 25), # 127: p=0.102940 ( 2, 20)
(0x02e6, 0x0000, 56, 50), # 128: p=0.015695 ( 146, 2)
(0x1b81, 0x0000, 29, 131), # 129: p=0.148935 ( 2, 13)
(0x045e, 0x0000, 52, 46), # 130: p=0.023648 ( 96, 2)
(0x24ef, 0x0000, 23, 17), # 131: p=0.199999 ( 3, 13)
(0x0690, 0x0000, 48, 40), # 132: p=0.035533 ( 63, 2)
(0x2865, 0x0000, 23, 15), # 133: p=0.218748 ( 2, 8)
(0x09de, 0x0000, 42, 136), # 134: p=0.053434 ( 41, 2)
(0x3987, 0x0000, 137, 7), # 135: p=0.304346 ( 2, 5)
(0x0dc8, 0x0000, 38, 32), # 136: p=0.074626 ( 41, 3)
(0x2c99, 0x0000, 21, 139), # 137: p=0.241378 ( 2, 7)
(0x10ca, 0x0000, 140, 172), # 138: p=0.090907 ( 13, 1)
(0x3b5f, 0x0000, 15, 9), # 139: p=0.312499 ( 3, 7)
(0x0b5d, 0x0000, 142, 170), # 140: p=0.061537 ( 20, 1)
(0x5695, 0x0000, 9, 85), # 141: p=0.411764 ( 2, 3)
(0x078a, 0x0000, 144, 168), # 142: p=0.040815 ( 31, 1)
(0x8000, 0x0000, 141, 248), # 143: p=0.500000 ( 2, 2)
(0x050f, 0x0000, 146, 166), # 144: p=0.027397 ( 47, 1)
(0x24ee, 0x0000, 147, 247), # 145: p=0.199988 ( 0, 1)
(0x0358, 0x0000, 148, 164), # 146: p=0.018099 ( 72, 1)
(0x0d30, 0x0000, 149, 197), # 147: p=0.071422 ( 0, 4)
(0x0234, 0x0000, 150, 162), # 148: p=0.011940 ( 110, 1)
(0x0481, 0x0000, 151, 95), # 149: p=0.024388 ( 0, 13)
(0x0173, 0x0000, 152, 160), # 150: p=0.007858 ( 168, 1)
(0x017a, 0x0000, 153, 173), # 151: p=0.007999 ( 0, 41)
(0x00f5, 0x0000, 154, 158), # 152: p=0.005175 ( 256, 1)
(0x007b, 0x0000, 155, 165), # 153: p=0.002611 ( 0, 127)
(0x00a1, 0x0000, 70, 156), # 154: p=0.003413 ( 389, 1)
(0x0028, 0x0000, 157, 161), # 155: p=0.000849 ( 0, 392)
(0x011a, 0x0000, 66, 60), # 156: p=0.005957 ( 389, 2)
(0x000d, 0x0000, 81, 159), # 157: p=0.000276 ( 0, 1208)
(0x01aa, 0x0000, 62, 56), # 158: p=0.009020 ( 256, 2)
(0x0034, 0x0000, 75, 71), # 159: p=0.001102 ( 1, 1208)
(0x0286, 0x0000, 58, 52), # 160: p=0.013672 ( 168, 2)
(0x00a0, 0x0000, 69, 163), # 161: p=0.003387 ( 1, 392)
(0x03d3, 0x0000, 54, 48), # 162: p=0.020710 ( 110, 2)
(0x0117, 0x0000, 65, 59), # 163: p=0.005912 ( 2, 392)
(0x05c5, 0x0000, 50, 42), # 164: p=0.031250 ( 72, 2)
(0x01ea, 0x0000, 167, 171), # 165: p=0.010362 ( 1, 127)
(0x08ad, 0x0000, 44, 38), # 166: p=0.046979 ( 47, 2)
(0x0144, 0x0000, 65, 169), # 167: p=0.006849 ( 1, 193)
(0x0ccc, 0x0000, 40, 32), # 168: p=0.069306 ( 31, 2)
(0x0234, 0x0000, 59, 53), # 169: p=0.011925 ( 2, 193)
(0x1302, 0x0000, 34, 26), # 170: p=0.102940 ( 20, 2)
(0x0353, 0x0000, 55, 47), # 171: p=0.017995 ( 2, 127)
(0x1b81, 0x0000, 30, 174), # 172: p=0.148935 ( 13, 2)
(0x05c5, 0x0000, 175, 193), # 173: p=0.031249 ( 1, 41)
(0x24ef, 0x0000, 24, 18), # 174: p=0.199999 ( 13, 3)
(0x03cf, 0x0000, 177, 191), # 175: p=0.020618 ( 1, 63)
(0x2b74, 0x0000, 178, 222), # 176: p=0.235291 ( 4, 1)
(0x0285, 0x0000, 179, 189), # 177: p=0.013652 ( 1, 96)
(0x201d, 0x0000, 180, 218), # 178: p=0.173910 ( 6, 1)
(0x01ab, 0x0000, 181, 187), # 179: p=0.009029 ( 1, 146)
(0x1715, 0x0000, 182, 216), # 180: p=0.124998 ( 9, 1)
(0x011a, 0x0000, 183, 185), # 181: p=0.005961 ( 1, 222)
(0x0fb7, 0x0000, 184, 214), # 182: p=0.085105 ( 14, 1)
(0x00ba, 0x0000, 69, 61), # 183: p=0.003925 ( 1, 338)
(0x0a67, 0x0000, 186, 212), # 184: p=0.056337 ( 22, 1)
(0x01eb, 0x0000, 59, 53), # 185: p=0.010386 ( 2, 222)
(0x06e7, 0x0000, 188, 210), # 186: p=0.037382 ( 34, 1)
(0x02e6, 0x0000, 55, 49), # 187: p=0.015695 ( 2, 146)
(0x0496, 0x0000, 190, 208), # 188: p=0.024844 ( 52, 1)
(0x045e, 0x0000, 51, 45), # 189: p=0.023648 ( 2, 96)
(0x030d, 0x0000, 192, 206), # 190: p=0.016529 ( 79, 1)
(0x0690, 0x0000, 47, 39), # 191: p=0.035533 ( 2, 63)
(0x0206, 0x0000, 194, 204), # 192: p=0.010959 ( 120, 1)
(0x09de, 0x0000, 41, 195), # 193: p=0.053434 ( 2, 41)
(0x0155, 0x0000, 196, 202), # 194: p=0.007220 ( 183, 1)
(0x0dc8, 0x0000, 37, 31), # 195: p=0.074626 ( 3, 41)
(0x00e1, 0x0000, 198, 200), # 196: p=0.004750 ( 279, 1)
(0x2b74, 0x0000, 199, 243), # 197: p=0.235291 ( 1, 4)
(0x0094, 0x0000, 72, 64), # 198: p=0.003132 ( 424, 1)
(0x201d, 0x0000, 201, 239), # 199: p=0.173910 ( 1, 6)
(0x0188, 0x0000, 62, 56), # 200: p=0.008284 ( 279, 2)
(0x1715, 0x0000, 203, 237), # 201: p=0.124998 ( 1, 9)
(0x0252, 0x0000, 58, 52), # 202: p=0.012567 ( 183, 2)
(0x0fb7, 0x0000, 205, 235), # 203: p=0.085105 ( 1, 14)
(0x0383, 0x0000, 54, 48), # 204: p=0.019021 ( 120, 2)
(0x0a67, 0x0000, 207, 233), # 205: p=0.056337 ( 1, 22)
(0x0547, 0x0000, 50, 44), # 206: p=0.028571 ( 79, 2)
(0x06e7, 0x0000, 209, 231), # 207: p=0.037382 ( 1, 34)
(0x07e2, 0x0000, 46, 38), # 208: p=0.042682 ( 52, 2)
(0x0496, 0x0000, 211, 229), # 209: p=0.024844 ( 1, 52)
(0x0bc0, 0x0000, 40, 34), # 210: p=0.063636 ( 34, 2)
(0x030d, 0x0000, 213, 227), # 211: p=0.016529 ( 1, 79)
(0x1178, 0x0000, 36, 28), # 212: p=0.094593 ( 22, 2)
(0x0206, 0x0000, 215, 225), # 213: p=0.010959 ( 1, 120)
(0x19da, 0x0000, 30, 22), # 214: p=0.139999 ( 14, 2)
(0x0155, 0x0000, 217, 223), # 215: p=0.007220 ( 1, 183)
(0x24ef, 0x0000, 26, 16), # 216: p=0.199998 ( 9, 2)
(0x00e1, 0x0000, 219, 221), # 217: p=0.004750 ( 1, 279)
(0x320e, 0x0000, 20, 220), # 218: p=0.269229 ( 6, 2)
(0x0094, 0x0000, 71, 63), # 219: p=0.003132 ( 1, 424)
(0x432a, 0x0000, 14, 8), # 220: p=0.344827 ( 6, 3)
(0x0188, 0x0000, 61, 55), # 221: p=0.008284 ( 2, 279)
(0x447d, 0x0000, 14, 224), # 222: p=0.349998 ( 4, 2)
(0x0252, 0x0000, 57, 51), # 223: p=0.012567 ( 2, 183)
(0x5ece, 0x0000, 8, 2), # 224: p=0.434782 ( 4, 3)
(0x0383, 0x0000, 53, 47), # 225: p=0.019021 ( 2, 120)
(0x8000, 0x0000, 228, 87), # 226: p=0.500000 ( 1, 1)
(0x0547, 0x0000, 49, 43), # 227: p=0.028571 ( 2, 79)
(0x481a, 0x0000, 230, 246), # 228: p=0.363634 ( 2, 1)
(0x07e2, 0x0000, 45, 37), # 229: p=0.042682 ( 2, 52)
(0x3579, 0x0000, 232, 244), # 230: p=0.285711 ( 3, 1)
(0x0bc0, 0x0000, 39, 33), # 231: p=0.063636 ( 2, 34)
(0x24ef, 0x0000, 234, 238), # 232: p=0.199997 ( 5, 1)
(0x1178, 0x0000, 35, 27), # 233: p=0.094593 ( 2, 22)
(0x1978, 0x0000, 138, 236), # 234: p=0.137929 ( 8, 1)
(0x19da, 0x0000, 29, 21), # 235: p=0.139999 ( 2, 14)
(0x2865, 0x0000, 24, 16), # 236: p=0.218748 ( 8, 2)
(0x24ef, 0x0000, 25, 15), # 237: p=0.199998 ( 2, 9)
(0x3987, 0x0000, 240, 8), # 238: p=0.304346 ( 5, 2)
(0x320e, 0x0000, 19, 241), # 239: p=0.269229 ( 2, 6)
(0x2c99, 0x0000, 22, 242), # 240: p=0.241378 ( 7, 2)
(0x432a, 0x0000, 13, 7), # 241: p=0.344827 ( 3, 6)
(0x3b5f, 0x0000, 16, 10), # 242: p=0.312499 ( 7, 3)
(0x447d, 0x0000, 13, 245), # 243: p=0.349998 ( 2, 4)
(0x5695, 0x0000, 10, 2), # 244: p=0.411764 ( 3, 2)
(0x5ece, 0x0000, 7, 1), # 245: p=0.434782 ( 3, 4)
(0x8000, 0x0000, 244, 83), # 246: p=0.500000 ( 2, 2)
(0x8000, 0x0000, 249, 250), # 247: p=0.500000 ( 1, 1)
(0x5695, 0x0000, 10, 2), # 248: p=0.411764 ( 3, 2)
(0x481a, 0x0000, 89, 143), # 249: p=0.363634 ( 1, 2)
(0x481a, 0x0000, 230, 246), # 250: p=0.363634 ( 2, 1)
(0, 0, 0, 0),
(0, 0, 0, 0),
(0, 0, 0, 0),
(0, 0, 0, 0),
(0, 0, 0, 0),
]
xmtf = (
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
)
# }}}
def chr3(l):
return bytes(bytearray(l))
class BZZDecoder():
def __init__(self, infile, outfile):
self.instream = infile
self.outf = outfile
self.ieof = False
self.bptr = None
self.xsize = None
self.outbuf = [0] * (MAXBLOCK * 1024)
self.byte = None
self.scount = 0
self.delay = 25
self.a = 0
self.code = 0
self.bufint = 0
self.ctx = [0] * 300
# table
self.p = [0] * 256
self.m = [0] * 256
self.up = [0] * 256
self.dn = [0] * 256
# machine independent ffz
self.ffzt = [0] * 256
# Create machine independent ffz table
for i in range(256):
j = i
while(j & 0x80):
self.ffzt[i] += 1
j <<= 1
# Initialize table
self.newtable(default_ztable)
# Codebit counter
# Read first 16 bits of code
if not self.read_byte():
self.byte = 0xff
self.code = (self.byte << 8)
if not self.read_byte():
self.byte = 0xff
self.code = self.code | self.byte
# Preload buffer
self.preload()
# Compute initial fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
def convert(self, sz):
if self.ieof:
return 0
copied = 0
while sz > 0 and not (self.ieof):
# Decode if needed
if not self.xsize:
self.bptr = 0
if not self.decode(): # input block size set in decode
self.xsize = 1
self.ieof = True
self.xsize -= 1
# Compute remaining
bytes = self.xsize
if bytes > sz:
bytes = sz
# Transfer
if bytes:
for i in range(bytes):
self.outf.write(chr3(self.outbuf[self.bptr + i]))
self.xsize -= bytes
self.bptr += bytes
sz -= bytes
copied += bytes
# offset += bytes; // for tell()
return copied
def preload(self):
while self.scount <= 24:
if self.read_byte() < 1:
self.byte = 0xff
if --self.delay < 1:
raise BZZDecoderError("BiteStream EOF")
self.bufint = (self.bufint << 8) | self.byte
self.scount += 8
def newtable(self, table):
for i in range(256):
self.p[i] = table[i][0]
self.m[i] = table[i][1]
self.up[i] = table[i][2]
self.dn[i] = table[i][3]
def decode(self):
outbuf = self.outbuf
# Decode block size
self.xsize = self.decode_raw(24)
if not self.xsize:
return 0
if self.xsize > MAXBLOCK * 1024: # 4MB (4096 * 1024) is max block
raise BZZDecoderError("BiteStream.corrupt")
# Dec11ode Estimation Speed
fshift = 0
if self.zpcodec_decoder():
fshift += 1
if self.zpcodec_decoder():
fshift += 1
# Prepare Quasi MTF
mtf = list(xmtf) # unsigned chars
freq = [0] * FREQMAX
fadd = 4
# Decode
mtfno = 3
markerpos = -1
for i in range(self.xsize):
ctxid = CTXIDS - 1
if ctxid > mtfno:
ctxid = mtfno
cx = self.ctx
if self.zpcodec_decode(cx, ctxid):
mtfno = 0
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, ctxid + CTXIDS):
mtfno = 1
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS):
mtfno = 2 + self.decode_binary(cx, 2*CTXIDS + 1, 1)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS+2):
mtfno = 4 + self.decode_binary(cx, 2*CTXIDS+2 + 1, 2)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 6):
mtfno = 8 + self.decode_binary(cx, 2*CTXIDS + 6 + 1, 3)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 14):
mtfno = 16 + self.decode_binary(cx, 2*CTXIDS + 14 + 1, 4)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 30 ):
mtfno = 32 + self.decode_binary(cx, 2*CTXIDS + 30 + 1, 5)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 62 ):
mtfno = 64 + self.decode_binary(cx, 2*CTXIDS + 62 + 1, 6)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 126):
mtfno = 128 + self.decode_binary(cx, 2*CTXIDS + 126 + 1, 7)
outbuf[i] = mtf[mtfno]
else:
mtfno = 256 # EOB
outbuf[i] = 0
markerpos = i
continue
# Rotate mtf according to empirical frequencies (new!)
# :rotate label
# Adjust frequencies for overflow
fadd = fadd + (fadd >> fshift)
if fadd > 0x10000000:
fadd >>= 24
freq[0] >>= 24
freq[1] >>= 24
freq[2] >>= 24
freq[3] >>= 24
for k in range(4, FREQMAX):
freq[k] = freq[k] >> 24
# Relocate new char according to new freq
fc = fadd
if mtfno < FREQMAX:
fc += freq[mtfno]
k = mtfno
while (k >= FREQMAX):
mtf[k] = mtf[k - 1]
k -= 1
while (k > 0 and fc >= freq[k - 1]):
mtf[k] = mtf[k - 1]
freq[k] = freq[k - 1]
k -= 1
mtf[k] = outbuf[i]
freq[k] = fc
#///////////////////////////////
#//////// Reconstruct the string
if markerpos < 1 or markerpos >= self.xsize:
raise BZZDecoderError("BiteStream.corrupt")
# Allocate pointers
posn = [0] * self.xsize
# Prepare count buffer
count = [0] * 256
# Fill count buffer
for i in range(markerpos):
c = outbuf[i]
posn[i] = (c << 24) | (count[c] & 0xffffff)
count[c] += 1
for i in range(markerpos + 1, self.xsize):
c = outbuf[i]
posn[i] = (c << 24) | (count[c] & 0xffffff)
count[c] += 1
# Compute sorted char positions
last = 1
for i in range(256):
tmp = count[i]
count[i] = last
last += tmp
# Undo the sort transform
i = 0
last = self.xsize - 1
while last > 0:
n = posn[i]
c = (posn[i] >> 24)
last -= 1
outbuf[last] = c
i = count[c] + (n & 0xffffff)
# Free and check
if i != markerpos:
raise BZZDecoderError("BiteStream.corrupt")
return self.xsize
def decode_raw(self, bits):
n = 1
m = (1 << bits)
while n < m:
b = self.zpcodec_decoder()
n = (n << 1) | b
return n - m
def decode_binary(self, ctx, index, bits):
n = 1
m = (1 << bits)
while n < m:
b = self.zpcodec_decode(ctx, index + n - 1)
n = (n << 1) | b
return n - m
def zpcodec_decoder(self):
return self.decode_sub_simple(0, 0x8000 + (self.a >> 1))
def decode_sub_simple(self, mps, z):
# Test MPS/LPS
if z > self.code:
# LPS branch
z = 0x10000 - z
self.a += +z
self.code = self.code + z
# LPS renormalization
shift = self.ffz()
self.scount -= shift
self.a = self.a << shift
self.a &= 0xffff
self.code = (self.code << shift) | ((self.bufint >> self.scount) & ((1 << shift) - 1))
self.code &= 0xffff
if self.scount < 16:
self.preload()
# Adjust fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
result = mps ^ 1
else:
# MPS renormalization
self.scount -= 1
self.a = (z << 1) & 0xffff
self.code = ((self.code << 1) | ((self.bufint >> self.scount) & 1))
self.code &= 0xffff
if self.scount < 16:
self.preload()
# Adjust fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
result = mps
return result
def decode_sub(self, ctx, index, z):
# Save bit
bit = (ctx[index] & 1)
# Avoid interval reversion
d = 0x6000 + ((z + self.a) >> 2)
if z > d:
z = d
# Test MPS/LPS
if z > self.code:
# LPS branch
z = 0x10000 - z
self.a += +z
self.code = self.code + z
# LPS adaptation
ctx[index] = self.dn[ctx[index]]
# LPS renormalization
shift = self.ffz()
self.scount -= shift
self.a = (self.a << shift) & 0xffff
self.code = ((self.code << shift) | ((self.bufint >> self.scount) & ((1 << shift) - 1))) & 0xffff
if self.scount < 16:
self.preload()
# Adjust fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
return bit ^ 1
else:
# MPS adaptation
if self.a >= self.m[ctx[index]]:
ctx[index] = self.up[ctx[index]]
# MPS renormalization
self.scount -= 1
self.a = z << 1 & 0xffff
self.code = ((self.code << 1) | ((self.bufint >> self.scount) & 1)) & 0xffff
if self.scount < 16:
self.preload()
# Adjust fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
return bit
def zpcodec_decode(self, ctx, index):
z = self.a + self.p[ctx[index]]
if z <= self.fence:
self.a = z
res = (ctx[index] & 1)
else:
res = self.decode_sub(ctx, index, z)
return res
def read_byte(self):
res = 0
if self.instream:
ires = self.instream.read(1)
res = len(ires)
if res:
self.byte = ord(ires[0])
else:
raise NotImplementedError
return res
def ffz(self):
x = self.a
if (x >= 0xff00):
return (self.ffzt[x & 0xff] + 8)
else:
return (self.ffzt[(x >> 8) & 0xff])
### for testing
def main():
import sys
infile = file(sys.argv[1], "rb")
outfile = file(sys.argv[2], "wb")
dec = BZZDecoder(infile, outfile)
while True:
res = dec.convert(1024 * 1024)
if not res:
break
if __name__ == "__main__":
main()

View File

@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
__docformat__ = 'restructuredtext en'
import os
from subprocess import Popen, PIPE
from cStringIO import StringIO
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import convert_basic
class DJVUInput(InputFormatPlugin):
name = 'DJVU Input'
author = 'Anthon van der Neut'
description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
file_types = set(['djvu', 'djv'])
options = set([
OptionRecommendation(name='use_djvutxt', recommended_value=True,
help=_('Try to use the djvutxt program and fall back to pure '
'python implementation if it fails or is not available')),
])
def convert(self, stream, options, file_ext, log, accelerators):
stdout = StringIO()
ppdjvu = True
# using djvutxt is MUCH faster, should make it an option
if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
from calibre.ptempfile import PersistentTemporaryFile
try:
fp = PersistentTemporaryFile(suffix='.djvu', prefix='djv_input')
filename = fp._name
fp.write(stream.read())
fp.close()
cmd = ['djvutxt', filename]
stdout.write(Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
os.remove(filename)
ppdjvu = False
except:
stream.seek(0) # retry with the pure python converter
if ppdjvu:
from .djvu import DJVUFile
x = DJVUFile(stream)
x.get_text(stdout)
html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
b'\037', b'\n\n'))
# Run the HTMLized text through the html processing plugin.
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = 'utf-8'
base = os.getcwdu()
if file_ext != 'txtz' and hasattr(stream, 'name'):
base = os.path.dirname(stream.name)
fname = os.path.join(base, 'index.html')
c = 0
while os.path.exists(fname):
c += 1
fname = 'index%d.html'%c
htmlfile = open(fname, 'wb')
with htmlfile:
htmlfile.write(html.encode('utf-8'))
odi = options.debug_pipeline
options.debug_pipeline = None
# Generate oeb from html conversion.
with open(htmlfile.name, 'rb') as f:
oeb = html_input.convert(f, options, 'html', log,
{})
options.debug_pipeline = odi
os.remove(htmlfile.name)
# Set metadata from file.
from calibre.customize.ui import get_file_type_metadata
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
mi = get_file_type_metadata(stream, file_ext)
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
return oeb

View File

@ -0,0 +1,24 @@
# coding: utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Anthon van der Neut <A.van.der.Neut@ruamel.eu>'
from calibre.gui2.convert.djvu_input_ui import Ui_Form
from calibre.gui2.convert import Widget
class PluginWidget(Widget, Ui_Form):
TITLE = _('DJVU Input')
HELP = _('Options specific to')+' DJVU '+_('input')
COMMIT_NAME = 'djvu_input'
ICON = I('mimetypes/djvu.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['use_djvutxt', ])
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>Form</class>
<widget class="QWidget" name="Form">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>400</width>
<height>300</height>
</rect>
</property>
<property name="windowTitle">
<string>Form</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QCheckBox" name="opt_use_djvutxt">
<property name="text">
<string>Use &amp;djvutxt, if available, for faster processing</string>
</property>
</widget>
</item>
</layout>
</widget>
<resources/>
<connections/>
</ui>