From da29a58363f770f38f01e02e3cb4221331666c0a Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Fri, 11 Jul 2008 14:37:27 -0400 Subject: [PATCH 01/19] Integrated own cleanup patch --- src/calibre/ebooks/mobi/reader.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index dea87dbd8c..05093f3c1a 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -13,7 +13,7 @@ except ImportError: import Image as PILImage from calibre import __appname__ -from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.huffcdic import HuffReader from calibre.ebooks.mobi.palmdoc import decompress_doc @@ -165,13 +165,14 @@ class MobiReader(object): self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore') self.extract_images(processed_records, output_dir) self.replace_page_breaks() - self.cleanup() + self.cleanup_html() self.processed_html = re.compile('', re.IGNORECASE).sub( '\n\n', self.processed_html) soup = BeautifulSoup(self.processed_html.replace('> <', '>\n<')) + self.cleanup_soup(soup) guide = soup.find('guide') for elem in soup.findAll(['metadata', 'guide']): elem.extract() @@ -192,10 +193,29 @@ class MobiReader(object): if ncx: open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) - def cleanup(self): + def cleanup_html(self): self.processed_html = re.sub(r'
', '', self.processed_html) - self.processed_html = re.sub(r'<([^>]*) height="([^"]*)"', r'<\1 style="margin-top: \2"', self.processed_html) - self.processed_html = re.sub(r'<([^>]*) width="([^"]*)"', r'<\1 style="text-indent: \2"', self.processed_html) + + def cleanup_soup(self, soup): + for tag in soup.recursiveChildGenerator(): + if not isinstance(tag, Tag): continue + styles = [] + try: + styles.append(tag['style']) + except KeyError: + pass + try: + styles.append('margin-top: %s' % tag['height']) + del tag['height'] + except KeyError: + pass + try: + styles.append('text-indent: %s' % tag['width']) + del tag['width'] + except KeyError: + pass + if styles: + tag['style'] = '; '.join(styles) def create_opf(self, htmlfile, guide=None): mi = self.book_header.exth.mi From 615d5ea2795563f8af9dc34c2c2c03c84c9c9714 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 16 Jul 2008 10:00:49 -0400 Subject: [PATCH 02/19] Checkpoint state to move to office --- src/calibre/ebooks/lit/maps/__init__.py | 7 +- src/calibre/ebooks/lit/maps/html.py | 1568 +++++++++++------------ src/calibre/ebooks/lit/maps/opf.py | 54 +- src/calibre/ebooks/lit/mssha1.py | 343 +++++ src/calibre/ebooks/lit/reader.py | 418 +++--- 5 files changed, 1352 insertions(+), 1038 deletions(-) create mode 100644 src/calibre/ebooks/lit/mssha1.py diff --git a/src/calibre/ebooks/lit/maps/__init__.py b/src/calibre/ebooks/lit/maps/__init__.py index eb99464d9b..2abab3efe9 100644 --- a/src/calibre/ebooks/lit/maps/__init__.py +++ b/src/calibre/ebooks/lit/maps/__init__.py @@ -1,5 +1,2 @@ -import calibre.ebooks.maps.opf as opf -import calibre.ebooks.maps.html as html - -OPF_MAP = opf.MAP -HTML_MAP = html.MAP +from calibre.ebooks.lit.maps.opf import MAP as OPF_MAP +from calibre.ebooks.lit.maps.html import MAP as HTML_MAP diff --git a/src/calibre/ebooks/lit/maps/html.py b/src/calibre/ebooks/lit/maps/html.py index 095b0bcc3e..de0286c764 100644 --- a/src/calibre/ebooks/lit/maps/html.py +++ b/src/calibre/ebooks/lit/maps/html.py @@ -1,786 +1,3 @@ -ATTRS0 = { - 0x8010 => "tabindex", - 0x8046 => "title", - 0x804b => "style", - 0x804d => "disabled", - 0x83ea => "class", - 0x83eb => "id", - 0x83fe => "datafld", - 0x83ff => "datasrc", - 0x8400 => "dataformatas", - 0x87d6 => "accesskey", - 0x9392 => "lang", - 0x93ed => "language", - 0x93fe => "dir", - 0x9771 => "onmouseover", - 0x9772 => "onmouseout", - 0x9773 => "onmousedown", - 0x9774 => "onmouseup", - 0x9775 => "onmousemove", - 0x9776 => "onkeydown", - 0x9777 => "onkeyup", - 0x9778 => "onkeypress", - 0x9779 => "onclick", - 0x977a => "ondblclick", - 0x977e => "onhelp", - 0x977f => "onfocus", - 0x9780 => "onblur", - 0x9783 => "onrowexit", - 0x9784 => "onrowenter", - 0x9786 => "onbeforeupdate", - 0x9787 => "onafterupdate", - 0x978a => "onreadystatechange", - 0x9790 => "onscroll", - 0x9794 => "ondragstart", - 0x9795 => "onresize", - 0x9796 => "onselectstart", - 0x9797 => "onerrorupdate", - 0x9799 => "ondatasetchanged", - 0x979a => "ondataavailable", - 0x979b => "ondatasetcomplete", - 0x979c => "onfilterchange", - 0x979f => "onlosecapture", - 0x97a0 => "onpropertychange", - 0x97a2 => "ondrag", - 0x97a3 => "ondragend", - 0x97a4 => "ondragenter", - 0x97a5 => "ondragover", - 0x97a6 => "ondragleave", - 0x97a7 => "ondrop", - 0x97a8 => "oncut", - 0x97a9 => "oncopy", - 0x97aa => "onpaste", - 0x97ab => "onbeforecut", - 0x97ac => "onbeforecopy", - 0x97ad => "onbeforepaste", - 0x97af => "onrowsdelete", - 0x97b0 => "onrowsinserted", - 0x97b1 => "oncellchange", - 0x97b2 => "oncontextmenu", - 0x97b6 => 
"onbeforeeditfocus", - } -ATTRS3 = { - 0x0001 => "href", - 0x03ec => "target", - 0x03ee => "rel", - 0x03ef => "rev", - 0x03f0 => "urn", - 0x03f1 => "methods", - 0x8001 => "name", - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS5 = { - 0x9399 => "clear", - } -ATTRS6 = { - 0x8001 => "name", - 0x8006 => "width", - 0x8007 => "height", - 0x804a => "align", - 0x8bbb => "classid", - 0x8bbc => "data", - 0x8bbf => "codebase", - 0x8bc0 => "codetype", - 0x8bc1 => "code", - 0x8bc2 => "type", - 0x8bc5 => "vspace", - 0x8bc6 => "hspace", - 0x978e => "onerror", - } -ATTRS7 = { - 0x0001 => "href", - 0x03ea => "shape", - 0x03eb => "coords", - 0x03ed => "target", - 0x03ee => "alt", - 0x03ef => "nohref", - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS8 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS9 = { - 0x03ec => "href", - 0x03ed => "target", - } -ATTRS10 = { - 0x938b => "color", - 0x939b => "face", - 0x93a3 => "size", - } -ATTRS12 = { - 0x03ea => "src", - 0x03eb => "loop", - 0x03ec => "volume", - 0x03ed => "balance", - } -ATTRS13 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS15 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS16 = { - 0x07db => "link", - 0x07dc => "alink", - 0x07dd => "vlink", - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x938a => "background", - 0x938b => "text", - 0x938e => "nowrap", - 0x93ae => "topmargin", - 0x93af => "rightmargin", - 0x93b0 => "bottommargin", - 0x93b1 => "leftmargin", - 0x93b6 => "bgproperties", - 0x93d8 => "scroll", - 0x977b => "onselect", - 0x9791 => "onload", - 0x9792 => "onunload", - 0x9798 => "onbeforeunload", - 0x97b3 => "onbeforeprint", - 0x97b4 => "onafterprint", - 0xfe0c => "bgcolor", - } -ATTRS17 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS18 = { - 0x07d1 => "type", - 0x8001 => "name", - } -ATTRS19 = { - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x93a8 => "valign", - } -ATTRS20 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS21 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS22 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS23 = { - 0x03ea => "span", - 0x8006 => "width", - 0x8049 => "align", - 0x93a8 => "valign", - 0xfe0c => "bgcolor", - } -ATTRS24 = { - 0x03ea => "span", - 0x8006 => "width", - 0x8049 => "align", - 0x93a8 => "valign", - 0xfe0c => "bgcolor", - } -ATTRS27 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x938e => "nowrap", - } -ATTRS29 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS31 = { - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x938e => "nowrap", - } -ATTRS32 = { - 0x03ea => "compact", - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS33 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x938e => "nowrap", - } -ATTRS34 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS35 = { - 0x8001 => "name", - 
0x8006 => "width", - 0x8007 => "height", - 0x804a => "align", - 0x8bbd => "palette", - 0x8bbe => "pluginspage", - 0x8bbf => "codebase", - 0x8bbf => "src", - 0x8bc1 => "units", - 0x8bc2 => "type", - 0x8bc3 => "hidden", - } -ATTRS36 = { - 0x804a => "align", - } -ATTRS37 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x938b => "color", - 0x939b => "face", - 0x939c => "size", - } -ATTRS38 = { - 0x03ea => "action", - 0x03ec => "enctype", - 0x03ed => "method", - 0x03ef => "target", - 0x03f4 => "accept-charset", - 0x8001 => "name", - 0x977c => "onsubmit", - 0x977d => "onreset", - } -ATTRS39 = { - 0x8000 => "align", - 0x8001 => "name", - 0x8bb9 => "src", - 0x8bbb => "border", - 0x8bbc => "frameborder", - 0x8bbd => "framespacing", - 0x8bbe => "marginwidth", - 0x8bbf => "marginheight", - 0x8bc0 => "noresize", - 0x8bc1 => "scrolling", - 0x8fa2 => "bordercolor", - } -ATTRS40 = { - 0x03e9 => "rows", - 0x03ea => "cols", - 0x03eb => "border", - 0x03ec => "bordercolor", - 0x03ed => "frameborder", - 0x03ee => "framespacing", - 0x8001 => "name", - 0x9791 => "onload", - 0x9792 => "onunload", - 0x9798 => "onbeforeunload", - 0x97b3 => "onbeforeprint", - 0x97b4 => "onafterprint", - } -ATTRS42 = { - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS43 = { - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS44 = { - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS45 = { - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS46 = { - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS47 = { - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS49 = { - 0x03ea => "noshade", - 0x8006 => "width", - 0x8007 => "size", - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x938b => "color", - } -ATTRS51 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS52 = { - 0x8001 => "name", - 0x8006 => "width", - 0x8007 => "height", - 0x804a => "align", - 0x8bb9 => "src", - 0x8bbb => "border", - 0x8bbc => "frameborder", - 0x8bbd => "framespacing", - 0x8bbe => "marginwidth", - 0x8bbf => "marginheight", - 0x8bc0 => "noresize", - 0x8bc1 => "scrolling", - 0x8fa2 => "vspace", - 0x8fa3 => "hspace", - } -ATTRS53 = { - 0x03eb => "alt", - 0x03ec => "src", - 0x03ed => "border", - 0x03ee => "vspace", - 0x03ef => "hspace", - 0x03f0 => "lowsrc", - 0x03f1 => "vrml", - 0x03f2 => "dynsrc", - 0x03f4 => "loop", - 0x03f6 => "start", - 0x07d3 => "ismap", - 0x07d9 => "usemap", - 0x8001 => "name", - 0x8006 => "width", - 0x8007 => "height", - 0x8046 => "title", - 0x804a => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x978d => "onabort", - 0x978e => "onerror", - 0x9791 => "onload", - } -ATTRS54 = { - 0x07d1 => "type", - 0x07d3 => "size", - 0x07d4 => "maxlength", - 0x07d6 => "readonly", - 0x07d8 => "indeterminate", - 0x07da => "checked", - 0x07db => "alt", - 0x07dc => "src", - 0x07dd => "border", - 0x07de => "vspace", - 0x07df => "hspace", - 0x07e0 => "lowsrc", - 0x07e1 => "vrml", - 0x07e2 => "dynsrc", - 0x07e4 => "loop", - 
0x07e5 => "start", - 0x8001 => "name", - 0x8006 => "width", - 0x8007 => "height", - 0x804a => "align", - 0x93ee => "value", - 0x977b => "onselect", - 0x978d => "onabort", - 0x978e => "onerror", - 0x978f => "onchange", - 0x9791 => "onload", - } -ATTRS56 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS57 = { - 0x03e9 => "for", - } -ATTRS58 = { - 0x804a => "align", - } -ATTRS59 = { - 0x03ea => "value", - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x939a => "type", - } -ATTRS60 = { - 0x03ee => "href", - 0x03ef => "rel", - 0x03f0 => "rev", - 0x03f1 => "type", - 0x03f9 => "media", - 0x03fa => "target", - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x978e => "onerror", - 0x9791 => "onload", - } -ATTRS61 = { - 0x9399 => "clear", - } -ATTRS62 = { - 0x8001 => "name", - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS63 = { - 0x1771 => "scrolldelay", - 0x1772 => "direction", - 0x1773 => "behavior", - 0x1774 => "scrollamount", - 0x1775 => "loop", - 0x1776 => "vspace", - 0x1777 => "hspace", - 0x1778 => "truespeed", - 0x8006 => "width", - 0x8007 => "height", - 0x9785 => "onbounce", - 0x978b => "onfinish", - 0x978c => "onstart", - 0xfe0c => "bgcolor", - } -ATTRS65 = { - 0x03ea => "http-equiv", - 0x03eb => "content", - 0x03ec => "url", - 0x03f6 => "charset", - 0x8001 => "name", - } -ATTRS66 = { - 0x03f5 => "n", - } -ATTRS71 = { - 0x8000 => "border", - 0x8000 => "usemap", - 0x8001 => "name", - 0x8006 => "width", - 0x8007 => "height", - 0x8046 => "title", - 0x804a => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x8bbb => "classid", - 0x8bbc => "data", - 0x8bbf => "codebase", - 0x8bc0 => "codetype", - 0x8bc1 => "code", - 0x8bc2 => "type", - 0x8bc5 => "vspace", - 0x8bc6 => "hspace", - 0x978e => "onerror", - } -ATTRS72 = { - 0x03eb => "compact", - 0x03ec => "start", - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x939a => "type", - } -ATTRS73 = { - 0x03ea => "selected", - 0x03eb => "value", - } -ATTRS74 = { - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS75 = { - 0x8000 => "name", - 0x8000 => "value", - 0x8000 => "type", - } -ATTRS76 = { - 0x9399 => "clear", - } -ATTRS77 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x9399 => "clear", - } -ATTRS78 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS82 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS83 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS84 = { - 0x03ea => "src", - 0x03ed => "for", - 0x03ee => "event", - 0x03f0 => "defer", - 0x03f2 => "type", - 0x978e => "onerror", - } -ATTRS85 = { - 0x03eb => "size", - 0x03ec => "multiple", - 0x8000 => "align", - 0x8001 => "name", - 0x978f => "onchange", - } -ATTRS86 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS87 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS88 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS89 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS90 = { - 0x03eb => "type", - 0x03ef => "media", - 0x8046 => "title", - 0x978e => 
"onerror", - 0x9791 => "onload", - } -ATTRS91 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS92 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS93 = { - 0x03ea => "cols", - 0x03eb => "border", - 0x03ec => "rules", - 0x03ed => "frame", - 0x03ee => "cellspacing", - 0x03ef => "cellpadding", - 0x03fa => "datapagesize", - 0x8006 => "width", - 0x8007 => "height", - 0x8046 => "title", - 0x804a => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x938a => "background", - 0x93a5 => "bordercolor", - 0x93a6 => "bordercolorlight", - 0x93a7 => "bordercolordark", - 0xfe0c => "bgcolor", - } -ATTRS94 = { - 0x8049 => "align", - 0x93a8 => "valign", - 0xfe0c => "bgcolor", - } -ATTRS95 = { - 0x8049 => "align", - 0x93a8 => "valign", - } -ATTRS96 = { - 0x07d2 => "rowspan", - 0x07d3 => "colspan", - 0x8006 => "width", - 0x8007 => "height", - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x938a => "background", - 0x938e => "nowrap", - 0x93a5 => "bordercolor", - 0x93a6 => "bordercolorlight", - 0x93a7 => "bordercolordark", - 0x93a8 => "valign", - 0xfe0c => "bgcolor", - } -ATTRS97 = { - 0x1b5a => "rows", - 0x1b5b => "cols", - 0x1b5c => "wrap", - 0x1b5d => "readonly", - 0x8001 => "name", - 0x977b => "onselect", - 0x978f => "onchange", - } -ATTRS98 = { - 0x8049 => "align", - 0x93a8 => "valign", - 0xfe0c => "bgcolor", - } -ATTRS99 = { - 0x07d2 => "rowspan", - 0x07d3 => "colspan", - 0x8006 => "width", - 0x8007 => "height", - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x938a => "background", - 0x938e => "nowrap", - 0x93a5 => "bordercolor", - 0x93a6 => "bordercolorlight", - 0x93a7 => "bordercolordark", - 0x93a8 => "valign", - 0xfe0c => "bgcolor", - } -ATTRS100 = { - 0x8049 => "align", - 0x93a8 => "valign", - 0xfe0c => "bgcolor", - } -ATTRS102 = { - 0x8007 => "height", - 0x8046 => "title", - 0x8049 => "align", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x93a5 => "bordercolor", - 0x93a6 => "bordercolorlight", - 0x93a7 => "bordercolordark", - 0x93a8 => "valign", - 0xfe0c => "bgcolor", - } -ATTRS103 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS104 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS105 = { - 0x03eb => "compact", - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - 0x939a => "type", - } -ATTRS106 = { - 0x8046 => "title", - 0x804b => "style", - 0x83ea => "class", - 0x83eb => "id", - } -ATTRS108 = { - 0x9399 => "clear", - } - TAGS = [ None, None, @@ -893,6 +110,789 @@ TAGS = [ None, ] +ATTRS0 = { + 0x8010: "tabindex", + 0x8046: "title", + 0x804b: "style", + 0x804d: "disabled", + 0x83ea: "class", + 0x83eb: "id", + 0x83fe: "datafld", + 0x83ff: "datasrc", + 0x8400: "dataformatas", + 0x87d6: "accesskey", + 0x9392: "lang", + 0x93ed: "language", + 0x93fe: "dir", + 0x9771: "onmouseover", + 0x9772: "onmouseout", + 0x9773: "onmousedown", + 0x9774: "onmouseup", + 0x9775: "onmousemove", + 0x9776: "onkeydown", + 0x9777: "onkeyup", + 0x9778: "onkeypress", + 0x9779: "onclick", + 0x977a: "ondblclick", + 0x977e: "onhelp", + 0x977f: "onfocus", + 0x9780: "onblur", + 0x9783: "onrowexit", + 0x9784: "onrowenter", + 0x9786: "onbeforeupdate", + 0x9787: "onafterupdate", + 0x978a: "onreadystatechange", + 0x9790: "onscroll", + 0x9794: "ondragstart", + 0x9795: "onresize", 
+ 0x9796: "onselectstart", + 0x9797: "onerrorupdate", + 0x9799: "ondatasetchanged", + 0x979a: "ondataavailable", + 0x979b: "ondatasetcomplete", + 0x979c: "onfilterchange", + 0x979f: "onlosecapture", + 0x97a0: "onpropertychange", + 0x97a2: "ondrag", + 0x97a3: "ondragend", + 0x97a4: "ondragenter", + 0x97a5: "ondragover", + 0x97a6: "ondragleave", + 0x97a7: "ondrop", + 0x97a8: "oncut", + 0x97a9: "oncopy", + 0x97aa: "onpaste", + 0x97ab: "onbeforecut", + 0x97ac: "onbeforecopy", + 0x97ad: "onbeforepaste", + 0x97af: "onrowsdelete", + 0x97b0: "onrowsinserted", + 0x97b1: "oncellchange", + 0x97b2: "oncontextmenu", + 0x97b6: "onbeforeeditfocus", + } +ATTRS3 = { + 0x0001: "href", + 0x03ec: "target", + 0x03ee: "rel", + 0x03ef: "rev", + 0x03f0: "urn", + 0x03f1: "methods", + 0x8001: "name", + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS5 = { + 0x9399: "clear", + } +ATTRS6 = { + 0x8001: "name", + 0x8006: "width", + 0x8007: "height", + 0x804a: "align", + 0x8bbb: "classid", + 0x8bbc: "data", + 0x8bbf: "codebase", + 0x8bc0: "codetype", + 0x8bc1: "code", + 0x8bc2: "type", + 0x8bc5: "vspace", + 0x8bc6: "hspace", + 0x978e: "onerror", + } +ATTRS7 = { + 0x0001: "href", + 0x03ea: "shape", + 0x03eb: "coords", + 0x03ed: "target", + 0x03ee: "alt", + 0x03ef: "nohref", + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS8 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS9 = { + 0x03ec: "href", + 0x03ed: "target", + } +ATTRS10 = { + 0x938b: "color", + 0x939b: "face", + 0x93a3: "size", + } +ATTRS12 = { + 0x03ea: "src", + 0x03eb: "loop", + 0x03ec: "volume", + 0x03ed: "balance", + } +ATTRS13 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS15 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS16 = { + 0x07db: "link", + 0x07dc: "alink", + 0x07dd: "vlink", + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x938a: "background", + 0x938b: "text", + 0x938e: "nowrap", + 0x93ae: "topmargin", + 0x93af: "rightmargin", + 0x93b0: "bottommargin", + 0x93b1: "leftmargin", + 0x93b6: "bgproperties", + 0x93d8: "scroll", + 0x977b: "onselect", + 0x9791: "onload", + 0x9792: "onunload", + 0x9798: "onbeforeunload", + 0x97b3: "onbeforeprint", + 0x97b4: "onafterprint", + 0xfe0c: "bgcolor", + } +ATTRS17 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS18 = { + 0x07d1: "type", + 0x8001: "name", + } +ATTRS19 = { + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x93a8: "valign", + } +ATTRS20 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS21 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS22 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS23 = { + 0x03ea: "span", + 0x8006: "width", + 0x8049: "align", + 0x93a8: "valign", + 0xfe0c: "bgcolor", + } +ATTRS24 = { + 0x03ea: "span", + 0x8006: "width", + 0x8049: "align", + 0x93a8: "valign", + 0xfe0c: "bgcolor", + } +ATTRS27 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x938e: "nowrap", + } +ATTRS29 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS31 = { + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x938e: "nowrap", + } +ATTRS32 = { + 
0x03ea: "compact", + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS33 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x938e: "nowrap", + } +ATTRS34 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS35 = { + 0x8001: "name", + 0x8006: "width", + 0x8007: "height", + 0x804a: "align", + 0x8bbd: "palette", + 0x8bbe: "pluginspage", + 0x8bbf: "codebase", + 0x8bbf: "src", + 0x8bc1: "units", + 0x8bc2: "type", + 0x8bc3: "hidden", + } +ATTRS36 = { + 0x804a: "align", + } +ATTRS37 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x938b: "color", + 0x939b: "face", + 0x939c: "size", + } +ATTRS38 = { + 0x03ea: "action", + 0x03ec: "enctype", + 0x03ed: "method", + 0x03ef: "target", + 0x03f4: "accept-charset", + 0x8001: "name", + 0x977c: "onsubmit", + 0x977d: "onreset", + } +ATTRS39 = { + 0x8000: "align", + 0x8001: "name", + 0x8bb9: "src", + 0x8bbb: "border", + 0x8bbc: "frameborder", + 0x8bbd: "framespacing", + 0x8bbe: "marginwidth", + 0x8bbf: "marginheight", + 0x8bc0: "noresize", + 0x8bc1: "scrolling", + 0x8fa2: "bordercolor", + } +ATTRS40 = { + 0x03e9: "rows", + 0x03ea: "cols", + 0x03eb: "border", + 0x03ec: "bordercolor", + 0x03ed: "frameborder", + 0x03ee: "framespacing", + 0x8001: "name", + 0x9791: "onload", + 0x9792: "onunload", + 0x9798: "onbeforeunload", + 0x97b3: "onbeforeprint", + 0x97b4: "onafterprint", + } +ATTRS42 = { + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS43 = { + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS44 = { + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS45 = { + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS46 = { + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS47 = { + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS49 = { + 0x03ea: "noshade", + 0x8006: "width", + 0x8007: "size", + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x938b: "color", + } +ATTRS51 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS52 = { + 0x8001: "name", + 0x8006: "width", + 0x8007: "height", + 0x804a: "align", + 0x8bb9: "src", + 0x8bbb: "border", + 0x8bbc: "frameborder", + 0x8bbd: "framespacing", + 0x8bbe: "marginwidth", + 0x8bbf: "marginheight", + 0x8bc0: "noresize", + 0x8bc1: "scrolling", + 0x8fa2: "vspace", + 0x8fa3: "hspace", + } +ATTRS53 = { + 0x03eb: "alt", + 0x03ec: "src", + 0x03ed: "border", + 0x03ee: "vspace", + 0x03ef: "hspace", + 0x03f0: "lowsrc", + 0x03f1: "vrml", + 0x03f2: "dynsrc", + 0x03f4: "loop", + 0x03f6: "start", + 0x07d3: "ismap", + 0x07d9: "usemap", + 0x8001: "name", + 0x8006: "width", + 0x8007: "height", + 0x8046: "title", + 0x804a: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x978d: "onabort", + 0x978e: "onerror", + 0x9791: "onload", + } +ATTRS54 = { + 0x07d1: "type", + 0x07d3: "size", + 0x07d4: "maxlength", + 0x07d6: "readonly", + 0x07d8: "indeterminate", + 0x07da: "checked", + 0x07db: "alt", + 0x07dc: "src", + 0x07dd: "border", + 0x07de: "vspace", + 0x07df: "hspace", + 0x07e0: "lowsrc", + 0x07e1: "vrml", + 0x07e2: 
"dynsrc", + 0x07e4: "loop", + 0x07e5: "start", + 0x8001: "name", + 0x8006: "width", + 0x8007: "height", + 0x804a: "align", + 0x93ee: "value", + 0x977b: "onselect", + 0x978d: "onabort", + 0x978e: "onerror", + 0x978f: "onchange", + 0x9791: "onload", + } +ATTRS56 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS57 = { + 0x03e9: "for", + } +ATTRS58 = { + 0x804a: "align", + } +ATTRS59 = { + 0x03ea: "value", + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x939a: "type", + } +ATTRS60 = { + 0x03ee: "href", + 0x03ef: "rel", + 0x03f0: "rev", + 0x03f1: "type", + 0x03f9: "media", + 0x03fa: "target", + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x978e: "onerror", + 0x9791: "onload", + } +ATTRS61 = { + 0x9399: "clear", + } +ATTRS62 = { + 0x8001: "name", + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS63 = { + 0x1771: "scrolldelay", + 0x1772: "direction", + 0x1773: "behavior", + 0x1774: "scrollamount", + 0x1775: "loop", + 0x1776: "vspace", + 0x1777: "hspace", + 0x1778: "truespeed", + 0x8006: "width", + 0x8007: "height", + 0x9785: "onbounce", + 0x978b: "onfinish", + 0x978c: "onstart", + 0xfe0c: "bgcolor", + } +ATTRS65 = { + 0x03ea: "http-equiv", + 0x03eb: "content", + 0x03ec: "url", + 0x03f6: "charset", + 0x8001: "name", + } +ATTRS66 = { + 0x03f5: "n", + } +ATTRS71 = { + 0x8000: "border", + 0x8000: "usemap", + 0x8001: "name", + 0x8006: "width", + 0x8007: "height", + 0x8046: "title", + 0x804a: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x8bbb: "classid", + 0x8bbc: "data", + 0x8bbf: "codebase", + 0x8bc0: "codetype", + 0x8bc1: "code", + 0x8bc2: "type", + 0x8bc5: "vspace", + 0x8bc6: "hspace", + 0x978e: "onerror", + } +ATTRS72 = { + 0x03eb: "compact", + 0x03ec: "start", + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x939a: "type", + } +ATTRS73 = { + 0x03ea: "selected", + 0x03eb: "value", + } +ATTRS74 = { + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS75 = { + 0x8000: "name", + 0x8000: "value", + 0x8000: "type", + } +ATTRS76 = { + 0x9399: "clear", + } +ATTRS77 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x9399: "clear", + } +ATTRS78 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS82 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS83 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS84 = { + 0x03ea: "src", + 0x03ed: "for", + 0x03ee: "event", + 0x03f0: "defer", + 0x03f2: "type", + 0x978e: "onerror", + } +ATTRS85 = { + 0x03eb: "size", + 0x03ec: "multiple", + 0x8000: "align", + 0x8001: "name", + 0x978f: "onchange", + } +ATTRS86 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS87 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS88 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS89 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS90 = { + 0x03eb: "type", + 0x03ef: "media", + 0x8046: "title", + 0x978e: "onerror", + 0x9791: "onload", + } +ATTRS91 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS92 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS93 = { + 0x03ea: "cols", + 0x03eb: "border", + 
0x03ec: "rules", + 0x03ed: "frame", + 0x03ee: "cellspacing", + 0x03ef: "cellpadding", + 0x03fa: "datapagesize", + 0x8006: "width", + 0x8007: "height", + 0x8046: "title", + 0x804a: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x938a: "background", + 0x93a5: "bordercolor", + 0x93a6: "bordercolorlight", + 0x93a7: "bordercolordark", + 0xfe0c: "bgcolor", + } +ATTRS94 = { + 0x8049: "align", + 0x93a8: "valign", + 0xfe0c: "bgcolor", + } +ATTRS95 = { + 0x8049: "align", + 0x93a8: "valign", + } +ATTRS96 = { + 0x07d2: "rowspan", + 0x07d3: "colspan", + 0x8006: "width", + 0x8007: "height", + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x938a: "background", + 0x938e: "nowrap", + 0x93a5: "bordercolor", + 0x93a6: "bordercolorlight", + 0x93a7: "bordercolordark", + 0x93a8: "valign", + 0xfe0c: "bgcolor", + } +ATTRS97 = { + 0x1b5a: "rows", + 0x1b5b: "cols", + 0x1b5c: "wrap", + 0x1b5d: "readonly", + 0x8001: "name", + 0x977b: "onselect", + 0x978f: "onchange", + } +ATTRS98 = { + 0x8049: "align", + 0x93a8: "valign", + 0xfe0c: "bgcolor", + } +ATTRS99 = { + 0x07d2: "rowspan", + 0x07d3: "colspan", + 0x8006: "width", + 0x8007: "height", + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x938a: "background", + 0x938e: "nowrap", + 0x93a5: "bordercolor", + 0x93a6: "bordercolorlight", + 0x93a7: "bordercolordark", + 0x93a8: "valign", + 0xfe0c: "bgcolor", + } +ATTRS100 = { + 0x8049: "align", + 0x93a8: "valign", + 0xfe0c: "bgcolor", + } +ATTRS102 = { + 0x8007: "height", + 0x8046: "title", + 0x8049: "align", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x93a5: "bordercolor", + 0x93a6: "bordercolorlight", + 0x93a7: "bordercolordark", + 0x93a8: "valign", + 0xfe0c: "bgcolor", + } +ATTRS103 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS104 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS105 = { + 0x03eb: "compact", + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + 0x939a: "type", + } +ATTRS106 = { + 0x8046: "title", + 0x804b: "style", + 0x83ea: "class", + 0x83eb: "id", + } +ATTRS108 = { + 0x9399: "clear", + } + TAGS_ATTRS = [ None, None, @@ -1005,4 +1005,4 @@ TAGS_ATTRS = [ None, ] -MAP = (TAGS, TAGS_ATTRS, ATTRS0) +MAP = (TAGS, ATTRS0, TAGS_ATTRS) diff --git a/src/calibre/ebooks/lit/maps/opf.py b/src/calibre/ebooks/lit/maps/opf.py index a39e6bf8e8..cc1acc4dfa 100644 --- a/src/calibre/ebooks/lit/maps/opf.py +++ b/src/calibre/ebooks/lit/maps/opf.py @@ -1,28 +1,3 @@ -ATTRS = { - 0x0001 => "href", - 0x0002 => "%never-used", - 0x0003 => "%guid", - 0x0004 => "%minimum_level", - 0x0005 => "%attr5", - 0x0006 => "id", - 0x0007 => "href", - 0x0008 => "media-type", - 0x0009 => "fallback", - 0x000A => "idref", - 0x000B => "xmlns:dc", - 0x000C => "xmlns:oebpackage", - 0x000D => "role", - 0x000E => "file-as", - 0x000F => "event", - 0x0010 => "scheme", - 0x0011 => "title", - 0x0012 => "type", - 0x0013 => "unique-identifier", - 0x0014 => "name", - 0x0015 => "content", - 0x0016 => "xml:lang", - } - TAGS = [ None, "package", @@ -69,6 +44,31 @@ TAGS = [ None, ] -TAGS_ATTR = [{} for i in xrange(43)] +ATTRS = { + 0x0001: "href", + 0x0002: "%never-used", + 0x0003: "%guid", + 0x0004: "%minimum_level", + 0x0005: "%attr5", + 0x0006: "id", + 0x0007: "href", + 0x0008: "media-type", + 0x0009: "fallback", + 0x000A: "idref", + 0x000B: "xmlns:dc", + 0x000C: "xmlns:oebpackage", + 0x000D: "role", + 0x000E: "file-as", + 0x000F: "event", + 
0x0010: "scheme", + 0x0011: "title", + 0x0012: "type", + 0x0013: "unique-identifier", + 0x0014: "name", + 0x0015: "content", + 0x0016: "xml:lang", + } -MAP = (TAGS, TAGS_ATTRS, ATTRS0) +TAGS_ATTRS = [{} for i in xrange(43)] + +MAP = (TAGS, ATTRS, TAGS_ATTRS) diff --git a/src/calibre/ebooks/lit/mssha1.py b/src/calibre/ebooks/lit/mssha1.py new file mode 100644 index 0000000000..f6f7c33444 --- /dev/null +++ b/src/calibre/ebooks/lit/mssha1.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python +# -*- coding: iso-8859-1 + +"""A sample implementation of SHA-1 in pure Python. + + Framework adapted from Dinu Gherman's MD5 implementation by + J. Hallén and L. Creighton. SHA-1 implementation based directly on + the text of the NIST standard FIPS PUB 180-1. +""" + + +__date__ = '2004-11-17' +__version__ = 0.91 # Modernised by J. Hallén and L. Creighton for Pypy + + +import struct, copy + + +# ====================================================================== +# Bit-Manipulation helpers +# +# _long2bytes() was contributed by Barry Warsaw +# and is reused here with tiny modifications. +# ====================================================================== + +def _long2bytesBigEndian(n, blocksize=0): + """Convert a long integer to a byte string. + + If optional blocksize is given and greater than zero, pad the front + of the byte string with binary zeros so that the length is a multiple + of blocksize. + """ + + # After much testing, this algorithm was deemed to be the fastest. + s = '' + pack = struct.pack + while n > 0: + s = pack('>I', n & 0xffffffffL) + s + n = n >> 32 + + # Strip off leading zeros. + for i in range(len(s)): + if s[i] != '\000': + break + else: + # Only happens when n == 0. + s = '\000' + i = 0 + + s = s[i:] + + # Add back some pad bytes. This could be done more efficiently + # w.r.t. the de-padding being done above, but sigh... + if blocksize > 0 and len(s) % blocksize: + s = (blocksize - len(s) % blocksize) * '\000' + s + + return s + + +def _bytelist2longBigEndian(list): + "Transform a list of characters into a list of longs." + + imax = len(list)/4 + hl = [0L] * imax + + j = 0 + i = 0 + while i < imax: + b0 = long(ord(list[j])) << 24 + b1 = long(ord(list[j+1])) << 16 + b2 = long(ord(list[j+2])) << 8 + b3 = long(ord(list[j+3])) + hl[i] = b0 | b1 | b2 | b3 + i = i+1 + j = j+4 + + return hl + + +def _rotateLeft(x, n): + "Rotate x (32 bit) left n bits circularly." + + return (x << n) | (x >> (32-n)) + + +# ====================================================================== +# The SHA transformation functions +# +# ====================================================================== + +def f0_19(B, C, D): + return (B & (C ^ D)) ^ D + +def f20_39(B, C, D): + return B ^ C ^ D + +def f40_59(B, C, D): + return ((B | C) & D) | (B & C) + +def f60_79(B, C, D): + return B ^ C ^ D + +def f6_42(B, C, D): + return (B + C) ^ C + +f = [f0_19]*20 + [f20_39]*20 + [f40_59]*20 + [f60_79]*20 +f[3] = f20_39 +f[6] = f6_42 +f[10] = f20_39 +f[15] = f20_39 +f[26] = f0_19 +f[31] = f40_59 +f[42] = f6_42 +f[51] = f20_39 +f[68] = f0_19 + + +# Constants to be used +K = [ + 0x5A827999L, # ( 0 <= t <= 19) + 0x6ED9EBA1L, # (20 <= t <= 39) + 0x8F1BBCDCL, # (40 <= t <= 59) + 0xCA62C1D6L # (60 <= t <= 79) + ] + +class sha: + "An implementation of the MD5 hash function in pure Python." + + def __init__(self): + "Initialisation." + + # Initial message length in bits(!). + self.length = 0L + self.count = [0, 0] + + # Initial empty message as a sequence of bytes (8 bit characters). 
+ self.input = [] + + # Call a separate init function, that can be used repeatedly + # to start from scratch on the same object. + self.init() + + + def init(self): + "Initialize the message-digest and set all fields to zero." + + self.length = 0L + self.input = [] + + # Initial 160 bit message digest (5 times 32 bit). + self.H0 = 0x32107654L + self.H1 = 0x23016745L + self.H2 = 0xC4E680A2L + self.H3 = 0xDC679823L + self.H4 = 0xD0857A34L + + def _transform(self, W): + for t in range(16, 80): + W.append(_rotateLeft( + W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) & 0xffffffffL) + + A = self.H0 + B = self.H1 + C = self.H2 + D = self.H3 + E = self.H4 + + for t in xrange(0, 80): + TEMP = _rotateLeft(A, 5) + f[t](B, C, D) + E + W[t] + K[t/20] + E = D + D = C + C = _rotateLeft(B, 30) & 0xffffffffL + B = A + A = TEMP & 0xffffffffL + + self.H0 = (self.H0 + A) & 0xffffffffL + self.H1 = (self.H1 + B) & 0xffffffffL + self.H2 = (self.H2 + C) & 0xffffffffL + self.H3 = (self.H3 + D) & 0xffffffffL + self.H4 = (self.H4 + E) & 0xffffffffL + + + # Down from here all methods follow the Python Standard Library + # API of the sha module. + + def update(self, inBuf): + """Add to the current message. + + Update the sha object with the string arg. Repeated calls + are equivalent to a single call with the concatenation of all + the arguments, i.e. s.update(a); s.update(b) is equivalent + to s.update(a+b). + + The hash is immediately calculated for all full blocks. The final + calculation is made in digest(). It will calculate 1-2 blocks, + depending on how much padding we have to add. This allows us to + keep an intermediate value for the hash, so that we only need to + make minimal recalculation if we call update() to add more data + to the hashed string. + """ + + leninBuf = long(len(inBuf)) + + # Compute number of bytes mod 64. + index = (self.count[1] >> 3) & 0x3FL + + # Update number of bits. + self.count[1] = self.count[1] + (leninBuf << 3) + if self.count[1] < (leninBuf << 3): + self.count[0] = self.count[0] + 1 + self.count[0] = self.count[0] + (leninBuf >> 29) + + partLen = 64 - index + + if leninBuf >= partLen: + self.input[index:] = list(inBuf[:partLen]) + self._transform(_bytelist2longBigEndian(self.input)) + i = partLen + while i + 63 < leninBuf: + self._transform(_bytelist2longBigEndian(list(inBuf[i:i+64]))) + i = i + 64 + else: + self.input = list(inBuf[i:leninBuf]) + else: + i = 0 + self.input = self.input + list(inBuf) + + + def digest(self): + """Terminate the message-digest computation and return digest. + + Return the digest of the strings passed to the update() + method so far. This is a 16-byte string which may contain + non-ASCII characters, including null bytes. + """ + + H0 = self.H0 + H1 = self.H1 + H2 = self.H2 + H3 = self.H3 + H4 = self.H4 + input = [] + self.input + count = [] + self.count + + index = (self.count[1] >> 3) & 0x3fL + + if index < 56: + padLen = 56 - index + else: + padLen = 120 - index + + padding = ['\200'] + ['\000'] * 63 + self.update(padding[:padLen]) + + # Append length (before padding). + bits = _bytelist2longBigEndian(self.input[:56]) + count + + self._transform(bits) + + # Store state in digest. 
+ digest = _long2bytesBigEndian(self.H0, 4) + \ + _long2bytesBigEndian(self.H1, 4) + \ + _long2bytesBigEndian(self.H2, 4) + \ + _long2bytesBigEndian(self.H3, 4) + \ + _long2bytesBigEndian(self.H4, 4) + + self.H0 = H0 + self.H1 = H1 + self.H2 = H2 + self.H3 = H3 + self.H4 = H4 + self.input = input + self.count = count + + return digest + + + def hexdigest(self): + """Terminate and return digest in HEX form. + + Like digest() except the digest is returned as a string of + length 32, containing only hexadecimal digits. This may be + used to exchange the value safely in email or other non- + binary environments. + """ + return ''.join(['%02x' % ord(c) for c in self.digest()]) + + def copy(self): + """Return a clone object. + + Return a copy ('clone') of the md5 object. This can be used + to efficiently compute the digests of strings that share + a common initial substring. + """ + + return copy.deepcopy(self) + + +# ====================================================================== +# Mimic Python top-level functions from standard library API +# for consistency with the md5 module of the standard library. +# ====================================================================== + +# These are mandatory variables in the module. They have constant values +# in the SHA standard. + +digest_size = digestsize = 20 +blocksize = 1 + +def new(arg=None): + """Return a new sha crypto object. + + If arg is present, the method call update(arg) is made. + """ + + crypto = sha() + if arg: + crypto.update(arg) + + return crypto + +if __name__ == '__main__': + def main(): + import sys + file = None + if len(sys.argv) > 2: + print "usage: %s [FILE]" % sys.argv[0] + return + elif len(sys.argv) < 2: + file = sys.stdin + else: + file = open(sys.argv[1], 'rb') + context = new() + data = file.read(16384) + while data: + context.update(data) + data = file.read(16384) + file.close() + digest = context.hexdigest().upper() + for i in xrange(0, 40, 8): + print digest[i:i+8], + print + main() diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 1a0f42f8db..711aef6586 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -5,6 +5,7 @@ Support for reading the metadata from a lit file. 
''' import sys, struct, cStringIO, os +import functools from itertools import repeat from calibre import relpath @@ -13,6 +14,31 @@ from calibre.ebooks.metadata.opf import OPFReader from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP +OPF_DECL = """" + +""" +XHTML_DECL = """ + +""" + +class DirectoryEntry(object): + def __init__(self, name, section, offset, size): + self.name = name + self.section = section + self.offset = offset + self.size = size + + def __repr__(self): + return "" \ + % (self.name, self.section, self.offset, self.size) + + def __str__(self): + return repr(self) + def u32(bytes): return struct.unpack('') index = self.binary_to_text(base=index, depth=depth+1) - is_goingdown = 0 + is_goingdown = False if not tag_name: raise LitError('Tag ends before it begins.') self.buf.write('') @@ -222,7 +248,7 @@ class UnBinary(object): if not in_censorship: self.buf.write(c) count -= 1 - elif count == 0: + if count == 0: if not in_censorship: self.buf.write('"') in_censorship = False @@ -268,7 +294,7 @@ class UnBinary(object): href += c count -= 1 if count == 0: - doc, m, frag = href.partition('#') + doc, m, frag = href[1:].partition('#') path = self.item_path(doc) if m and frag: path += m + frag @@ -297,100 +323,74 @@ class ManifestItem(object): def __repr__(self): return self.internal + u'->' + self.path +def preserve(function): + def wrapper(self, *args, **kwargs): + opos = self._stream.tell() + try: + return function(self, *args, **kwargs) + finally: + self._stream.seek(opos) + functools.update_wrapper(wrapper, function) + return wrapper + class LitFile(object): PIECE_SIZE = 16 def magic(): + @preserve def fget(self): - val = None - opos = self._stream.tell() - try: - self._stream.seek(0) - val = self._stream.read(8) - finally: - self._stream.seek(opos) - return val + self._stream.seek(0) + return self._stream.read(8) return property(fget=fget) magic = magic() def version(): def fget(self): - val = None - opos = self._stream.tell() - try: - self._stream.seek(8) - val = u32(self._stream.read(4)) - finally: - self._stream.seek(opos) - return val + self._stream.seek(8) + return u32(self._stream.read(4)) return property(fget=fget) version = version() def hdr_len(): + @preserve def fget(self): - val = None - opos = self._stream.tell() - try: - self._stream.seek(12) - val = int32(self._stream.read(4)) - finally: - self._stream.seek(opos) - return val + self._stream.seek(12) + return int32(self._stream.read(4)) return property(fget=fget) hdr_len = hdr_len() def num_pieces(): + @preserve def fget(self): - val = None - opos = self._stream.tell() - try: - self._stream.seek(16) - val = int32(self._stream.read(4)) - finally: - self._stream.seek(opos) - return val + self._stream.seek(16) + return int32(self._stream.read(4)) return property(fget=fget) num_pieces = num_pieces() def sec_hdr_len(): + @preserve def fget(self): - val = None - opos = self._stream.tell() - try: - self._stream.seek(20) - val = int32(self._stream.read(4)) - finally: - self._stream.seek(opos) - return val + self._stream.seek(20) + return int32(self._stream.read(4)) return property(fget=fget) sec_hdr_len = sec_hdr_len() def guid(): + @preserve def fget(self): - val = None - opos = self._stream.tell() - try: - self._stream.seek(24) - val = self._stream.read(16) - finally: - self._stream.seek(opos) - return val + self._stream.seek(24) + return self._stream.read(16) return property(fget=fget) guid = guid() def header(): + @preserve def fget(self): - val = None - opos = 
self._stream.tell() - try: - size = self.hdr_len \ - + (self.num_pieces * self.PIECE_SIZE) \ - + self.sec_hdr_len - self._stream.seek(0) - val = self._stream.read(size) - finally: - self._stream.seek(opos) - return val + size = self.hdr_len \ + + (self.num_pieces * self.PIECE_SIZE) \ + + self.sec_hdr_len + self._stream.seek(0) + return self._stream.read(size) return property(fget=fget) header = header() @@ -402,70 +402,64 @@ class LitFile(object): raise LitError('Unknown LIT version %d'%(self.version,)) self.read_secondary_header() self.read_header_pieces() - - def read_secondary_header(self): - opos = self._stream.tell() - try: - self._stream.seek(self.hdr_len + self.num_pieces*self.PIECE_SIZE) - bytes = self._stream.read(self.sec_hdr_len) - offset = int32(bytes[4:]) - while offset < len(bytes): - blocktype = bytes[offset:offset+4] - blockver = u32(bytes[offset+4:]) - if blocktype == 'CAOL': - if blockver != 2: - raise LitError( - 'Unknown CAOL block format %d' % blockver) - self.creator_id = u32(bytes[offset+12:]) - self.entry_chunklen = u32(bytes[offset+20:]) - self.count_chunklen = u32(bytes[offset+24:]) - self.entry_unknown = u32(bytes[offset+28:]) - self.count_unknown = u32(bytes[offset+32:]) - offset += 48 - elif blocktype == 'ITSF': - if blockver != 4: - raise LitError( - 'Unknown ITSF block format %d' % blockver) - if u32(bytes[offset+4+16:]): - raise LitError('This file has a 64bit content offset') - self.content_offset = u32(bytes[offset+16:]) - self.timestamp = u32(bytes[offset+24:]) - self.language_id = u32(bytes[offset+28:]) - offset += 48 - if not hasattr(self, 'content_offset'): - raise LitError('Could not figure out the content offset') - finally: - self._stream.seek(opos) + @preserve + def read_secondary_header(self): + self._stream.seek(self.hdr_len + self.num_pieces*self.PIECE_SIZE) + bytes = self._stream.read(self.sec_hdr_len) + offset = int32(bytes[4:]) + while offset < len(bytes): + blocktype = bytes[offset:offset+4] + blockver = u32(bytes[offset+4:]) + if blocktype == 'CAOL': + if blockver != 2: + raise LitError( + 'Unknown CAOL block format %d' % blockver) + self.creator_id = u32(bytes[offset+12:]) + self.entry_chunklen = u32(bytes[offset+20:]) + self.count_chunklen = u32(bytes[offset+24:]) + self.entry_unknown = u32(bytes[offset+28:]) + self.count_unknown = u32(bytes[offset+32:]) + offset += 48 + elif blocktype == 'ITSF': + if blockver != 4: + raise LitError( + 'Unknown ITSF block format %d' % blockver) + if u32(bytes[offset+4+16:]): + raise LitError('This file has a 64bit content offset') + self.content_offset = u32(bytes[offset+16:]) + self.timestamp = u32(bytes[offset+24:]) + self.language_id = u32(bytes[offset+28:]) + offset += 48 + if not hasattr(self, 'content_offset'): + raise LitError('Could not figure out the content offset') + + @preserve def read_header_pieces(self): - opos = self._stream.tell() - try: - src = self.header[self.hdr_len:] - for i in range(self.num_pieces): - piece = src[i*self.PIECE_SIZE:(i+1)*self.PIECE_SIZE] - if u32(piece[4:]) != 0 or u32(piece[12:]) != 0: - raise LitError('Piece %s has 64bit value' % repr(piece)) - offset, size = u32(piece), int32(piece[8:]) - self._stream.seek(offset) - piece = self._stream.read(size) - if i == 0: - continue # Dont need this piece - elif i == 1: - if u32(piece[8:]) != self.entry_chunklen or \ - u32(piece[12:]) != self.entry_unknown: - raise LitError('Secondary header does not match piece') - self.read_directory(piece) - elif i == 2: - if u32(piece[8:]) != self.count_chunklen or \ - u32(piece[12:]) != 
self.count_unknown: - raise LitError('Secondary header does not match piece') - continue # No data needed from this piece - elif i == 3: - self.piece3_guid = piece - elif i == 4: - self.piece4_guid = piece - finally: - self._stream.seek(opos) + src = self.header[self.hdr_len:] + for i in range(self.num_pieces): + piece = src[i*self.PIECE_SIZE:(i+1)*self.PIECE_SIZE] + if u32(piece[4:]) != 0 or u32(piece[12:]) != 0: + raise LitError('Piece %s has 64bit value' % repr(piece)) + offset, size = u32(piece), int32(piece[8:]) + self._stream.seek(offset) + piece = self._stream.read(size) + if i == 0: + continue # Dont need this piece + elif i == 1: + if u32(piece[8:]) != self.entry_chunklen or \ + u32(piece[12:]) != self.entry_unknown: + raise LitError('Secondary header does not match piece') + self.read_directory(piece) + elif i == 2: + if u32(piece[8:]) != self.count_chunklen or \ + u32(piece[12:]) != self.count_unknown: + raise LitError('Secondary header does not match piece') + continue # No data needed from this piece + elif i == 3: + self.piece3_guid = piece + elif i == 4: + self.piece4_guid = piece def read_directory(self, piece): self.entries = [] @@ -521,108 +515,88 @@ class LitFile(object): if not hasattr(self, 'manifest'): raise LitError('Lit file does not have a valid manifest') - - def read_section_names(self, entry): - opos = self._stream.tell() - try: - self._stream.seek(self.content_offset + entry.offset) - raw = self._stream.read(entry.size) - if len(raw) < 4: - raise LitError('Invalid Namelist section') - pos = 4 - self.num_sections = u16(raw[2:pos]) - - self.sections = {} - for section in range(self.num_sections): - size = u16(raw[pos:pos+2]) - pos += 2 - size = size*2 + 2 - if pos + size > len(raw): - raise LitError('Invalid Namelist section') - self.sections[section] = raw[pos:pos+size].decode('utf-16-le') - pos += size - finally: - self._stream.seek(opos) - - def read_manifest(self, entry): - opos = self._stream.tell() - try: - self.manifest = [] - self._stream.seek(self.content_offset + entry.offset) - raw = self._stream.read(entry.size) - pos = 0 - while pos < len(raw): - size = ord(raw[pos]) - if size == 0: break - pos += 1 - root = raw[pos:pos+size].decode('utf8') - pos += size - if pos >= len(raw): - raise LitError('Truncated manifest.') - for state in ['spine', 'not spine', 'css', 'images']: - num_files = int32(raw[pos:pos+4]) - pos += 4 - if num_files == 0: continue - - i = 0 - while i < num_files: - if pos+5 >= len(raw): - raise LitError('Truncated manifest.') - offset = u32(raw[pos:pos+4]) - pos += 4 - - slen = ord(raw[pos]) - pos += 1 - internal = raw[pos:pos+slen].decode('utf8') - pos += slen - - slen = ord(raw[pos]) - pos += 1 - original = raw[pos:pos+slen].decode('utf8') - pos += slen - - slen = ord(raw[pos]) - pos += 1 - mime_type = raw[pos:pos+slen].decode('utf8') - pos += slen + 1 - - self.manifest.append( - ManifestItem(original, internal, mime_type, - offset, root, state)) - i += 1 - finally: - self._stream.seek(opos) - - def read_meta(self, entry): - opos = self._stream.tell() - try: - self._stream.seek(self.content_offset + entry.offset) - raw = self._stream.read(entry.size) - xml = \ -'''\ - - -'''+\ - unicode(UnBinary(raw, self.manifest)) - self.meta = xml - finally: - self._stream.seek(opos) - + @preserve + def read_section_names(self, entry): + self._stream.seek(self.content_offset + entry.offset) + raw = self._stream.read(entry.size) + if len(raw) < 4: + raise LitError('Invalid Namelist section') + pos = 4 + self.num_sections = u16(raw[2:pos]) + + 
self.sections = {} + for section in range(self.num_sections): + size = u16(raw[pos:pos+2]) + pos += 2 + size = size*2 + 2 + if pos + size > len(raw): + raise LitError('Invalid Namelist section') + self.sections[section] = raw[pos:pos+size].decode('utf-16-le') + pos += size + + @preserve + def read_manifest(self, entry): + self.manifest = [] + self._stream.seek(self.content_offset + entry.offset) + raw = self._stream.read(entry.size) + pos = 0 + while pos < len(raw): + size = ord(raw[pos]) + if size == 0: break + pos += 1 + root = raw[pos:pos+size].decode('utf8') + pos += size + if pos >= len(raw): + raise LitError('Truncated manifest.') + for state in ['spine', 'not spine', 'css', 'images']: + num_files = int32(raw[pos:pos+4]) + pos += 4 + if num_files == 0: continue + + i = 0 + while i < num_files: + if pos+5 >= len(raw): + raise LitError('Truncated manifest.') + offset = u32(raw[pos:pos+4]) + pos += 4 + + slen = ord(raw[pos]) + pos += 1 + internal = raw[pos:pos+slen].decode('utf8') + pos += slen + + slen = ord(raw[pos]) + pos += 1 + original = raw[pos:pos+slen].decode('utf8') + pos += slen + + slen = ord(raw[pos]) + pos += 1 + mime_type = raw[pos:pos+slen].decode('utf8') + pos += slen + 1 + + self.manifest.append( + ManifestItem(original, internal, mime_type, + offset, root, state)) + i += 1 + + @preserve + def read_meta(self, entry): + self._stream.seek(self.content_offset + entry.offset) + raw = self._stream.read(entry.size) + xml = OPF_DECL + unicode(UnBinary(raw, self.manifest)) + self.meta = xml + + @preserve def read_image(self, internal_name): cover_entry = None for entry in self.entries: if internal_name in entry.name: cover_entry = entry break - opos = self._stream.tell() - try: - self._stream.seek(self.content_offset + cover_entry.offset) - return self._stream.read(cover_entry.size) - finally: - self._stream.seek(opos) + self._stream.seek(self.content_offset + cover_entry.offset) + return self._stream.read(cover_entry.size) def get_metadata(stream): try: From a48282500fb831dc8019a69068700a98c6d8a90d Mon Sep 17 00:00:00 2001 From: "Marshall T. 
Vandegrift" Date: Wed, 16 Jul 2008 15:00:47 -0400 Subject: [PATCH 03/19] Checkpoint for changing computers --- src/calibre/ebooks/lit/reader.py | 65 ++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 711aef6586..4d149042cc 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -25,20 +25,6 @@ XHTML_DECL = """ "http://openebook.org/dtds/oeb-1.0.1/oebdoc101.dtd"> """ -class DirectoryEntry(object): - def __init__(self, name, section, offset, size): - self.name = name - self.section = section - self.offset = offset - self.size = size - - def __repr__(self): - return "" \ - % (self.name, self.section, self.offset, self.size) - - def __str__(self): - return repr(self) - def u32(bytes): return struct.unpack('' + self.path + return "ManifestItem(internal='%s', path='%s')" \ + % (repr(self.internal), repr(self.path)) def preserve(function): def wrapper(self, *args, **kwargs): @@ -382,6 +382,7 @@ class LitFile(object): return self._stream.read(16) return property(fget=fget) guid = guid() + def header(): @preserve @@ -403,6 +404,19 @@ class LitFile(object): self.read_secondary_header() self.read_header_pieces() + @preserve + def __len__(self): + self._stream.seek(0, 2) + return self._stream.tell() + + @preserve + def _read_raw(self, offset, size): + self._stream.seek(offset) + return self._stream.read(size) + + def _read_content(self, offset, size): + return self._read_raw(self.content_offset + offset, size) + @preserve def read_secondary_header(self): self._stream.seek(self.hdr_len + self.num_pieces*self.PIECE_SIZE) @@ -462,7 +476,7 @@ class LitFile(object): self.piece4_guid = piece def read_directory(self, piece): - self.entries = [] + self.entries = {} if not piece.startswith('IFCM'): raise LitError('Header piece #1 is not main directory.') chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28]) @@ -507,7 +521,7 @@ class LitFile(object): self.read_manifest(entry) elif name == '/meta': self.read_meta(entry) - self.entries.append(entry) + self.entries[name] = entry i += 1 if not hasattr(self, 'sections'): @@ -590,14 +604,17 @@ class LitFile(object): @preserve def read_image(self, internal_name): - cover_entry = None - for entry in self.entries: - if internal_name in entry.name: - cover_entry = entry - break + cover_entry = self.entries[internal_name] self._stream.seek(self.content_offset + cover_entry.offset) return self._stream.read(cover_entry.size) + def get_file(self, name): + entry = self.entries[name] + if entry.section == 0: + return self._read_content(entry.offset, entry.size) + section = self.get_section(entry.section) + return section[entry.offset:entry.offset+entry.size] + def get_metadata(stream): try: litfile = LitFile(stream) From 9cf4508547a499d7174dfb90cabd5945ba3b356d Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Thu, 17 Jul 2008 19:33:30 -0400 Subject: [PATCH 04/19] Checkpoint for switching computers --- src/calibre/ebooks/lit/msdes.py | 481 +++++++++++++++++++++++++++++++ src/calibre/ebooks/lit/mssha1.py | 8 +- src/calibre/ebooks/lit/reader.py | 118 ++++++-- 3 files changed, 579 insertions(+), 28 deletions(-) create mode 100644 src/calibre/ebooks/lit/msdes.py diff --git a/src/calibre/ebooks/lit/msdes.py b/src/calibre/ebooks/lit/msdes.py new file mode 100644 index 0000000000..5bc67b09bb --- /dev/null +++ b/src/calibre/ebooks/lit/msdes.py @@ -0,0 +1,481 @@ +# Re-modified for use in MS LIT decryption. 
Un-reversed the bytebit[] array. +# Substituted Microsoft's absurd modified S-boxes. Modified the encrypt/decrypt +# methods to handle more than one block at a time. +# +# And lo, all the previous notices follow: + +# Modified DES encryption for VNC password authentication. +# Ported from realvnc's java viewer by +# I chose this package name because it is not compatible with the +# original DES algorithm, e.g. found pycrypto. +# +# (C) 2003 chris +# Released as free software under the Python License. +# +# You're free to use it for commercial and noncommercial +# application, modify and redistribute it as long as the +# copyright notices are intact. There are no warranties, not +# even that it does what it says to do ;-) +# +# Original notice following: + +# This DES class has been extracted from package Acme.Crypto for use in VNC. +# The bytebit[] array has been reversed so that the most significant bit +# in each byte of the key is ignored, not the least significant. Also the +# unnecessary odd parity code has been removed. +# +# These changes are: +# Copyright (C) 1999 AT&T Laboratories Cambridge. All Rights Reserved. +# +# This software is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# + +# DesCipher - the DES encryption method +# +# The meat of this code is by Dave Zimmerman , and is: +# +# Copyright (c) 1996 Widget Workshop, Inc. All Rights Reserved. +# +# Permission to use, copy, modify, and distribute this software +# and its documentation for NON-COMMERCIAL or COMMERCIAL purposes and +# without fee is hereby granted, provided that this copyright notice is kept +# intact. +# +# WIDGET WORKSHOP MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY +# OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE, OR NON-INFRINGEMENT. WIDGET WORKSHOP SHALL NOT BE LIABLE +# FOR ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR +# DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. +# +# THIS SOFTWARE IS NOT DESIGNED OR INTENDED FOR USE OR RESALE AS ON-LINE +# CONTROL EQUIPMENT IN HAZARDOUS ENVIRONMENTS REQUIRING FAIL-SAFE +# PERFORMANCE, SUCH AS IN THE OPERATION OF NUCLEAR FACILITIES, AIRCRAFT +# NAVIGATION OR COMMUNICATION SYSTEMS, AIR TRAFFIC CONTROL, DIRECT LIFE +# SUPPORT MACHINES, OR WEAPONS SYSTEMS, IN WHICH THE FAILURE OF THE +# SOFTWARE COULD LEAD DIRECTLY TO DEATH, PERSONAL INJURY, OR SEVERE +# PHYSICAL OR ENVIRONMENTAL DAMAGE ("HIGH RISK ACTIVITIES"). WIDGET WORKSHOP +# SPECIFICALLY DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY OF FITNESS FOR +# HIGH RISK ACTIVITIES. +# +# +# The rest is: +# +# Copyright (C) 1996 by Jef Poskanzer . All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Visit the ACME Labs Java page for up-to-date versions of this and other +# fine Java utilities: http://www.acme.com/java/ + + +#/ The DES encryption method. +#

+# This is surprisingly fast, for pure Java. On a SPARC 20, wrapped +# in Acme.Crypto.EncryptedOutputStream or Acme.Crypto.EncryptedInputStream, +# it does around 7000 bytes/second. +#

+# Most of this code is by Dave Zimmerman , and is +# Copyright (c) 1996 Widget Workshop, Inc. See the source file for details. +#

+# Fetch the software.
+# Fetch the entire Acme package. +#

+# @see Des3Cipher +# @see EncryptedOutputStream +# @see EncryptedInputStream + +import struct + +class DesCipher: + # Constructor, byte-array key. + def __init__(self, key): + self.setKey(key) + + #/ Set the key. + def setKey(self, key): + self.encryptKeys = self.deskey([ord(x) for x in key], 1) + self.decryptKeys = self.deskey([ord(x) for x in key], 0) + + # Turn an 8-byte key into internal keys. + def deskey(self, keyBlock, encrypting): + #~ int i, j, l, m, n; + pc1m = [0]*56 #new int[56]; + pcr = [0]*56 #new int[56]; + kn = [0]*32 #new int[32]; + + for j in range(56): + l = pc1[j] + m = l & 07 + pc1m[j] = ((keyBlock[l >> 3] & bytebit[m]) != 0) + for i in range(16): + if encrypting: + m = i << 1 + else: + m = (15-i) << 1 + n = m + 1 + kn[m] = kn[n] = 0 + for j in range(28): + l = j + totrot[i] + if l < 28: + pcr[j] = pc1m[l] + else: + pcr[j] = pc1m[l - 28] + for j in range(28, 56): + l = j + totrot[i] + if l < 56: + pcr[j] = pc1m[l] + else: + pcr[j] = pc1m[l - 28] + for j in range(24): + if pcr[pc2[j]] != 0: + kn[m] |= bigbyte[j] + if pcr[pc2[j+24]] != 0: + kn[n] |= bigbyte[j] + return self.cookey(kn) + + def cookey(self, raw): + #~ int raw0, raw1; + #~ int rawi, KnLi; + #~ int i; + KnL = [0]*32 + + rawi = 0 + KnLi = 0 + for i in range(16): + raw0 = raw[rawi] + rawi += 1 + raw1 = raw[rawi] + rawi += 1 + KnL[KnLi] = (raw0 & 0x00fc0000L) << 6 + KnL[KnLi] |= (raw0 & 0x00000fc0L) << 10 + KnL[KnLi] |= (raw1 & 0x00fc0000L) >> 10 + KnL[KnLi] |= (raw1 & 0x00000fc0L) >> 6 + KnLi += 1 + KnL[KnLi] = (raw0 & 0x0003f000L) << 12 + KnL[KnLi] |= (raw0 & 0x0000003fL) << 16 + KnL[KnLi] |= (raw1 & 0x0003f000L) >> 4 + KnL[KnLi] |= (raw1 & 0x0000003fL) + KnLi += 1 + return KnL + + # Block encryption routines. + + #/ Encrypt a block of eight bytes. + def encrypt(self, clearText): + if len(clearText) % 8 != 0: + raise TypeError, "length must be multiple of block size" + result = [] + while clearText: + result.append(struct.pack( + ">LL", *self.des(struct.unpack(">LL", clearText[:8]), + self.encryptKeys))) + clearText = clearText[8:] + return ''.join(result) + + #/ Decrypt a block of eight bytes. + def decrypt(self, cipherText): + if len(cipherText) % 8 != 0: + raise TypeError, "length must be multiple of block size" + result = [] + while cipherText: + result.append(struct.pack( + ">LL", *self.des(struct.unpack(">LL", cipherText[:8]), + self.decryptKeys))) + cipherText = cipherText[8:] + return ''.join(result) + + # The DES function. 
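A note on the encrypt()/decrypt() methods just above (the des() round function they call follows below): both walk their input eight bytes at a time and refuse anything that is not a multiple of the block size, which is why the reader's calculate_deskey(), later in this patch, folds its hash down to exactly eight key bytes. A short round-trip sketch, assuming the module is importable the same way the reader imports it; the key is the one from this file's own __main__ test and the plaintext is simply two blocks long:

    # Round-trip sketch for the multi-block encrypt()/decrypt() defined above.
    # Assumes calibre.ebooks.lit.msdes is importable, as in the reader patch below.
    import calibre.ebooks.lit.msdes as msdes

    key = "\x01\x23\x45\x67\x89\xab\xcd\xef"   # setKey() expects exactly 8 bytes
    des = msdes.new(key)

    plain = "Now is the time!"                 # 16 bytes = two 8-byte DES blocks
    cipher = des.encrypt(plain)
    assert len(cipher) == len(plain)           # block cipher, no length change
    assert des.decrypt(cipher) == plain        # decrypt keys are the reversed schedule
    # Inputs that are not a multiple of 8 bytes raise TypeError, so callers pad first.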
+ def des(self, (leftt, right), keys): + #~ int fval, work, right, leftt; + #~ int round + keysi = 0 + + work = ((leftt >> 4) ^ right) & 0x0f0f0f0fL + right ^= work + leftt ^= (work << 4) & 0xffffffffL + + work = ((leftt >> 16) ^ right) & 0x0000ffffL + right ^= work + leftt ^= (work << 16) & 0xffffffffL + + work = ((right >> 2) ^ leftt) & 0x33333333L + leftt ^= work + right ^= (work << 2) & 0xffffffffL + + work = ((right >> 8) ^ leftt) & 0x00ff00ffL + leftt ^= work + right ^= (work << 8) & 0xffffffffL + right = ((right << 1) | ((right >> 31) & 1)) & 0xffffffffL + + work = (leftt ^ right) & 0xaaaaaaaaL + leftt ^= work + right ^= work + leftt = ((leftt << 1) | ((leftt >> 31) & 1)) & 0xffffffffL + + for round in range(8): + work = ((right << 28) | (right >> 4)) & 0xffffffffL + work ^= keys[keysi] + keysi += 1 + fval = SP7[ work & 0x0000003fL ] + fval |= SP5[(work >> 8) & 0x0000003fL ] + fval |= SP3[(work >> 16) & 0x0000003fL ] + fval |= SP1[(work >> 24) & 0x0000003fL ] + work = right ^ keys[keysi] + keysi += 1 + fval |= SP8[ work & 0x0000003fL ] + fval |= SP6[(work >> 8) & 0x0000003fL ] + fval |= SP4[(work >> 16) & 0x0000003fL ] + fval |= SP2[(work >> 24) & 0x0000003fL ] + leftt ^= fval + work = ((leftt << 28) | (leftt >> 4)) & 0xffffffffL + work ^= keys[keysi] + keysi += 1 + fval = SP7[ work & 0x0000003fL ] + fval |= SP5[(work >> 8) & 0x0000003fL ] + fval |= SP3[(work >> 16) & 0x0000003fL ] + fval |= SP1[(work >> 24) & 0x0000003fL ] + work = leftt ^ keys[keysi] + keysi += 1 + fval |= SP8[ work & 0x0000003fL ] + fval |= SP6[(work >> 8) & 0x0000003fL ] + fval |= SP4[(work >> 16) & 0x0000003fL ] + fval |= SP2[(work >> 24) & 0x0000003fL ] + right ^= fval + + right = ((right << 31) | (right >> 1)) & 0xffffffffL + work = (leftt ^ right) & 0xaaaaaaaaL + leftt ^= work + right ^= work + leftt = ((leftt << 31) | (leftt >> 1)) & 0xffffffffL + work = ((leftt >> 8) ^ right) & 0x00ff00ffL + right ^= work + leftt ^= (work << 8) & 0xffffffffL + work = ((leftt >> 2) ^ right) & 0x33333333L + right ^= work + leftt ^= (work << 2) & 0xffffffffL + work = ((right >> 16) ^ leftt) & 0x0000ffffL + leftt ^= work + right ^= (work << 16) & 0xffffffffL + work = ((right >> 4) ^ leftt) & 0x0f0f0f0fL + leftt ^= work + right ^= (work << 4) & 0xffffffffL + return right, leftt + +# Tables, permutations, S-boxes, etc. 
+ +bytebit = [0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01] + +bigbyte = [ + 0x800000, 0x400000, 0x200000, 0x100000, + 0x080000, 0x040000, 0x020000, 0x010000, + 0x008000, 0x004000, 0x002000, 0x001000, + 0x000800, 0x000400, 0x000200, 0x000100, + 0x000080, 0x000040, 0x000020, 0x000010, + 0x000008, 0x000004, 0x000002, 0x000001 +] + +pc1 = [ + 56, 48, 40, 32, 24, 16, 8, + 0, 57, 49, 41, 33, 25, 17, + 9, 1, 58, 50, 42, 34, 26, + 18, 10, 2, 59, 51, 43, 35, + 62, 54, 46, 38, 30, 22, 14, + 6, 61, 53, 45, 37, 29, 21, + 13, 5, 60, 52, 44, 36, 28, + 20, 12, 4, 27, 19, 11, 3 +] + +totrot = [ + 1, 2, 4, 6, 8, 10, 12, 14, 15, 17, 19, 21, 23, 25, 27, 28 +] + +pc2 = [ + 13, 16, 10, 23, 0, 4, + 2, 27, 14, 5, 20, 9, + 22, 18, 11, 3 , 25, 7, + 15, 6, 26, 19, 12, 1, + 40, 51, 30, 36, 46, 54, + 29, 39, 50, 44, 32, 47, + 43, 48, 38, 55, 33, 52, + 45, 41, 49, 35, 28, 31, +] + +SP1 = [ +0x02080800L, 0x00080000L, 0x02000002L, 0x02080802L, +0x02000000L, 0x00080802L, 0x00080002L, 0x02000002L, +0x00080802L, 0x02080800L, 0x02080000L, 0x00000802L, +0x02000802L, 0x02000000L, 0x00000000L, 0x00080002L, +0x00080000L, 0x00000002L, 0x02000800L, 0x00080800L, +0x02080802L, 0x02080000L, 0x00000802L, 0x02000800L, +0x00000002L, 0x00000800L, 0x00080800L, 0x02080002L, +0x00000800L, 0x02000802L, 0x02080002L, 0x00000000L, +0x00000000L, 0x02080802L, 0x02000800L, 0x00080002L, +0x02080800L, 0x00080000L, 0x00000802L, 0x02000800L, +0x02080002L, 0x00000800L, 0x00080800L, 0x02000002L, +0x00080802L, 0x00000002L, 0x02000002L, 0x02080000L, +0x02080802L, 0x00080800L, 0x02080000L, 0x02000802L, +0x02000000L, 0x00000802L, 0x00080002L, 0x00000000L, +0x00080000L, 0x02000000L, 0x02000802L, 0x02080800L, +0x00000002L, 0x02080002L, 0x00000800L, 0x00080802L +] +SP2 = [ +0x40108010L, 0x00000000L, 0x00108000L, 0x40100000L, +0x40000010L, 0x00008010L, 0x40008000L, 0x00108000L, +0x00008000L, 0x40100010L, 0x00000010L, 0x40008000L, +0x00100010L, 0x40108000L, 0x40100000L, 0x00000010L, +0x00100000L, 0x40008010L, 0x40100010L, 0x00008000L, +0x00108010L, 0x40000000L, 0x00000000L, 0x00100010L, +0x40008010L, 0x00108010L, 0x40108000L, 0x40000010L, +0x40000000L, 0x00100000L, 0x00008010L, 0x40108010L, +0x00100010L, 0x40108000L, 0x40008000L, 0x00108010L, +0x40108010L, 0x00100010L, 0x40000010L, 0x00000000L, +0x40000000L, 0x00008010L, 0x00100000L, 0x40100010L, +0x00008000L, 0x40000000L, 0x00108010L, 0x40008010L, +0x40108000L, 0x00008000L, 0x00000000L, 0x40000010L, +0x00000010L, 0x40108010L, 0x00108000L, 0x40100000L, +0x40100010L, 0x00100000L, 0x00008010L, 0x40008000L, +0x40008010L, 0x00000010L, 0x40100000L, 0x00108000L +] +SP3 = [ +0x04000001L, 0x04040100L, 0x00000100L, 0x04000101L, +0x00040001L, 0x04000000L, 0x04000101L, 0x00040100L, +0x04000100L, 0x00040000L, 0x04040000L, 0x00000001L, +0x04040101L, 0x00000101L, 0x00000001L, 0x04040001L, +0x00000000L, 0x00040001L, 0x04040100L, 0x00000100L, +0x00000101L, 0x04040101L, 0x00040000L, 0x04000001L, +0x04040001L, 0x04000100L, 0x00040101L, 0x04040000L, +0x00040100L, 0x00000000L, 0x04000000L, 0x00040101L, +0x04040100L, 0x00000100L, 0x00000001L, 0x00040000L, +0x00000101L, 0x00040001L, 0x04040000L, 0x04000101L, +0x00000000L, 0x04040100L, 0x00040100L, 0x04040001L, +0x00040001L, 0x04000000L, 0x04040101L, 0x00000001L, +0x00040101L, 0x04000001L, 0x04000000L, 0x04040101L, +0x00040000L, 0x04000100L, 0x04000101L, 0x00040100L, +0x04000100L, 0x00000000L, 0x04040001L, 0x00000101L, +0x04000001L, 0x00040101L, 0x00000100L, 0x04040000L +] +SP4 = [ +0x00401008L, 0x10001000L, 0x00000008L, 0x10401008L, +0x00000000L, 0x10400000L, 0x10001008L, 
0x00400008L, +0x10401000L, 0x10000008L, 0x10000000L, 0x00001008L, +0x10000008L, 0x00401008L, 0x00400000L, 0x10000000L, +0x10400008L, 0x00401000L, 0x00001000L, 0x00000008L, +0x00401000L, 0x10001008L, 0x10400000L, 0x00001000L, +0x00001008L, 0x00000000L, 0x00400008L, 0x10401000L, +0x10001000L, 0x10400008L, 0x10401008L, 0x00400000L, +0x10400008L, 0x00001008L, 0x00400000L, 0x10000008L, +0x00401000L, 0x10001000L, 0x00000008L, 0x10400000L, +0x10001008L, 0x00000000L, 0x00001000L, 0x00400008L, +0x00000000L, 0x10400008L, 0x10401000L, 0x00001000L, +0x10000000L, 0x10401008L, 0x00401008L, 0x00400000L, +0x10401008L, 0x00000008L, 0x10001000L, 0x00401008L, +0x00400008L, 0x00401000L, 0x10400000L, 0x10001008L, +0x00001008L, 0x10000000L, 0x10000008L, 0x10401000L +] +SP5 = [ +0x08000000L, 0x00010000L, 0x00000400L, 0x08010420L, +0x08010020L, 0x08000400L, 0x00010420L, 0x08010000L, +0x00010000L, 0x00000020L, 0x08000020L, 0x00010400L, +0x08000420L, 0x08010020L, 0x08010400L, 0x00000000L, +0x00010400L, 0x08000000L, 0x00010020L, 0x00000420L, +0x08000400L, 0x00010420L, 0x00000000L, 0x08000020L, +0x00000020L, 0x08000420L, 0x08010420L, 0x00010020L, +0x08010000L, 0x00000400L, 0x00000420L, 0x08010400L, +0x08010400L, 0x08000420L, 0x00010020L, 0x08010000L, +0x00010000L, 0x00000020L, 0x08000020L, 0x08000400L, +0x08000000L, 0x00010400L, 0x08010420L, 0x00000000L, +0x00010420L, 0x08000000L, 0x00000400L, 0x00010020L, +0x08000420L, 0x00000400L, 0x00000000L, 0x08010420L, +0x08010020L, 0x08010400L, 0x00000420L, 0x00010000L, +0x00010400L, 0x08010020L, 0x08000400L, 0x00000420L, +0x00000020L, 0x00010420L, 0x08010000L, 0x08000020L +] +SP6 = [ +0x80000040L, 0x00200040L, 0x00000000L, 0x80202000L, +0x00200040L, 0x00002000L, 0x80002040L, 0x00200000L, +0x00002040L, 0x80202040L, 0x00202000L, 0x80000000L, +0x80002000L, 0x80000040L, 0x80200000L, 0x00202040L, +0x00200000L, 0x80002040L, 0x80200040L, 0x00000000L, +0x00002000L, 0x00000040L, 0x80202000L, 0x80200040L, +0x80202040L, 0x80200000L, 0x80000000L, 0x00002040L, +0x00000040L, 0x00202000L, 0x00202040L, 0x80002000L, +0x00002040L, 0x80000000L, 0x80002000L, 0x00202040L, +0x80202000L, 0x00200040L, 0x00000000L, 0x80002000L, +0x80000000L, 0x00002000L, 0x80200040L, 0x00200000L, +0x00200040L, 0x80202040L, 0x00202000L, 0x00000040L, +0x80202040L, 0x00202000L, 0x00200000L, 0x80002040L, +0x80000040L, 0x80200000L, 0x00202040L, 0x00000000L, +0x00002000L, 0x80000040L, 0x80002040L, 0x80202000L, +0x80200000L, 0x00002040L, 0x00000040L, 0x80200040L, +] +SP7 = [ +0x00004000L, 0x00000200L, 0x01000200L, 0x01000004L, +0x01004204L, 0x00004004L, 0x00004200L, 0x00000000L, +0x01000000L, 0x01000204L, 0x00000204L, 0x01004000L, +0x00000004L, 0x01004200L, 0x01004000L, 0x00000204L, +0x01000204L, 0x00004000L, 0x00004004L, 0x01004204L, +0x00000000L, 0x01000200L, 0x01000004L, 0x00004200L, +0x01004004L, 0x00004204L, 0x01004200L, 0x00000004L, +0x00004204L, 0x01004004L, 0x00000200L, 0x01000000L, +0x00004204L, 0x01004000L, 0x01004004L, 0x00000204L, +0x00004000L, 0x00000200L, 0x01000000L, 0x01004004L, +0x01000204L, 0x00004204L, 0x00004200L, 0x00000000L, +0x00000200L, 0x01000004L, 0x00000004L, 0x01000200L, +0x00000000L, 0x01000204L, 0x01000200L, 0x00004200L, +0x00000204L, 0x00004000L, 0x01004204L, 0x01000000L, +0x01004200L, 0x00000004L, 0x00004004L, 0x01004204L, +0x01000004L, 0x01004200L, 0x01004000L, 0x00004004L, +] +SP8 = [ +0x20800080L, 0x20820000L, 0x00020080L, 0x00000000L, +0x20020000L, 0x00800080L, 0x20800000L, 0x20820080L, +0x00000080L, 0x20000000L, 0x00820000L, 0x00020080L, +0x00820080L, 0x20020080L, 0x20000080L, 
0x20800000L, +0x00020000L, 0x00820080L, 0x00800080L, 0x20020000L, +0x20820080L, 0x20000080L, 0x00000000L, 0x00820000L, +0x20000000L, 0x00800000L, 0x20020080L, 0x20800080L, +0x00800000L, 0x00020000L, 0x20820000L, 0x00000080L, +0x00800000L, 0x00020000L, 0x20000080L, 0x20820080L, +0x00020080L, 0x20000000L, 0x00000000L, 0x00820000L, +0x20800080L, 0x20020080L, 0x20020000L, 0x00800080L, +0x20820000L, 0x00000080L, 0x00800080L, 0x20020000L, +0x20820080L, 0x00800000L, 0x20800000L, 0x20000080L, +0x00820000L, 0x00020080L, 0x20020080L, 0x20800000L, +0x00000080L, 0x20820000L, 0x00820080L, 0x00000000L, +0x20000000L, 0x20800080L, 0x00020000L, 0x00820080L, +] + +def new(key): + return DesCipher(key) + +block_size = 8 +key_size = 8 + +#test only: +if __name__ == '__main__': + des = DesCipher("\x01\x23\x45\x67\x89\xab\xcd\xef") + print ''.join( + "%02x" % ord(x) for x in des.encrypt("Now is t")) + diff --git a/src/calibre/ebooks/lit/mssha1.py b/src/calibre/ebooks/lit/mssha1.py index f6f7c33444..d61bd39094 100644 --- a/src/calibre/ebooks/lit/mssha1.py +++ b/src/calibre/ebooks/lit/mssha1.py @@ -123,7 +123,7 @@ K = [ 0xCA62C1D6L # (60 <= t <= 79) ] -class sha: +class mssha1(object): "An implementation of the MD5 hash function in pure Python." def __init__(self): @@ -186,7 +186,7 @@ class sha: def update(self, inBuf): """Add to the current message. - Update the sha object with the string arg. Repeated calls + Update the mssha1 object with the string arg. Repeated calls are equivalent to a single call with the concatenation of all the arguments, i.e. s.update(a); s.update(b) is equivalent to s.update(a+b). @@ -308,12 +308,12 @@ digest_size = digestsize = 20 blocksize = 1 def new(arg=None): - """Return a new sha crypto object. + """Return a new mssha1 crypto object. If arg is present, the method call update(arg) is made. 
""" - crypto = sha() + crypto = mssha1() if arg: crypto.update(arg) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 4d149042cc..2608d63399 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -13,6 +13,8 @@ from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf import OPFReader from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP +import calibre.ebooks.lit.mssha1 as mssha1 +import calibre.ebooks.lit.msdes as msdes OPF_DECL = """" "http://openebook.org/dtds/oeb-1.0.1/oebdoc101.dtd"> """ +DESENCRYPT_GUID = "{67F6E4A2-60BF-11D3-8540-00C04F58C3CF}" +LZXCOMPRESS_GUID = "{0A9007C6-4076-11D3-8789-0000F8105754}" + def u32(bytes): return struct.unpack(' len(raw): raise LitError('Invalid Namelist section') - self.sections[section] = raw[pos:pos+size].decode('utf-16-le') - pos += size + self.section_names[section] = \ + raw[pos:pos+size].decode('utf-16-le').rstrip('\000') + pos += size - @preserve def read_manifest(self, entry): self.manifest = [] - self._stream.seek(self.content_offset + entry.offset) - raw = self._stream.read(entry.size) + raw = self._read_content(entry.offset, entry.size) pos = 0 while pos < len(raw): size = ord(raw[pos]) @@ -595,19 +598,52 @@ class LitFile(object): offset, root, state)) i += 1 - @preserve def read_meta(self, entry): - self._stream.seek(self.content_offset + entry.offset) - raw = self._stream.read(entry.size) + raw = self._read_content(entry.offset, entry.size) xml = OPF_DECL + unicode(UnBinary(raw, self.manifest)) self.meta = xml - @preserve - def read_image(self, internal_name): - cover_entry = self.entries[internal_name] - self._stream.seek(self.content_offset + cover_entry.offset) - return self._stream.read(cover_entry.size) + def read_drm(self): + def exists_file(name): + try: self.get_file(name) + except KeyError: return False + return True + self.drmlevel = 0 + if exists_file('/DRMStorage/Licenses/EUL'): + self.drmlevel = 5 + elif exists_file('/DRMStorage/DRMBookplate'): + self.drmlevel = 3 + elif exists_file('/DRMStorage/DRMSealed'): + self.drmlevel = 1 + else: + return + des = msdes.new(self.calculate_deskey()) + bookkey = des.decrypt(self.get_file('/DRMStorage/DRMSealed')) + if bookkey[0] != '\000': + raise LitError('Unable to decrypt title key!') + self.bookkey = bookkey[1:9] + def calculate_deskey(self): + hashfiles = ['/meta', '/DRMStorage/DRMSource'] + if self.drmlevel == 3: + hashfiles.append('/DRMStorage/DRMBookplate') + prepad = 2 + hash = mssha1.new() + for name in hashfiles: + data = self.get_file(name) + if prepad > 0: + data = ("\000" * prepad) + data + prepad = 0 + postpad = 64 - (len(data) % 64) + if postpad < 64: + data = data + ("\000" * postpad) + hash.update(data) + digest = hash.digest() + key = [0] * 8 + for i in xrange(0, len(digest)): + key[i % 8] ^= ord(digest[i]) + return ''.join(chr(x) for x in key) + def get_file(self, name): entry = self.entries[name] if entry.section == 0: @@ -615,6 +651,40 @@ class LitFile(object): section = self.get_section(entry.section) return section[entry.offset:entry.offset+entry.size] + def get_section(self, section): + data = self.section_data[section] + if not data: + data = self._get_section(section) + self.section_data[section] = data + return data + + def _get_section(self, section): + name = self.section_names[section] + path = '::DataSpace/Storage/' + name + transform = self.get_file(path + '/Transform/List') + content = self.get_file(path + '/Content') 
+ control = self.get_file(path + '/ControlData') + idx_transform = idx_control = 0 + while (len(transform) - idx_transform) >= 16: + ndwords = int32(control[idx_control:]) + 1 + if (idx_control + (ndwords * 4)) > len(control) or ndwords <= 0: + raise LitError("ControlData is too short") + guid = msguid(transform[idx_transform:]) + if guid == DESENCRYPT_GUID: + content = self._decrypt(content) + idx_control += ndwords * 4 + elif guid == LZXCOMPRESS_GUID: + raise LitError("LZX decompression not implemented") + else: + raise LitError("Unrecognized transform: %s." % repr(guid)) + idx_transform += 16 + return content + + def _decrypt(self, content): + if self.drmlevel == 5: + raise LitError('Cannot extract content from a DRM protected ebook') + return msdes.new(self.bookkey).decrypt(content) + def get_metadata(stream): try: litfile = LitFile(stream) @@ -632,7 +702,7 @@ def get_metadata(stream): ext = 'jpg' else: ext = ext.lower() - cd = litfile.read_image(cover_item) + cd = litfile.get_file(cover_item) mi.cover_data = (ext, cd) if cd else (None, None) except: title = stream.name if hasattr(stream, 'name') and stream.name else 'Unknown' From 4eeae13b3508d743fcb2f007fe3b352b87c9acc5 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Thu, 17 Jul 2008 23:14:59 -0400 Subject: [PATCH 05/19] Checkpoint before sleep --- src/calibre/ebooks/lit/lzxd.py | 138 +++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 src/calibre/ebooks/lit/lzxd.py diff --git a/src/calibre/ebooks/lit/lzxd.py b/src/calibre/ebooks/lit/lzxd.py new file mode 100644 index 0000000000..a09daf012b --- /dev/null +++ b/src/calibre/ebooks/lit/lzxd.py @@ -0,0 +1,138 @@ +import copy + +# some constants defined by the LZX specification +MIN_MATCH = 2 +MAX_MATCH = 257 +NUM_CHARS = 256 +BLOCKTYPE_INVALID = 0 # also blocktypes 4-7 invalid +BLOCKTYPE_VERBATIM = 1 +BLOCKTYPE_ALIGNED = 2 +BLOCKTYPE_UNCOMPRESSED = 3 +PRETREE_NUM_ELEMENTS = 20 +ALIGNED_NUM_ELEMENTS = 8 # aligned offset tree #elements +NUM_PRIMARY_LENGTHS = 7 # this one missing from spec! 
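Back in reader.py, calculate_deskey() (added earlier in this patch) feeds /meta, /DRMStorage/DRMSource and, at DRM level 3, the bookplate through the modified SHA-1: the first file gets two leading zero bytes, every file is zero-padded out to a 64-byte boundary, and the resulting 20-byte digest is XOR-folded into the 8-byte DES key that unseals /DRMStorage/DRMSealed. A compact restatement of just the fold, assuming a digest string is already in hand:

    # Restatement of the digest -> 8-byte key fold inside calculate_deskey().
    # 'digest' stands in for the 20-byte output of calibre.ebooks.lit.mssha1.
    def fold_digest_to_deskey(digest):
        key = [0] * 8
        for i in xrange(len(digest)):
            key[i % 8] ^= ord(digest[i])       # XOR each digest byte into one of 8 slots
        return ''.join(chr(x) for x in key)

    assert fold_digest_to_deskey('\x01' * 8 + '\x00' * 12) == '\x01' * 8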
+NUM_SECONDARY_LENGTHS = 249 # length tree #elements + +# LZX huffman defines: tweak tablebits as desired +PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS +PRETREE_TABLEBITS = 6 +MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50*8 +MAINTREE_TABLEBITS = 12 +LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS+1 +LENGTH_TABLEBITS = 12 +ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS +ALIGNED_TABLEBITS = 7 +LENTABLE_SAFETY = 64 # table decoding overruns are allowed + +FRAME_SIZE = 32768 # the size of a frame in LZX + + +class BitReader(object): + def __init__(self, data): + self.data, self.pos, self.nbits = \ + data + "\x00\x00\x00\x00", 0, len(data) * 8 + + def peek(self, n): + r, g = 0, 0 + while g < n: + r = (r << 8) | ord(self.data[(self.pos + g) >> 3]) + g = g + 8 - ((self.pos + g) & 7) + return (r >> (g - n)) & ((1 << n) - 1) + + def remove(self, n): + self.pos += n + return self.pos <= self.nbits + + def left(self): + return self.nbits - self.pos + + def read(self, n): + val = self.peek(n) + self.remove(n) + return val + +class LzxError(Exception): + pass + +POSITION_BASE = [0]*51 +EXTRA_BITS = [0]*51 + +def _static_init(): + j = 0 + for i in xrange(0, 51, 2): + EXTRA_BITS[i] = j + EXTRA_BITS[i + 1] = j + if i != 0 or j < 17): j += 1 + j = 0 + for i in xrange(0, 51, 1): + POSITION_BASE[i] = j + j += 1 << extra_bits[i] +_static_init() + +class LzxDecompressor(object): + def __init__(self, window_bits, reset_interval=0x7fff): + # LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) + if window_bits < 15 or window_bits > 21: + raise LzxError("Invalid window size") + + self.window_size = 1 << window_bits + self.window_posn = 0 + self.frame_posn = 0 + self.frame = 0 + self.reset_interval = reset_interval + self.intel_filesize = 0 + self.intel_curpos = 0 + + # window bits: 15 16 17 18 19 20 21 + # position slots: 30 32 34 36 38 42 50 + self.posn_solts = 50 if window_bits == 21 \ + else 42 if window_bits == 20 else window_bits << 1 + self.intel_started = 0 + self.input_end = 0 + + # huffman code lengths + self.PRETREE_len = [0] * (PRETREE_MAXSYMBOLS + LENTABLE_SAFETY) + self.MAINTREE_len = [0] * (MAINTREE_MAXSYMBOLS + LENTABLE_SAFETY) + self.LENGTH_len = [0] * (LENGTH_MAXSYMBOLS + LENTABLE_SAFETY) + self.ALIGNED_len = [0] * (ALIGNED_MAXSYMBOLS + LENTABLE_SAFETY) + + # huffman decoding tables + self.PRETREE_table = \ + [0] * ((1 << PRETREE_TABLEBITS) + (PRETREE_MAXSYMBOLS * 2)) + self.MAINTREE_table = \ + [0] * ((1 << MAINTREE_TABLEBITS) + (MAINTREE_MAXSYMBOLS * 2)) + self.LENGTH_table = \ + [0] * ((1 << LENGTH_TABLEBITS) + (LENGTH_MAXSYMBOLS * 2)) + self.ALIGNED_table = \ + [0] * ((1 << ALIGNED_TABLEBITS) + (ALIGNED_MAXSYMBOLS * 2)) + + self.o_buf = self.i_buf = '' + + self._reset_state() + + def _reset_state(self): + self.R0 = 1 + self.R1 = 1 + self.R2 = 1 + self.header_read = 0 + self.block_remaining = 0 + self.block_type = BLOCKTYPE_INVALID + + # initialise tables to 0 (because deltas will be applied to them) + for i in xrange(MAINTREE_MAXSYMBOLS): self.MAINTREE_len[i] = 0 + for i in xrange(LENGTH_MAXSYMBOLS): self.LENGTH_len[i] = 0 + + def decompress(self, data, out_bytes): + return ''.join(self._decompress(data, out_bytes)) + + def _decompress(self, data, out_bytes): + # easy answers + if out_bytes < 0: + raise LzxError('Negative desired output bytes') + + # Initialize input and output + input = BitReader(data) + output = [] + + + From 11c6b0a44d6c819634594eb538d3d4feff7632fe Mon Sep 17 00:00:00 2001 From: "Marshall T. 
Vandegrift" Date: Fri, 18 Jul 2008 00:15:13 -0400 Subject: [PATCH 06/19] Fixed trailing space issue --- src/calibre/ebooks/lrf/html/convert_from.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index 15eede6d6c..17ffd05ee2 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -222,6 +222,7 @@ class HTMLConverter(object, LoggingInterface): self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported self.tops = {} #: element representing the top of each HTML file in the LRF file self.previous_text = '' #: Used to figure out when to lstrip + self.stripped_space = '' self.preserve_block_style = False #: Used so that

tags in

elements are handled properly self.avoid_page_break = False self.current_page = book.create_page() @@ -864,11 +865,15 @@ class HTMLConverter(object, LoggingInterface): if collapse_whitespace: src = re.sub(r'\s{1,}', ' ', src) + if self.stripped_space and len(src) == len(src.lstrip(u' \n\r\t')): + src = self.stripped_space + src + src, orig = src.rstrip(u' \n\r\t'), src + self.stripped_space = orig[len(src):] if len(self.previous_text) != len(self.previous_text.rstrip(u' \n\r\t')): src = src.lstrip(u' \n\r\t') if len(src): self.previous_text = src - append_text(src) + append_text(src) else: srcs = src.split('\n') for src in srcs[:-1]: From bc6f3ab5de22ca0fdb70369e54c081f01b78e2fa Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Fri, 18 Jul 2008 00:20:01 -0400 Subject: [PATCH 07/19] Reverted incorrect branch change --- src/calibre/ebooks/lrf/html/convert_from.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index 17ffd05ee2..15eede6d6c 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -222,7 +222,6 @@ class HTMLConverter(object, LoggingInterface): self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported self.tops = {} #: element representing the top of each HTML file in the LRF file self.previous_text = '' #: Used to figure out when to lstrip - self.stripped_space = '' self.preserve_block_style = False #: Used so that

tags in

elements are handled properly self.avoid_page_break = False self.current_page = book.create_page() @@ -865,15 +864,11 @@ class HTMLConverter(object, LoggingInterface): if collapse_whitespace: src = re.sub(r'\s{1,}', ' ', src) - if self.stripped_space and len(src) == len(src.lstrip(u' \n\r\t')): - src = self.stripped_space + src - src, orig = src.rstrip(u' \n\r\t'), src - self.stripped_space = orig[len(src):] if len(self.previous_text) != len(self.previous_text.rstrip(u' \n\r\t')): src = src.lstrip(u' \n\r\t') if len(src): self.previous_text = src - append_text(src) + append_text(src) else: srcs = src.split('\n') for src in srcs[:-1]: From 1e78860f4f3b414a70cfdc04b0dcb1435fea22f8 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Fri, 18 Jul 2008 16:34:41 -0400 Subject: [PATCH 08/19] Switched LZX to C extension --- src/calibre/ebooks/lit/lzxd.py | 138 --- src/calibre/utils/lzx-setup.py | 5 + src/calibre/utils/lzx/lzx.h | 169 ++++ src/calibre/utils/lzx/lzxd.c | 905 ++++++++++++++++++ src/calibre/utils/lzx/lzxglue.c | 172 ++++ src/calibre/utils/lzx/lzxmodule.c | 206 ++++ src/calibre/utils/lzx/mspack.h | 1482 +++++++++++++++++++++++++++++ src/calibre/utils/lzx/system.h | 66 ++ 8 files changed, 3005 insertions(+), 138 deletions(-) delete mode 100644 src/calibre/ebooks/lit/lzxd.py create mode 100644 src/calibre/utils/lzx-setup.py create mode 100644 src/calibre/utils/lzx/lzx.h create mode 100644 src/calibre/utils/lzx/lzxd.c create mode 100644 src/calibre/utils/lzx/lzxglue.c create mode 100644 src/calibre/utils/lzx/lzxmodule.c create mode 100644 src/calibre/utils/lzx/mspack.h create mode 100644 src/calibre/utils/lzx/system.h diff --git a/src/calibre/ebooks/lit/lzxd.py b/src/calibre/ebooks/lit/lzxd.py deleted file mode 100644 index a09daf012b..0000000000 --- a/src/calibre/ebooks/lit/lzxd.py +++ /dev/null @@ -1,138 +0,0 @@ -import copy - -# some constants defined by the LZX specification -MIN_MATCH = 2 -MAX_MATCH = 257 -NUM_CHARS = 256 -BLOCKTYPE_INVALID = 0 # also blocktypes 4-7 invalid -BLOCKTYPE_VERBATIM = 1 -BLOCKTYPE_ALIGNED = 2 -BLOCKTYPE_UNCOMPRESSED = 3 -PRETREE_NUM_ELEMENTS = 20 -ALIGNED_NUM_ELEMENTS = 8 # aligned offset tree #elements -NUM_PRIMARY_LENGTHS = 7 # this one missing from spec! 
-NUM_SECONDARY_LENGTHS = 249 # length tree #elements - -# LZX huffman defines: tweak tablebits as desired -PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS -PRETREE_TABLEBITS = 6 -MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50*8 -MAINTREE_TABLEBITS = 12 -LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS+1 -LENGTH_TABLEBITS = 12 -ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS -ALIGNED_TABLEBITS = 7 -LENTABLE_SAFETY = 64 # table decoding overruns are allowed - -FRAME_SIZE = 32768 # the size of a frame in LZX - - -class BitReader(object): - def __init__(self, data): - self.data, self.pos, self.nbits = \ - data + "\x00\x00\x00\x00", 0, len(data) * 8 - - def peek(self, n): - r, g = 0, 0 - while g < n: - r = (r << 8) | ord(self.data[(self.pos + g) >> 3]) - g = g + 8 - ((self.pos + g) & 7) - return (r >> (g - n)) & ((1 << n) - 1) - - def remove(self, n): - self.pos += n - return self.pos <= self.nbits - - def left(self): - return self.nbits - self.pos - - def read(self, n): - val = self.peek(n) - self.remove(n) - return val - -class LzxError(Exception): - pass - -POSITION_BASE = [0]*51 -EXTRA_BITS = [0]*51 - -def _static_init(): - j = 0 - for i in xrange(0, 51, 2): - EXTRA_BITS[i] = j - EXTRA_BITS[i + 1] = j - if i != 0 or j < 17): j += 1 - j = 0 - for i in xrange(0, 51, 1): - POSITION_BASE[i] = j - j += 1 << extra_bits[i] -_static_init() - -class LzxDecompressor(object): - def __init__(self, window_bits, reset_interval=0x7fff): - # LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) - if window_bits < 15 or window_bits > 21: - raise LzxError("Invalid window size") - - self.window_size = 1 << window_bits - self.window_posn = 0 - self.frame_posn = 0 - self.frame = 0 - self.reset_interval = reset_interval - self.intel_filesize = 0 - self.intel_curpos = 0 - - # window bits: 15 16 17 18 19 20 21 - # position slots: 30 32 34 36 38 42 50 - self.posn_solts = 50 if window_bits == 21 \ - else 42 if window_bits == 20 else window_bits << 1 - self.intel_started = 0 - self.input_end = 0 - - # huffman code lengths - self.PRETREE_len = [0] * (PRETREE_MAXSYMBOLS + LENTABLE_SAFETY) - self.MAINTREE_len = [0] * (MAINTREE_MAXSYMBOLS + LENTABLE_SAFETY) - self.LENGTH_len = [0] * (LENGTH_MAXSYMBOLS + LENTABLE_SAFETY) - self.ALIGNED_len = [0] * (ALIGNED_MAXSYMBOLS + LENTABLE_SAFETY) - - # huffman decoding tables - self.PRETREE_table = \ - [0] * ((1 << PRETREE_TABLEBITS) + (PRETREE_MAXSYMBOLS * 2)) - self.MAINTREE_table = \ - [0] * ((1 << MAINTREE_TABLEBITS) + (MAINTREE_MAXSYMBOLS * 2)) - self.LENGTH_table = \ - [0] * ((1 << LENGTH_TABLEBITS) + (LENGTH_MAXSYMBOLS * 2)) - self.ALIGNED_table = \ - [0] * ((1 << ALIGNED_TABLEBITS) + (ALIGNED_MAXSYMBOLS * 2)) - - self.o_buf = self.i_buf = '' - - self._reset_state() - - def _reset_state(self): - self.R0 = 1 - self.R1 = 1 - self.R2 = 1 - self.header_read = 0 - self.block_remaining = 0 - self.block_type = BLOCKTYPE_INVALID - - # initialise tables to 0 (because deltas will be applied to them) - for i in xrange(MAINTREE_MAXSYMBOLS): self.MAINTREE_len[i] = 0 - for i in xrange(LENGTH_MAXSYMBOLS): self.LENGTH_len[i] = 0 - - def decompress(self, data, out_bytes): - return ''.join(self._decompress(data, out_bytes)) - - def _decompress(self, data, out_bytes): - # easy answers - if out_bytes < 0: - raise LzxError('Negative desired output bytes') - - # Initialize input and output - input = BitReader(data) - output = [] - - - diff --git a/src/calibre/utils/lzx-setup.py b/src/calibre/utils/lzx-setup.py new file mode 100644 index 0000000000..87e523b9c3 --- /dev/null +++ 
b/src/calibre/utils/lzx-setup.py @@ -0,0 +1,5 @@ +from distutils.core import setup, Extension + +setup(name="lzx", version="1.0", + ext_modules=[Extension('lzx', sources=['lzx/lzxmodule.c', 'lzx/lzxd.c'], + include_dirs=['lzx'])]) diff --git a/src/calibre/utils/lzx/lzx.h b/src/calibre/utils/lzx/lzx.h new file mode 100644 index 0000000000..15ae17c0aa --- /dev/null +++ b/src/calibre/utils/lzx/lzx.h @@ -0,0 +1,169 @@ +/* This file is part of libmspack. + * (C) 2003-2004 Stuart Caie. + * + * The LZX method was created by Jonathan Forbes and Tomi Poutanen, adapted + * by Microsoft Corporation. + * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * For further details, see the file COPYING.LIB distributed with libmspack + */ + +#include + +#ifndef MSPACK_LZX_H +#define MSPACK_LZX_H 1 + +/* LZX compression / decompression definitions */ + +/* some constants defined by the LZX specification */ +#define LZX_MIN_MATCH (2) +#define LZX_MAX_MATCH (257) +#define LZX_NUM_CHARS (256) +#define LZX_BLOCKTYPE_INVALID (0) /* also blocktypes 4-7 invalid */ +#define LZX_BLOCKTYPE_VERBATIM (1) +#define LZX_BLOCKTYPE_ALIGNED (2) +#define LZX_BLOCKTYPE_UNCOMPRESSED (3) +#define LZX_PRETREE_NUM_ELEMENTS (20) +#define LZX_ALIGNED_NUM_ELEMENTS (8) /* aligned offset tree #elements */ +#define LZX_NUM_PRIMARY_LENGTHS (7) /* this one missing from spec! */ +#define LZX_NUM_SECONDARY_LENGTHS (249) /* length tree #elements */ + +/* LZX huffman defines: tweak tablebits as desired */ +#define LZX_PRETREE_MAXSYMBOLS (LZX_PRETREE_NUM_ELEMENTS) +#define LZX_PRETREE_TABLEBITS (6) +#define LZX_MAINTREE_MAXSYMBOLS (LZX_NUM_CHARS + 50*8) +#define LZX_MAINTREE_TABLEBITS (12) +#define LZX_LENGTH_MAXSYMBOLS (LZX_NUM_SECONDARY_LENGTHS+1) +#define LZX_LENGTH_TABLEBITS (12) +#define LZX_ALIGNED_MAXSYMBOLS (LZX_ALIGNED_NUM_ELEMENTS) +#define LZX_ALIGNED_TABLEBITS (7) +#define LZX_LENTABLE_SAFETY (64) /* table decoding overruns are allowed */ + +#define LZX_FRAME_SIZE (32768) /* the size of a frame in LZX */ + +struct lzxd_stream { + struct mspack_system *sys; /* I/O routines */ + struct mspack_file *input; /* input file handle */ + struct mspack_file *output; /* output file handle */ + + off_t offset; /* number of bytes actually output */ + off_t length; /* overall decompressed length of stream */ + + unsigned char *window; /* decoding window */ + unsigned int window_size; /* window size */ + unsigned int window_posn; /* decompression offset within window */ + unsigned int frame_posn; /* current frame offset within in window */ + unsigned int frame; /* the number of 32kb frames processed */ + unsigned int reset_interval; /* which frame do we reset the compressor? */ + + unsigned int R0, R1, R2; /* for the LRU offset system */ + unsigned int block_length; /* uncompressed length of this LZX block */ + unsigned int block_remaining; /* uncompressed bytes still left to decode */ + + signed int intel_filesize; /* magic header value used for transform */ + signed int intel_curpos; /* current offset in transform space */ + + unsigned char intel_started; /* has intel E8 decoding started? */ + unsigned char block_type; /* type of the current block */ + unsigned char header_read; /* have we started decoding at all yet? */ + unsigned char posn_slots; /* how many posn slots in stream? */ + unsigned char input_end; /* have we reached the end of input? 
*/ + + int error; + + /* I/O buffering */ + unsigned char *inbuf, *i_ptr, *i_end, *o_ptr, *o_end; + unsigned int bit_buffer, bits_left, inbuf_size; + + /* huffman code lengths */ + unsigned char PRETREE_len [LZX_PRETREE_MAXSYMBOLS + LZX_LENTABLE_SAFETY]; + unsigned char MAINTREE_len [LZX_MAINTREE_MAXSYMBOLS + LZX_LENTABLE_SAFETY]; + unsigned char LENGTH_len [LZX_LENGTH_MAXSYMBOLS + LZX_LENTABLE_SAFETY]; + unsigned char ALIGNED_len [LZX_ALIGNED_MAXSYMBOLS + LZX_LENTABLE_SAFETY]; + + /* huffman decoding tables */ + unsigned short PRETREE_table [(1 << LZX_PRETREE_TABLEBITS) + + (LZX_PRETREE_MAXSYMBOLS * 2)]; + unsigned short MAINTREE_table[(1 << LZX_MAINTREE_TABLEBITS) + + (LZX_MAINTREE_MAXSYMBOLS * 2)]; + unsigned short LENGTH_table [(1 << LZX_LENGTH_TABLEBITS) + + (LZX_LENGTH_MAXSYMBOLS * 2)]; + unsigned short ALIGNED_table [(1 << LZX_ALIGNED_TABLEBITS) + + (LZX_ALIGNED_MAXSYMBOLS * 2)]; + + /* this is used purely for doing the intel E8 transform */ + unsigned char e8_buf[LZX_FRAME_SIZE]; +}; + +/* allocates LZX decompression state for decoding the given stream. + * + * - returns NULL if window_bits is outwith the range 15 to 21 (inclusive). + * + * - uses system->alloc() to allocate memory + * + * - returns NULL if not enough memory + * + * - window_bits is the size of the LZX window, from 32Kb (15) to 2Mb (21). + * + * - reset_interval is how often the bitstream is reset, measured in + * multiples of 32Kb bytes output. For CAB LZX streams, this is always 0 + * (does not occur). + * + * - input_buffer_size is how many bytes to use as an input bitstream buffer + * + * - output_length is the length in bytes of the entirely decompressed + * output stream, if known in advance. It is used to correctly perform + * the Intel E8 transformation, which must stop 6 bytes before the very + * end of the decompressed stream. It is not otherwise used or adhered + * to. If the full decompressed length is known in advance, set it here. + * If it is NOT known, use the value 0, and call lzxd_set_output_length() + * once it is known. If never set, 4 of the final 6 bytes of the output + * stream may be incorrect. + */ +extern struct lzxd_stream *lzxd_init(struct mspack_system *system, + struct mspack_file *input, + struct mspack_file *output, + int window_bits, + int reset_interval, + int input_buffer_size, + off_t output_length); + +/* see description of output_length in lzxd_init() */ +extern void lzxd_set_output_length(struct lzxd_stream *lzx, + off_t output_length); + +/* decompresses, or decompresses more of, an LZX stream. + * + * - out_bytes of data will be decompressed and the function will return + * with an MSPACK_ERR_OK return code. + * + * - decompressing will stop as soon as out_bytes is reached. if the true + * amount of bytes decoded spills over that amount, they will be kept for + * a later invocation of lzxd_decompress(). + * + * - the output bytes will be passed to the system->write() function given in + * lzxd_init(), using the output file handle given in lzxd_init(). More + * than one call may be made to system->write(). + * + * - LZX will read input bytes as necessary using the system->read() function + * given in lzxd_init(), using the input file handle given in lzxd_init(). + * This will continue until system->read() returns 0 bytes, or an error. + * input streams should convey an "end of input stream" by refusing to + * supply all the bytes that LZX asks for when they reach the end of the + * stream, rather than return an error code. 
+ * + * - if an error code other than MSPACK_ERR_OK is returned, the stream should + * be considered unusable and lzxd_decompress() should not be called again + * on this stream. + */ +extern int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes); + +/* frees all state associated with an LZX data stream + * + * - calls system->free() using the system pointer given in lzxd_init() + */ +void lzxd_free(struct lzxd_stream *lzx); + +#endif diff --git a/src/calibre/utils/lzx/lzxd.c b/src/calibre/utils/lzx/lzxd.c new file mode 100644 index 0000000000..337af441fd --- /dev/null +++ b/src/calibre/utils/lzx/lzxd.c @@ -0,0 +1,905 @@ +/* This file is part of libmspack. + * (C) 2003-2004 Stuart Caie. + * + * The LZX method was created by Jonathan Forbes and Tomi Poutanen, adapted + * by Microsoft Corporation. + * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * For further details, see the file COPYING.LIB distributed with libmspack + */ + +/* LZX decompression implementation */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include + +/* Microsoft's LZX document and their implementation of the + * com.ms.util.cab Java package do not concur. + * + * In the LZX document, there is a table showing the correlation between + * window size and the number of position slots. It states that the 1MB + * window = 40 slots and the 2MB window = 42 slots. In the implementation, + * 1MB = 42 slots, 2MB = 50 slots. The actual calculation is 'find the + * first slot whose position base is equal to or more than the required + * window size'. This would explain why other tables in the document refer + * to 50 slots rather than 42. + * + * The constant NUM_PRIMARY_LENGTHS used in the decompression pseudocode + * is not defined in the specification. + * + * The LZX document does not state the uncompressed block has an + * uncompressed length field. Where does this length field come from, so + * we can know how large the block is? The implementation has it as the 24 + * bits following after the 3 blocktype bits, before the alignment + * padding. + * + * The LZX document states that aligned offset blocks have their aligned + * offset huffman tree AFTER the main and length trees. The implementation + * suggests that the aligned offset tree is BEFORE the main and length + * trees. + * + * The LZX document decoding algorithm states that, in an aligned offset + * block, if an extra_bits value is 1, 2 or 3, then that number of bits + * should be read and the result added to the match offset. This is + * correct for 1 and 2, but not 3, where just a huffman symbol (using the + * aligned tree) should be read. + * + * Regarding the E8 preprocessing, the LZX document states 'No translation + * may be performed on the last 6 bytes of the input block'. This is + * correct. However, the pseudocode provided checks for the *E8 leader* + * up to the last 6 bytes. If the leader appears between -10 and -7 bytes + * from the end, this would cause the next four bytes to be modified, at + * least one of which would be in the last 6 bytes, which is not allowed + * according to the spec. + * + * The specification states that the huffman trees must always contain at + * least one element. However, many CAB files contain blocks where the + * length tree is completely empty (because there are no matches), and + * this is expected to succeed. 
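The ENSURE_BITS/PEEK_BITS/REMOVE_BITS macros defined just below keep a sliding bit window over the input, much like the pure-Python BitReader from the lzxd.py checkpoint this patch deletes (with one difference: the C code consumes the input as 16-bit little-endian words, so the byte order it sees is swapped). The checkpoint's reader is small enough to serve as a reference while following the macros:

    # The BitReader from the deleted pure-Python checkpoint, kept here as a
    # readable counterpart to the bitstream macros below (byte order aside).
    class BitReader(object):
        def __init__(self, data):
            # pad with zero bytes so peek() can always look ahead safely
            self.data, self.pos, self.nbits = \
                data + "\x00\x00\x00\x00", 0, len(data) * 8

        def peek(self, n):
            r, g = 0, 0
            while g < n:
                r = (r << 8) | ord(self.data[(self.pos + g) >> 3])
                g = g + 8 - ((self.pos + g) & 7)
            return (r >> (g - n)) & ((1 << n) - 1)

        def remove(self, n):
            self.pos += n
            return self.pos <= self.nbits

        def read(self, n):
            val = self.peek(n)
            self.remove(n)
            return val

    bits = BitReader("\xAB\xCD")                 # 10101011 11001101
    assert (bits.read(4), bits.read(8), bits.read(4)) == (0xA, 0xBC, 0xD)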
+ */ + + +/* LZX decompressor input macros + * + * STORE_BITS stores bitstream state in lzxd_stream structure + * RESTORE_BITS restores bitstream state from lzxd_stream structure + * READ_BITS(var,n) takes N bits from the buffer and puts them in var + * ENSURE_BITS(n) ensures there are at least N bits in the bit buffer. + * PEEK_BITS(n) extracts without removing N bits from the bit buffer + * REMOVE_BITS(n) removes N bits from the bit buffer + * + * These bit access routines work by using the area beyond the MSB and the + * LSB as a free source of zeroes when shifting. This avoids having to + * mask any bits. So we have to know the bit width of the bit buffer + * variable. + * + * The bit buffer datatype should be at least 32 bits wide: it must be + * possible to ENSURE_BITS(16), so it must be possible to add 16 new bits + * to the bit buffer when the bit buffer already has 1 to 15 bits left. + */ + +#if HAVE_LIMITS_H +# include +#endif +#ifndef CHAR_BIT +# define CHAR_BIT (8) +#endif +#define BITBUF_WIDTH (sizeof(bit_buffer) * CHAR_BIT) + +#define STORE_BITS do { \ + lzx->i_ptr = i_ptr; \ + lzx->i_end = i_end; \ + lzx->bit_buffer = bit_buffer; \ + lzx->bits_left = bits_left; \ +} while (0) + +#define RESTORE_BITS do { \ + i_ptr = lzx->i_ptr; \ + i_end = lzx->i_end; \ + bit_buffer = lzx->bit_buffer; \ + bits_left = lzx->bits_left; \ +} while (0) + +#define ENSURE_BITS(nbits) \ + while (bits_left < (nbits)) { \ + if (i_ptr >= i_end) { \ + if (lzxd_read_input(lzx)) return lzx->error; \ + i_ptr = lzx->i_ptr; \ + i_end = lzx->i_end; \ + } \ + bit_buffer |= ((i_ptr[1] << 8) | i_ptr[0]) \ + << (BITBUF_WIDTH - 16 - bits_left); \ + bits_left += 16; \ + i_ptr += 2; \ + } + +#define PEEK_BITS(nbits) (bit_buffer >> (BITBUF_WIDTH - (nbits))) + +#define REMOVE_BITS(nbits) ((bit_buffer <<= (nbits)), (bits_left -= (nbits))) + +#define READ_BITS(val, nbits) do { \ + ENSURE_BITS(nbits); \ + (val) = PEEK_BITS(nbits); \ + REMOVE_BITS(nbits); \ +} while (0) + +static int lzxd_read_input(struct lzxd_stream *lzx) { + int read = lzx->sys->read(lzx->input, &lzx->inbuf[0], (int)lzx->inbuf_size); + if (read < 0) return lzx->error = MSPACK_ERR_READ; + + /* huff decode's ENSURE_BYTES(16) might overrun the input stream, even + * if those bits aren't used, so fake 2 more bytes */ + if (read == 0) { + if (lzx->input_end) { + D(("out of input bytes")) + return lzx->error = MSPACK_ERR_READ; + } + else { + read = 2; + lzx->inbuf[0] = lzx->inbuf[1] = 0; + lzx->input_end = 1; + } + } + + lzx->i_ptr = &lzx->inbuf[0]; + lzx->i_end = &lzx->inbuf[read]; + + return MSPACK_ERR_OK; +} + +/* Huffman decoding macros */ + +/* READ_HUFFSYM(tablename, var) decodes one huffman symbol from the + * bitstream using the stated table and puts it in var. + */ +#define READ_HUFFSYM(tbl, var) do { \ + /* huffman symbols can be up to 16 bits long */ \ + ENSURE_BITS(16); \ + /* immediate table lookup of [tablebits] bits of the code */ \ + sym = lzx->tbl##_table[PEEK_BITS(LZX_##tbl##_TABLEBITS)]; \ + /* is the symbol is longer than [tablebits] bits? (i=node index) */ \ + if (sym >= LZX_##tbl##_MAXSYMBOLS) { \ + /* decode remaining bits by tree traversal */ \ + i = 1 << (BITBUF_WIDTH - LZX_##tbl##_TABLEBITS); \ + do { \ + /* one less bit. error if we run out of bits before decode */ \ + i >>= 1; \ + if (i == 0) { \ + D(("out of bits in huffman decode")) \ + return lzx->error = MSPACK_ERR_DECRUNCH; \ + } \ + /* double node index and add 0 (left branch) or 1 (right) */ \ + sym <<= 1; sym |= (bit_buffer & i) ? 
1 : 0; \ + /* hop to next node index / decoded symbol */ \ + sym = lzx->tbl##_table[sym]; \ + /* while we are still in node indicies, not decoded symbols */ \ + } while (sym >= LZX_##tbl##_MAXSYMBOLS); \ + } \ + /* result */ \ + (var) = sym; \ + /* look up the code length of that symbol and discard those bits */ \ + i = lzx->tbl##_len[sym]; \ + REMOVE_BITS(i); \ +} while (0) + +/* BUILD_TABLE(tbl) builds a huffman lookup table from code lengths */ +#define BUILD_TABLE(tbl) \ + if (make_decode_table(LZX_##tbl##_MAXSYMBOLS, LZX_##tbl##_TABLEBITS, \ + &lzx->tbl##_len[0], &lzx->tbl##_table[0])) \ + { \ + D(("failed to build %s table", #tbl)) \ + return lzx->error = MSPACK_ERR_DECRUNCH; \ + } + +/* make_decode_table(nsyms, nbits, length[], table[]) + * + * This function was coded by David Tritscher. It builds a fast huffman + * decoding table from a canonical huffman code lengths table. + * + * nsyms = total number of symbols in this huffman tree. + * nbits = any symbols with a code length of nbits or less can be decoded + * in one lookup of the table. + * length = A table to get code lengths from [0 to syms-1] + * table = The table to fill up with decoded symbols and pointers. + * + * Returns 0 for OK or 1 for error + */ + +static int make_decode_table(unsigned int nsyms, unsigned int nbits, + unsigned char *length, unsigned short *table) +{ + register unsigned short sym; + register unsigned int leaf, fill; + register unsigned char bit_num; + unsigned int pos = 0; /* the current position in the decode table */ + unsigned int table_mask = 1 << nbits; + unsigned int bit_mask = table_mask >> 1; /* don't do 0 length codes */ + unsigned int next_symbol = bit_mask; /* base of allocation for long codes */ + + /* fill entries for codes short enough for a direct mapping */ + for (bit_num = 1; bit_num <= nbits; bit_num++) { + for (sym = 0; sym < nsyms; sym++) { + if (length[sym] != bit_num) continue; + leaf = pos; + if((pos += bit_mask) > table_mask) return 1; /* table overrun */ + /* fill all possible lookups of this symbol with the symbol itself */ + for (fill = bit_mask; fill-- > 0;) table[leaf++] = sym; + } + bit_mask >>= 1; + } + + /* full table already? */ + if (pos == table_mask) return 0; + + /* clear the remainder of the table */ + for (sym = pos; sym < table_mask; sym++) table[sym] = 0xFFFF; + + /* allow codes to be up to nbits+16 long, instead of nbits */ + pos <<= 16; + table_mask <<= 16; + bit_mask = 1 << 15; + + for (bit_num = nbits+1; bit_num <= 16; bit_num++) { + for (sym = 0; sym < nsyms; sym++) { + if (length[sym] != bit_num) continue; + + leaf = pos >> 16; + for (fill = 0; fill < bit_num - nbits; fill++) { + /* if this path hasn't been taken yet, 'allocate' two entries */ + if (table[leaf] == 0xFFFF) { + table[(next_symbol << 1)] = 0xFFFF; + table[(next_symbol << 1) + 1] = 0xFFFF; + table[leaf] = next_symbol++; + } + /* follow the path and select either left or right for next bit */ + leaf = table[leaf] << 1; + if ((pos >> (15-fill)) & 1) leaf++; + } + table[leaf] = sym; + + if ((pos += bit_mask) > table_mask) return 1; /* table overflow */ + } + bit_mask >>= 1; + } + + /* full table? */ + if (pos == table_mask) return 0; + + /* either erroneous table, or all elements are 0 - let's find out. */ + for (sym = 0; sym < nsyms; sym++) if (length[sym]) return 1; + return 0; +} + + +/* READ_LENGTHS(tablename, first, last) reads in code lengths for symbols + * first to last in the given table. The code lengths are stored in their + * own special LZX way. 
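make_decode_table() above fills the table in two passes: codes no longer than nbits are written directly, with every table slot that shares the code's prefix pointing at the symbol, and longer codes get a small tree grafted on past the direct-mapped area. A Python sketch of just the direct-mapped pass, under the same canonical-code assumption (the overflow-tree half is omitted):

    # Python sketch of the direct-mapped fill in make_decode_table() above;
    # codes longer than 'nbits' (the tree overflow path) are not handled here.
    def direct_decode_table(lengths, nbits):
        table = [None] * (1 << nbits)
        pos = 0                                   # next free slot in the table
        bit_mask = (1 << nbits) >> 1              # slots consumed by a 1-bit code
        for bit_num in range(1, nbits + 1):
            for sym, length in enumerate(lengths):
                if length != bit_num:
                    continue
                if pos + bit_mask > (1 << nbits):
                    raise ValueError('table overrun: lengths not canonical')
                for leaf in range(pos, pos + bit_mask):
                    table[leaf] = sym             # every prefix match decodes to sym
                pos += bit_mask
            bit_mask >>= 1
        return table

    # Symbols 0..2 with code lengths 1, 2, 2 get canonical codes 0, 10, 11;
    # a 2-bit lookup table then reads 00->0, 01->0, 10->1, 11->2.
    assert direct_decode_table([1, 2, 2], 2) == [0, 0, 1, 2]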
+ */ +#define READ_LENGTHS(tbl, first, last) do { \ + STORE_BITS; \ + if (lzxd_read_lens(lzx, &lzx->tbl##_len[0], (first), \ + (unsigned int)(last))) return lzx->error; \ + RESTORE_BITS; \ +} while (0) + +static int lzxd_read_lens(struct lzxd_stream *lzx, unsigned char *lens, + unsigned int first, unsigned int last) +{ + /* bit buffer and huffman symbol decode variables */ + register unsigned int bit_buffer; + register int bits_left, i; + register unsigned short sym; + unsigned char *i_ptr, *i_end; + + unsigned int x, y; + int z; + + RESTORE_BITS; + + /* read lengths for pretree (20 symbols, lengths stored in fixed 4 bits) */ + for (x = 0; x < 20; x++) { + READ_BITS(y, 4); + lzx->PRETREE_len[x] = y; + } + BUILD_TABLE(PRETREE); + + for (x = first; x < last; ) { + READ_HUFFSYM(PRETREE, z); + if (z == 17) { + /* code = 17, run of ([read 4 bits]+4) zeros */ + READ_BITS(y, 4); y += 4; + while (y--) lens[x++] = 0; + } + else if (z == 18) { + /* code = 18, run of ([read 5 bits]+20) zeros */ + READ_BITS(y, 5); y += 20; + while (y--) lens[x++] = 0; + } + else if (z == 19) { + /* code = 19, run of ([read 1 bit]+4) [read huffman symbol] */ + READ_BITS(y, 1); y += 4; + READ_HUFFSYM(PRETREE, z); + z = lens[x] - z; if (z < 0) z += 17; + while (y--) lens[x++] = z; + } + else { + /* code = 0 to 16, delta current length entry */ + z = lens[x] - z; if (z < 0) z += 17; + lens[x++] = z; + } + } + + STORE_BITS; + + return MSPACK_ERR_OK; +} + +/* LZX static data tables: + * + * LZX uses 'position slots' to represent match offsets. For every match, + * a small 'position slot' number and a small offset from that slot are + * encoded instead of one large offset. + * + * position_base[] is an index to the position slot bases + * + * extra_bits[] states how many bits of offset-from-base data is needed. + */ +static unsigned int position_base[51]; +static unsigned char extra_bits[51]; + +static void lzxd_static_init(void) { + int i, j; + + for (i = 0, j = 0; i < 51; i += 2) { + extra_bits[i] = j; /* 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7... */ + extra_bits[i+1] = j; + if ((i != 0) && (j < 17)) j++; /* 0,0,1,2,3,4...15,16,17,17,17,17... */ + } + + for (i = 0, j = 0; i < 51; i++) { + position_base[i] = j; /* 0,1,2,3,4,6,8,12,16,24,32,... */ + j += 1 << extra_bits[i]; /* 1,1,1,1,2,2,4,4,8,8,16,16,32,32,... 
*/ + } +} + +static void lzxd_reset_state(struct lzxd_stream *lzx) { + int i; + + lzx->R0 = 1; + lzx->R1 = 1; + lzx->R2 = 1; + lzx->header_read = 0; + lzx->block_remaining = 0; + lzx->block_type = LZX_BLOCKTYPE_INVALID; + + /* initialise tables to 0 (because deltas will be applied to them) */ + for (i = 0; i < LZX_MAINTREE_MAXSYMBOLS; i++) lzx->MAINTREE_len[i] = 0; + for (i = 0; i < LZX_LENGTH_MAXSYMBOLS; i++) lzx->LENGTH_len[i] = 0; +} + +/*-------- main LZX code --------*/ + +struct lzxd_stream *lzxd_init(struct mspack_system *system, + struct mspack_file *input, + struct mspack_file *output, + int window_bits, + int reset_interval, + int input_buffer_size, + off_t output_length) +{ + unsigned int window_size = 1 << window_bits; + struct lzxd_stream *lzx; + + if (!system) return NULL; + + /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */ + if (window_bits < 15 || window_bits > 21) return NULL; + + input_buffer_size = (input_buffer_size + 1) & -2; + if (!input_buffer_size) return NULL; + + /* initialise static data */ + lzxd_static_init(); + + /* allocate decompression state */ + if (!(lzx = system->alloc(system, sizeof(struct lzxd_stream)))) { + return NULL; + } + + /* allocate decompression window and input buffer */ + lzx->window = system->alloc(system, (size_t) window_size); + lzx->inbuf = system->alloc(system, (size_t) input_buffer_size); + if (!lzx->window || !lzx->inbuf) { + system->free(lzx->window); + system->free(lzx->inbuf); + system->free(lzx); + return NULL; + } + + /* initialise decompression state */ + lzx->sys = system; + lzx->input = input; + lzx->output = output; + lzx->offset = 0; + lzx->length = output_length; + + lzx->inbuf_size = input_buffer_size; + lzx->window_size = 1 << window_bits; + lzx->window_posn = 0; + lzx->frame_posn = 0; + lzx->frame = 0; + lzx->reset_interval = reset_interval; + lzx->intel_filesize = 0; + lzx->intel_curpos = 0; + + /* window bits: 15 16 17 18 19 20 21 + * position slots: 30 32 34 36 38 42 50 */ + lzx->posn_slots = ((window_bits == 21) ? 50 : + ((window_bits == 20) ? 
42 : (window_bits << 1))); + lzx->intel_started = 0; + lzx->input_end = 0; + + lzx->error = MSPACK_ERR_OK; + + lzx->i_ptr = lzx->i_end = &lzx->inbuf[0]; + lzx->o_ptr = lzx->o_end = &lzx->e8_buf[0]; + lzx->bit_buffer = lzx->bits_left = 0; + + lzxd_reset_state(lzx); + return lzx; +} + +void lzxd_set_output_length(struct lzxd_stream *lzx, off_t out_bytes) { + if (lzx) lzx->length = out_bytes; +} + +int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { + /* bitstream reading and huffman variables */ + register unsigned int bit_buffer; + register int bits_left, i=0; + register unsigned short sym; + unsigned char *i_ptr, *i_end; + + int match_length, length_footer, extra, verbatim_bits, bytes_todo; + int this_run, main_element, aligned_bits, j; + unsigned char *window, *runsrc, *rundest, buf[12]; + unsigned int frame_size=0, end_frame, match_offset, window_posn; + unsigned int R0, R1, R2; + + /* easy answers */ + if (!lzx || (out_bytes < 0)) return MSPACK_ERR_ARGS; + if (lzx->error) return lzx->error; + + /* flush out any stored-up bytes before we begin */ + i = lzx->o_end - lzx->o_ptr; + if ((off_t) i > out_bytes) i = (int) out_bytes; + if (i) { + if (lzx->sys->write(lzx->output, lzx->o_ptr, i) != i) { + return lzx->error = MSPACK_ERR_WRITE; + } + lzx->o_ptr += i; + lzx->offset += i; + out_bytes -= i; + } + if (out_bytes == 0) return MSPACK_ERR_OK; + + /* restore local state */ + RESTORE_BITS; + window = lzx->window; + window_posn = lzx->window_posn; + R0 = lzx->R0; + R1 = lzx->R1; + R2 = lzx->R2; + + end_frame = (unsigned int)((lzx->offset + out_bytes) / LZX_FRAME_SIZE) + 1; + + while (lzx->frame < end_frame) { + /* have we reached the reset interval? (if there is one?) */ + if (lzx->reset_interval && ((lzx->frame % lzx->reset_interval) == 0)) { + if (lzx->block_remaining) { + D(("%d bytes remaining at reset interval", lzx->block_remaining)) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + + /* re-read the intel header and reset the huffman lengths */ + lzxd_reset_state(lzx); + } + + /* read header if necessary */ + if (!lzx->header_read) { + /* read 1 bit. if bit=0, intel filesize = 0. + * if bit=1, read intel filesize (32 bits) */ + j = 0; READ_BITS(i, 1); if (i) { READ_BITS(i, 16); READ_BITS(j, 16); } + lzx->intel_filesize = (i << 16) | j; + lzx->header_read = 1; + } + + /* calculate size of frame: all frames are 32k except the final frame + * which is 32kb or less. this can only be calculated when lzx->length + * has been filled in. 
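+       *
+       * As a worked example (with illustrative numbers, not values from
+       * the original comment): if LZX_FRAME_SIZE is 32768, lzx->length is
+       * 100000 and lzx->offset is 98304, the final frame is
+       * 100000 - 98304 = 1696 bytes instead of a full 32768-byte frame.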
*/ + frame_size = LZX_FRAME_SIZE; + if (lzx->length && (lzx->length - lzx->offset) < (off_t)frame_size) { + frame_size = lzx->length - lzx->offset; + } + + /* decode until one more frame is available */ + bytes_todo = lzx->frame_posn + frame_size - window_posn; + while (bytes_todo > 0) { + /* initialise new block, if one is needed */ + if (lzx->block_remaining == 0) { + /* realign if previous block was an odd-sized UNCOMPRESSED block */ + if ((lzx->block_type == LZX_BLOCKTYPE_UNCOMPRESSED) && + (lzx->block_length & 1)) + { + if (i_ptr == i_end) { + if (lzxd_read_input(lzx)) return lzx->error; + i_ptr = lzx->i_ptr; + i_end = lzx->i_end; + } + i_ptr++; + } + + /* read block type (3 bits) and block length (24 bits) */ + READ_BITS(lzx->block_type, 3); + READ_BITS(i, 16); READ_BITS(j, 8); + lzx->block_remaining = lzx->block_length = (i << 8) | j; + /*D(("new block t%d len %u", lzx->block_type, lzx->block_length))*/ + + /* read individual block headers */ + switch (lzx->block_type) { + case LZX_BLOCKTYPE_ALIGNED: + /* read lengths of and build aligned huffman decoding tree */ + for (i = 0; i < 8; i++) { READ_BITS(j, 3); lzx->ALIGNED_len[i] = j; } + BUILD_TABLE(ALIGNED); + /* no break -- rest of aligned header is same as verbatim */ + case LZX_BLOCKTYPE_VERBATIM: + /* read lengths of and build main huffman decoding tree */ + READ_LENGTHS(MAINTREE, 0, 256); + READ_LENGTHS(MAINTREE, 256, LZX_NUM_CHARS + (lzx->posn_slots << 3)); + BUILD_TABLE(MAINTREE); + /* if the literal 0xE8 is anywhere in the block... */ + if (lzx->MAINTREE_len[0xE8] != 0) lzx->intel_started = 1; + /* read lengths of and build lengths huffman decoding tree */ + READ_LENGTHS(LENGTH, 0, LZX_NUM_SECONDARY_LENGTHS); + BUILD_TABLE(LENGTH); + break; + + case LZX_BLOCKTYPE_UNCOMPRESSED: + /* because we can't assume otherwise */ + lzx->intel_started = 1; + + /* read 1-16 (not 0-15) bits to align to bytes */ + ENSURE_BITS(16); + if (bits_left > 16) i_ptr -= 2; + bits_left = 0; bit_buffer = 0; + + /* read 12 bytes of stored R0 / R1 / R2 values */ + for (rundest = &buf[0], i = 0; i < 12; i++) { + if (i_ptr == i_end) { + if (lzxd_read_input(lzx)) return lzx->error; + i_ptr = lzx->i_ptr; + i_end = lzx->i_end; + } + *rundest++ = *i_ptr++; + } + R0 = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); + R1 = buf[4] | (buf[5] << 8) | (buf[6] << 16) | (buf[7] << 24); + R2 = buf[8] | (buf[9] << 8) | (buf[10] << 16) | (buf[11] << 24); + break; + + default: + D(("bad block type")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + } + + /* decode more of the block: + * run = min(what's available, what's needed) */ + this_run = lzx->block_remaining; + if (this_run > bytes_todo) this_run = bytes_todo; + + /* assume we decode exactly this_run bytes, for now */ + bytes_todo -= this_run; + lzx->block_remaining -= this_run; + + /* decode at least this_run bytes */ + switch (lzx->block_type) { + case LZX_BLOCKTYPE_VERBATIM: + while (this_run > 0) { + READ_HUFFSYM(MAINTREE, main_element); + if (main_element < LZX_NUM_CHARS) { + /* literal: 0 to LZX_NUM_CHARS-1 */ + window[window_posn++] = main_element; + this_run--; + } + else { + /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ + main_element -= LZX_NUM_CHARS; + + /* get match length */ + match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; + if (match_length == LZX_NUM_PRIMARY_LENGTHS) { + READ_HUFFSYM(LENGTH, length_footer); + match_length += length_footer; + } + match_length += LZX_MIN_MATCH; + + /* get match offset */ + switch ((match_offset = (main_element >> 3))) { + case 0: 
match_offset = R0; break; + case 1: match_offset = R1; R1=R0; R0 = match_offset; break; + case 2: match_offset = R2; R2=R0; R0 = match_offset; break; + case 3: match_offset = 1; R2=R1; R1=R0; R0 = match_offset; break; + default: + extra = extra_bits[match_offset]; + READ_BITS(verbatim_bits, extra); + match_offset = position_base[match_offset] - 2 + verbatim_bits; + R2 = R1; R1 = R0; R0 = match_offset; + } + + if ((window_posn + match_length) > lzx->window_size) { + D(("match ran over window wrap")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + + /* copy match */ + rundest = &window[window_posn]; + i = match_length; + /* does match offset wrap the window? */ + if (match_offset > window_posn) { + /* j = length from match offset to end of window */ + j = match_offset - window_posn; + if (j > (int) lzx->window_size) { + D(("match offset beyond window boundaries")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + runsrc = &window[lzx->window_size - j]; + if (j < i) { + /* if match goes over the window edge, do two copy runs */ + i -= j; while (j-- > 0) *rundest++ = *runsrc++; + runsrc = window; + } + while (i-- > 0) *rundest++ = *runsrc++; + } + else { + runsrc = rundest - match_offset; + while (i-- > 0) *rundest++ = *runsrc++; + } + + this_run -= match_length; + window_posn += match_length; + } + } /* while (this_run > 0) */ + break; + + case LZX_BLOCKTYPE_ALIGNED: + while (this_run > 0) { + READ_HUFFSYM(MAINTREE, main_element); + if (main_element < LZX_NUM_CHARS) { + /* literal: 0 to LZX_NUM_CHARS-1 */ + window[window_posn++] = main_element; + this_run--; + } + else { + /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ + main_element -= LZX_NUM_CHARS; + + /* get match length */ + match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; + if (match_length == LZX_NUM_PRIMARY_LENGTHS) { + READ_HUFFSYM(LENGTH, length_footer); + match_length += length_footer; + } + match_length += LZX_MIN_MATCH; + + /* get match offset */ + switch ((match_offset = (main_element >> 3))) { + case 0: match_offset = R0; break; + case 1: match_offset = R1; R1 = R0; R0 = match_offset; break; + case 2: match_offset = R2; R2 = R0; R0 = match_offset; break; + default: + extra = extra_bits[match_offset]; + match_offset = position_base[match_offset] - 2; + if (extra > 3) { + /* verbatim and aligned bits */ + extra -= 3; + READ_BITS(verbatim_bits, extra); + match_offset += (verbatim_bits << 3); + READ_HUFFSYM(ALIGNED, aligned_bits); + match_offset += aligned_bits; + } + else if (extra == 3) { + /* aligned bits only */ + READ_HUFFSYM(ALIGNED, aligned_bits); + match_offset += aligned_bits; + } + else if (extra > 0) { /* extra==1, extra==2 */ + /* verbatim bits only */ + READ_BITS(verbatim_bits, extra); + match_offset += verbatim_bits; + } + else /* extra == 0 */ { + /* ??? not defined in LZX specification! */ + match_offset = 1; + } + /* update repeated offset LRU queue */ + R2 = R1; R1 = R0; R0 = match_offset; + } + + if ((window_posn + match_length) > lzx->window_size) { + D(("match ran over window wrap")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + + /* copy match */ + rundest = &window[window_posn]; + i = match_length; + /* does match offset wrap the window? 
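+                 * (an illustrative case with made-up numbers: if
+                 * window_posn is 10 and match_offset is 100, then j is 90,
+                 * so the copy starts 90 bytes before the end of the window
+                 * and, if the match runs past the window edge, continues
+                 * from the start of the window)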
*/ + if (match_offset > window_posn) { + /* j = length from match offset to end of window */ + j = match_offset - window_posn; + if (j > (int) lzx->window_size) { + D(("match offset beyond window boundaries")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + runsrc = &window[lzx->window_size - j]; + if (j < i) { + /* if match goes over the window edge, do two copy runs */ + i -= j; while (j-- > 0) *rundest++ = *runsrc++; + runsrc = window; + } + while (i-- > 0) *rundest++ = *runsrc++; + } + else { + runsrc = rundest - match_offset; + while (i-- > 0) *rundest++ = *runsrc++; + } + + this_run -= match_length; + window_posn += match_length; + } + } /* while (this_run > 0) */ + break; + + case LZX_BLOCKTYPE_UNCOMPRESSED: + /* as this_run is limited not to wrap a frame, this also means it + * won't wrap the window (as the window is a multiple of 32k) */ + rundest = &window[window_posn]; + window_posn += this_run; + while (this_run > 0) { + if ((i = i_end - i_ptr)) { + if (i > this_run) i = this_run; + lzx->sys->copy(i_ptr, rundest, (size_t) i); + rundest += i; + i_ptr += i; + this_run -= i; + } + else { + if (lzxd_read_input(lzx)) return lzx->error; + i_ptr = lzx->i_ptr; + i_end = lzx->i_end; + } + } + break; + + default: + D(("Default Here.")); + return lzx->error = MSPACK_ERR_DECRUNCH; /* might as well */ + } + + /* did the final match overrun our desired this_run length? */ + if (this_run < 0) { + if ((unsigned int)(-this_run) > lzx->block_remaining) { + D(("overrun went past end of block by %d (%d remaining)", + -this_run, lzx->block_remaining )) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + lzx->block_remaining -= -this_run; + } + } /* while (bytes_todo > 0) */ + + /* streams don't extend over frame boundaries */ + if ((window_posn - lzx->frame_posn) != frame_size) { + D(("decode beyond output frame limits! %d != %d", + window_posn - lzx->frame_posn, frame_size)) + /* Ignored */ +#if 0 + return lzx->error = MSPACK_ERR_DECRUNCH; +#endif + } + + /* re-align input bitstream */ + if (bits_left > 0) ENSURE_BITS(16); + if (bits_left & 15) REMOVE_BITS(bits_left & 15); + + /* check that we've used all of the previous frame first */ + if (lzx->o_ptr != lzx->o_end) { + D(("%d avail bytes, new %d frame", lzx->o_end-lzx->o_ptr, frame_size)) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + + /* does this intel block _really_ need decoding? */ + if (lzx->intel_started && lzx->intel_filesize && + (lzx->frame <= 32768) && (frame_size > 10)) + { + unsigned char *data = &lzx->e8_buf[0]; + unsigned char *dataend = &lzx->e8_buf[frame_size - 10]; + signed int curpos = lzx->intel_curpos; + signed int filesize = lzx->intel_filesize; + signed int abs_off, rel_off; + + /* copy e8 block to the e8 buffer and tweak if needed */ + lzx->o_ptr = data; + lzx->sys->copy(&lzx->window[lzx->frame_posn], data, frame_size); + + while (data < dataend) { + if (*data++ != 0xE8) { curpos++; continue; } + abs_off = data[0] | (data[1]<<8) | (data[2]<<16) | (data[3]<<24); + if ((abs_off >= -curpos) && (abs_off < filesize)) { + rel_off = (abs_off >= 0) ? 
abs_off - curpos : abs_off + filesize; + data[0] = (unsigned char) rel_off; + data[1] = (unsigned char) (rel_off >> 8); + data[2] = (unsigned char) (rel_off >> 16); + data[3] = (unsigned char) (rel_off >> 24); + } + data += 4; + curpos += 5; + } + lzx->intel_curpos += frame_size; + } + else { + lzx->o_ptr = &lzx->window[lzx->frame_posn]; + if (lzx->intel_filesize) lzx->intel_curpos += frame_size; + } + lzx->o_end = &lzx->o_ptr[frame_size]; + + /* write a frame */ + i = (out_bytes < (off_t)frame_size) ? (unsigned int)out_bytes : frame_size; + if (lzx->sys->write(lzx->output, lzx->o_ptr, i) != i) { + return lzx->error = MSPACK_ERR_WRITE; + } + lzx->o_ptr += i; + lzx->offset += i; + out_bytes -= i; + + /* advance frame start position */ + lzx->frame_posn += frame_size; + lzx->frame++; + + /* wrap window / frame position pointers */ + if (window_posn == lzx->window_size) window_posn = 0; + if (lzx->frame_posn == lzx->window_size) lzx->frame_posn = 0; + + } /* while (lzx->frame < end_frame) */ + + if (out_bytes) { + D(("bytes left to output")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + + /* store local state */ + STORE_BITS; + lzx->window_posn = window_posn; + lzx->R0 = R0; + lzx->R1 = R1; + lzx->R2 = R2; + + return MSPACK_ERR_OK; +} + +void lzxd_free(struct lzxd_stream *lzx) { + struct mspack_system *sys; + if (lzx) { + sys = lzx->sys; + sys->free(lzx->inbuf); + sys->free(lzx->window); + sys->free(lzx); + } +} diff --git a/src/calibre/utils/lzx/lzxglue.c b/src/calibre/utils/lzx/lzxglue.c new file mode 100644 index 0000000000..7820c68cbf --- /dev/null +++ b/src/calibre/utils/lzx/lzxglue.c @@ -0,0 +1,172 @@ +/*--[lzxglue.c]---------------------------------------------------------------- + | Copyright (C) 2004 DRS + | + | This file is part of the "openclit" library for processing .LIT files. + | + | "Openclit" is free software; you can redistribute it and/or modify + | it under the terms of the GNU General Public License as published by + | the Free Software Foundation; either version 2 of the License, or + | (at your option) any later version. + | + | This program is distributed in the hope that it will be useful, + | but WITHOUT ANY WARRANTY; without even the implied warranty of + | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + | GNU General Public License for more details. + | + | You should have received a copy of the GNU General Public License + | along with this program; if not, write to the Free Software + | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + | + | The GNU General Public License may also be available at the following + | URL: http://www.gnu.org/licenses/gpl.html +*/ + +/* This provides a "glue" between Stuart Caie's libmspack library and the + * Openclit calls to the earlier LZX library. + * + * This way, I should be able to use the files unmodified. 
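+ *
+ * A minimal usage sketch of the glue interface defined below (window_bits,
+ * the buffer names and the error handling are placeholders, not part of
+ * the original code):
+ *
+ *   LZXinit(window_bits);
+ *   if (LZXdecompress(compressed, plain, compressed_len, plain_len) != 0)
+ *       ...handle the error...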
+ */ +#include +#include +#include "litlib.h" +#include "mspack.h" +#include "lzx.h" + +typedef struct memory_file +{ + unsigned int magic; /* 0xB5 */ + void * buffer; + int total_bytes; + int current_bytes; +} memory_file; + + +void * glue_alloc(struct mspack_system *this, size_t bytes) +{ + void * p; + p = (void *)malloc(bytes); + if (p == NULL) { + lit_error(ERR_R|ERR_LIBC,"Malloc(%d) failed!", bytes); + } + return p; +} + +void glue_free(void * p) +{ + free(p); +} + +void glue_copy(void *src, void *dest, size_t bytes) +{ + memcpy(dest, src, bytes); +} + +struct mspack_file * glue_open(struct mspack_system *this, char *filename, + int mode) +{ + lit_error(0,"MSPACK_OPEN unsupported!"); + return NULL; +} + +void glue_close(struct mspack_file * file) { + return; +} + + +int glue_read(struct mspack_file * file, void * buffer, int bytes) +{ + memory_file * mem; + int remaining; + + mem = (memory_file *)file; + if (mem->magic != 0xB5) return -1; + + remaining = mem->total_bytes - mem->current_bytes; + if (!remaining) return 0; + if (bytes > remaining) bytes = remaining; + memcpy(buffer, (unsigned char *)mem->buffer+mem->current_bytes, bytes); + mem->current_bytes += bytes; + return bytes; +} + +int glue_write(struct mspack_file * file, void * buffer, int bytes) +{ + memory_file * mem; + int remaining; + + mem = (memory_file *)file; + if (mem->magic != 0xB5) return -1; + + remaining = mem->total_bytes - mem->current_bytes; + if (!remaining) return 0; + if (bytes > remaining) { + lit_error(0,"MSPACK_READ tried to write %d bytes, only %d left.", + bytes, remaining); + bytes = remaining; + } + memcpy((unsigned char *)mem->buffer+mem->current_bytes, buffer, bytes); + mem->current_bytes += bytes; + return bytes; +} + +struct mspack_system lzxglue_system = +{ + glue_open, + glue_close, + glue_read, /* Read */ + glue_write, /* Write */ + NULL, /* Seek */ + NULL, /* Tell */ + NULL, /* Message */ + glue_alloc, + glue_free, + glue_copy, + NULL /* Termination */ +}; + +int LZXwindow; +struct lzxd_stream * lzx_stream = NULL; + + +/* Can't really init here,don't know enough */ +int LZXinit(int window) +{ + LZXwindow = window; + lzx_stream = NULL; + + return 0; +} + +/* Doesn't exist. 
Oh well, reinitialize state every time anyway */ +void LZXreset(void) +{ + return; +} + +int LZXdecompress(unsigned char *inbuf, unsigned char *outbuf, + unsigned int inlen, unsigned int outlen) +{ + int err; + memory_file source; + memory_file dest; + + source.magic = 0xB5; + source.buffer = inbuf; + source.current_bytes = 0; + source.total_bytes = inlen; + + dest.magic = 0xB5; + dest.buffer = outbuf; + dest.current_bytes = 0; + dest.total_bytes = outlen; + + lzx_stream = lzxd_init(&lzxglue_system, (struct mspack_file *)&source, + (struct mspack_file *)&dest, LZXwindow, + 0x7fff /* Never reset, I do it */, 4096, outlen); + err = -1; + if (lzx_stream) err = lzxd_decompress(lzx_stream, outlen); + + lzxd_free(lzx_stream); + lzx_stream = NULL; + return err; +} diff --git a/src/calibre/utils/lzx/lzxmodule.c b/src/calibre/utils/lzx/lzxmodule.c new file mode 100644 index 0000000000..44cc91c11d --- /dev/null +++ b/src/calibre/utils/lzx/lzxmodule.c @@ -0,0 +1,206 @@ +#include + +#include +#include + +static char lzx_doc[] = +"Provide basic LZX decompression using the code from libmspack."; + +static PyObject *LzxError = NULL; + +typedef struct memory_file { + unsigned int magic; /* 0xB5 */ + void * buffer; + int total_bytes; + int current_bytes; +} memory_file; + +void * +glue_alloc(struct mspack_system *this, size_t bytes) +{ + void *p = NULL; + p = (void *)malloc(bytes); + if (p == NULL) { + return (void *)PyErr_NoMemory(); + } + return p; +} + +void +glue_free(void *p) +{ + free(p); +} + +void +glue_copy(void *src, void *dest, size_t bytes) +{ + memcpy(dest, src, bytes); +} + +struct mspack_file * +glue_open(struct mspack_system *this, char *filename, int mode) +{ + PyErr_SetString(LzxError, "MSPACK_OPEN unsupported"); + return NULL; +} + +void +glue_close(struct mspack_file *file) +{ + return; +} + +int +glue_read(struct mspack_file *file, void * buffer, int bytes) +{ + memory_file *mem; + int remaining; + + mem = (memory_file *)file; + if (mem->magic != 0xB5) return -1; + + remaining = mem->total_bytes - mem->current_bytes; + if (!remaining) return 0; + if (bytes > remaining) bytes = remaining; + memcpy(buffer, (unsigned char *)mem->buffer + mem->current_bytes, bytes); + mem->current_bytes += bytes; + + return bytes; +} + +int +glue_write(struct mspack_file * file, void * buffer, int bytes) +{ + memory_file *mem; + int remaining; + + mem = (memory_file *)file; + if (mem->magic != 0xB5) return -1; + + remaining = mem->total_bytes - mem->current_bytes; + if (!remaining) return 0; + if (bytes > remaining) { + PyErr_SetString(LzxError, + "MSPACK_WRITE tried to write beyond end of buffer"); + bytes = remaining; + } + memcpy((unsigned char *)mem->buffer + mem->current_bytes, buffer, bytes); + mem->current_bytes += bytes; + return bytes; +} + +struct mspack_system lzxglue_system = { + glue_open, + glue_close, + glue_read, /* Read */ + glue_write, /* Write */ + NULL, /* Seek */ + NULL, /* Tell */ + NULL, /* Message */ + glue_alloc, + glue_free, + glue_copy, + NULL /* Termination */ +}; + + +int LZXwindow = 0; +struct lzxd_stream * lzx_stream = NULL; + +/* Can't really init here, don't know enough */ +static PyObject * +init(PyObject *self, PyObject *args) +{ + int window = 0; + + if (!PyArg_ParseTuple(args, "i", &window)) { + return NULL; + } + + LZXwindow = window; + lzx_stream = NULL; + + Py_RETURN_NONE; +} + +/* Doesn't exist. 
Oh well, reinitialize state every time anyway */ +static PyObject * +reset(PyObject *self, PyObject *args) +{ + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + Py_RETURN_NONE; +} + +//int LZXdecompress(unsigned char *inbuf, unsigned char *outbuf, +// unsigned int inlen, unsigned int outlen) +static PyObject * +decompress(PyObject *self, PyObject *args) +{ + unsigned char *inbuf; + unsigned char *outbuf; + unsigned int inlen; + unsigned int outlen; + int err; + memory_file source; + memory_file dest; + PyObject *retval = NULL; + + if (!PyArg_ParseTuple(args, "s#I", &inbuf, &inlen, &outlen)) { + return NULL; + } + + retval = PyString_FromStringAndSize(NULL, outlen); + if (retval == NULL) { + return NULL; + } + outbuf = (unsigned char *)PyString_AS_STRING(retval); + + source.magic = 0xB5; + source.buffer = inbuf; + source.current_bytes = 0; + source.total_bytes = inlen; + + dest.magic = 0xB5; + dest.buffer = outbuf; + dest.current_bytes = 0; + dest.total_bytes = outlen; + + lzx_stream = lzxd_init(&lzxglue_system, (struct mspack_file *)&source, + (struct mspack_file *)&dest, LZXwindow, + 0x7fff /* Never reset, I do it */, 4096, outlen); + err = -1; + if (lzx_stream) err = lzxd_decompress(lzx_stream, outlen); + + lzxd_free(lzx_stream); + lzx_stream = NULL; + + if (err != MSPACK_ERR_OK) { + Py_DECREF(retval); + PyErr_SetString(LzxError, "LZX decompression failed"); + } + + return retval; +} + +static PyMethodDef lzx_methods[] = { + { "init", &init, METH_VARARGS, "Initialize the LZX decompressor" }, + { "reset", &reset, METH_VARARGS, "Reset the LZX decompressor" }, + { "decompress", &decompress, METH_VARARGS, "Run the LZX decompressor" }, + { NULL, NULL } +}; + +PyMODINIT_FUNC +initlzx(void) +{ + PyObject *m; + + m = Py_InitModule3("lzx", lzx_methods, lzx_doc); + if (m == NULL) return; + LzxError = PyErr_NewException("lzx.LzxError", NULL, NULL); + Py_INCREF(LzxError); + PyModule_AddObject(m, "LzxError", LzxError); +} diff --git a/src/calibre/utils/lzx/mspack.h b/src/calibre/utils/lzx/mspack.h new file mode 100644 index 0000000000..b48623fed0 --- /dev/null +++ b/src/calibre/utils/lzx/mspack.h @@ -0,0 +1,1482 @@ +/* libmspack -- a library for working with Microsoft compression formats. + * (C) 2003-2004 Stuart Caie + * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/** \mainpage + * + * \section intro Introduction + * + * libmspack is a library which provides compressors and decompressors, + * archivers and dearchivers for Microsoft compression formats. 
+ * + * \section formats Formats supported + * + * The following file formats are supported: + * - SZDD files, which use LZSS compression + * - KWAJ files, which use LZSS, LZSS+Huffman or deflate compression + * - .HLP (MS Help) files, which use LZSS compression + * - .CAB (MS Cabinet) files, which use deflate, LZX or Quantum compression + * - .CHM (HTML Help) files, which use LZX compression + * - .LIT (MS EBook) files, which use LZX compression and DES encryption + * + * To determine the capabilities of the library, and the binary + * compatibility version of any particular compressor or decompressor, use + * the mspack_version() function. The UNIX library interface version is + * defined as the highest-versioned library component. + * + * \section starting Getting started + * + * The macro MSPACK_SYS_SELFTEST() should be used to ensure the library can + * be used. In particular, it checks if the caller is using 32-bit file I/O + * when the library is compiled for 64-bit file I/O and vice versa. + * + * If compiled normally, the library includes basic file I/O and memory + * management functionality using the standard C library. This can be + * customised and replaced entirely by creating a mspack_system structure. + * + * A compressor or decompressor for the required format must be + * instantiated before it can be used. Each construction function takes + * one parameter, which is either a pointer to a custom mspack_system + * structure, or NULL to use the default. The instantiation returned, if + * not NULL, contains function pointers (methods) to work with the given + * file format. + * + * For compression: + * - mspack_create_cab_compressor() creates a mscab_compressor + * - mspack_create_chm_compressor() creates a mschm_compressor + * - mspack_create_lit_compressor() creates a mslit_compressor + * - mspack_create_hlp_compressor() creates a mshlp_compressor + * - mspack_create_szdd_compressor() creates a msszdd_compressor + * - mspack_create_kwaj_compressor() creates a mskwaj_compressor + * + * For decompression: + * - mspack_create_cab_decompressor() creates a mscab_decompressor + * - mspack_create_chm_decompressor() creates a mschm_decompressor + * - mspack_create_lit_decompressor() creates a mslit_decompressor + * - mspack_create_hlp_decompressor() creates a mshlp_decompressor + * - mspack_create_szdd_decompressor() creates a msszdd_decompressor + * - mspack_create_kwaj_decompressor() creates a mskwaj_decompressor + * + * Once finished working with a format, each kind of + * compressor/decompressor has its own specific destructor: + * - mspack_destroy_cab_compressor() + * - mspack_destroy_cab_decompressor() + * - mspack_destroy_chm_compressor() + * - mspack_destroy_chm_decompressor() + * - mspack_destroy_lit_compressor() + * - mspack_destroy_lit_decompressor() + * - mspack_destroy_hlp_compressor() + * - mspack_destroy_hlp_decompressor() + * - mspack_destroy_szdd_compressor() + * - mspack_destroy_szdd_decompressor() + * - mspack_destroy_kwaj_compressor() + * - mspack_destroy_kwaj_decompressor() + * + * Destroying a compressor or decompressor does not destroy any objects, + * structures or handles that have been created using that compressor or + * decompressor. Ensure that everything created or opened is destroyed or + * closed before compressor/decompressor is itself destroyed. + * + * \section errors Error codes + * + * All compressors and decompressors use the same set of error codes. Most + * methods return an error code directly. 
For methods which do not + * return error codes directly, the error code can be obtained with the + * last_error() method. + * + * - #MSPACK_ERR_OK is used to indicate success. This error code is defined + * as zero, all other code are non-zero. + * - #MSPACK_ERR_ARGS indicates that a method was called with inappropriate + * arguments. + * - #MSPACK_ERR_OPEN indicates that mspack_system::open() failed. + * - #MSPACK_ERR_READ indicates that mspack_system::read() failed. + * - #MSPACK_ERR_WRITE indicates that mspack_system::write() failed. + * - #MSPACK_ERR_SEEK indicates that mspack_system::seek() failed. + * - #MSPACK_ERR_NOMEMORY indicates that mspack_system::alloc() failed. + * - #MSPACK_ERR_SIGNATURE indicates that the file being read does not + * have the correct "signature". It is probably not a valid file for + * whatever format is being read. + * - #MSPACK_ERR_DATAFORMAT indicates that the file being used or read + * is corrupt. + * - #MSPACK_ERR_CHECKSUM indicates that a data checksum has failed. + * - #MSPACK_ERR_CRUNCH indicates an error occured during compression. + * - #MSPACK_ERR_DECRUNCH indicates an error occured during decompression. + */ + +#ifndef LIB_MSPACK_H +#define LIB_MSPACK_H 1 + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#ifdef _MSC_VER +#include +#else +#include +#endif +/** + * System self-test function, to ensure both library and calling program + * can use one another. + * + * A result of MSPACK_ERR_OK means the library and caller are + * compatible. Any other result indicates that the library and caller are + * not compatible and should not be used. In particular, a value of + * MSPACK_ERR_SEEK means the library and caller use different off_t + * datatypes. + * + * It should be used like so: + * + * @code + * int selftest_result; + * MSPACK_SYS_SELFTEST(selftest_result); + * if (selftest_result != MSPACK_ERR_OK) { + * fprintf(stderr, "incompatible with this build of libmspack\n"); + * exit(0); + * } + * @endcode + * + * @param result an int variable to store the result of the self-test + */ +#define MSPACK_SYS_SELFTEST(result) do { \ + (result) = mspack_sys_selftest_internal(sizeof(off_t)); \ +} while (0) + +/** Part of the MSPACK_SYS_SELFTEST() macro, must not be used directly. */ +extern int mspack_sys_selftest_internal(int); + +/** + * Enquire about the binary compatibility version of a specific interface in + * the library. 
Currently, the following interfaces are defined: + * + * - #MSPACK_VER_LIBRARY: the overall library + * - #MSPACK_VER_SYSTEM: the mspack_system interface + * - #MSPACK_VER_MSCABD: the mscab_decompressor interface + * - #MSPACK_VER_MSCABC: the mscab_compressor interface + * - #MSPACK_VER_MSCHMD: the mschm_decompressor interface + * - #MSPACK_VER_MSCHMC: the mschm_compressor interface + * - #MSPACK_VER_MSLITD: the mslit_decompressor interface + * - #MSPACK_VER_MSLITC: the mslit_compressor interface + * - #MSPACK_VER_MSHLPD: the mshlp_decompressor interface + * - #MSPACK_VER_MSHLPC: the mshlp_compressor interface + * - #MSPACK_VER_MSSZDDD: the msszdd_decompressor interface + * - #MSPACK_VER_MSSZDDC: the msszdd_compressor interface + * - #MSPACK_VER_MSKWAJD: the mskwaj_decompressor interface + * - #MSPACK_VER_MSKWAJC: the mskwaj_compressor interface + * + * The result of the function should be interpreted as follows: + * - -1: this interface is completely unknown to the library + * - 0: this interface is known, but non-functioning + * - 1: this interface has all basic functionality + * - 2, 3, ...: this interface has additional functionality, clearly marked + * in the documentation as "version 2", "version 3" and so on. + * + * @param interface the interface to request current version of + * @return the version of the requested interface + */ +extern int mspack_version(int interface); + +/** Pass to mspack_version() to get the overall library version */ +#define MSPACK_VER_LIBRARY (0) +/** Pass to mspack_version() to get the mspack_system version */ +#define MSPACK_VER_SYSTEM (1) +/** Pass to mspack_version() to get the mscab_decompressor version */ +#define MSPACK_VER_MSCABD (2) +/** Pass to mspack_version() to get the mscab_compressor version */ +#define MSPACK_VER_MSCABC (3) +/** Pass to mspack_version() to get the mschm_decompressor version */ +#define MSPACK_VER_MSCHMD (4) +/** Pass to mspack_version() to get the mschm_compressor version */ +#define MSPACK_VER_MSCHMC (5) +/** Pass to mspack_version() to get the mslit_decompressor version */ +#define MSPACK_VER_MSLITD (6) +/** Pass to mspack_version() to get the mslit_compressor version */ +#define MSPACK_VER_MSLITC (7) +/** Pass to mspack_version() to get the mshlp_decompressor version */ +#define MSPACK_VER_MSHLPD (8) +/** Pass to mspack_version() to get the mshlp_compressor version */ +#define MSPACK_VER_MSHLPC (9) +/** Pass to mspack_version() to get the msszdd_decompressor version */ +#define MSPACK_VER_MSSZDDD (10) +/** Pass to mspack_version() to get the msszdd_compressor version */ +#define MSPACK_VER_MSSZDDC (11) +/** Pass to mspack_version() to get the mskwaj_decompressor version */ +#define MSPACK_VER_MSKWAJD (12) +/** Pass to mspack_version() to get the mskwaj_compressor version */ +#define MSPACK_VER_MSKWAJC (13) + +/* --- file I/O abstraction ------------------------------------------------ */ + +/** + * A structure which abstracts file I/O and memory management. + * + * The library always uses the mspack_system structure for interaction + * with the file system and to allocate, free and copy all memory. It also + * uses it to send literal messages to the library user. + * + * When the library is compiled normally, passing NULL to a compressor or + * decompressor constructor will result in a default mspack_system being + * used, where all methods are implemented with the standard C library. + * However, all constructors support being given a custom created + * mspack_system structure, with the library user's own methods. 
This + * allows for more abstract interaction, such as reading and writing files + * directly to memory, or from a network socket or pipe. + * + * Implementors of an mspack_system structure should read all + * documentation entries for every structure member, and write methods + * which conform to those standards. + */ +struct mspack_system { + /** + * Opens a file for reading, writing, appending or updating. + * + * @param this a self-referential pointer to the mspack_system + * structure whose open() method is being called. If + * this pointer is required by close(), read(), write(), + * seek() or tell(), it should be stored in the result + * structure at this time. + * @param filename the file to be opened. It is passed directly from the + * library caller without being modified, so it is up to + * the caller what this parameter actually represents. + * @param mode one of #MSPACK_SYS_OPEN_READ (open an existing file + * for reading), #MSPACK_SYS_OPEN_WRITE (open a new file + * for writing), #MSPACK_SYS_OPEN_UPDATE (open an existing + * file for reading/writing from the start of the file) or + * #MSPACK_SYS_OPEN_APPEND (open an existing file for + * reading/writing from the end of the file) + * @return a pointer to a mspack_file structure. This structure officially + * contains no members, its true contents are up to the + * mspack_system implementor. It should contain whatever is needed + * for other mspack_system methods to operate. + * @see close(), read(), write(), seek(), tell(), message() + */ + struct mspack_file * (*open)(struct mspack_system *this, + char *filename, + int mode); + + /** + * Closes a previously opened file. If any memory was allocated for this + * particular file handle, it should be freed at this time. + * + * @param file the file to close + * @see open() + */ + void (*close)(struct mspack_file *file); + + /** + * Reads a given number of bytes from an open file. + * + * @param file the file to read from + * @param buffer the location where the read bytes should be stored + * @param bytes the number of bytes to read from the file. + * @return the number of bytes successfully read (this can be less than + * the number requested), zero to mark the end of file, or less + * than zero to indicate an error. + * @see open(), write() + */ + int (*read)(struct mspack_file *file, + void *buffer, + int bytes); + + /** + * Writes a given number of bytes to an open file. + * + * @param file the file to write to + * @param buffer the location where the written bytes should be read from + * @param bytes the number of bytes to write to the file. + * @return the number of bytes successfully written, this can be less + * than the number requested. Zero or less can indicate an error + * where no bytes at all could be written. All cases where less + * bytes were written than requested are considered by the library + * to be an error. + * @see open(), read() + */ + int (*write)(struct mspack_file *file, + void *buffer, + int bytes); + + /** + * Seeks to a specific file offset within an open file. + * + * Sometimes the library needs to know the length of a file. It does + * this by seeking to the end of the file with seek(file, 0, + * MSPACK_SYS_SEEK_END), then calling tell(). Implementations may want + * to make a special case for this. + * + * Due to the potentially varying 32/64 bit datatype off_t on some + * architectures, the #MSPACK_SYS_SELFTEST macro MUST be used before + * using the library. 
If not, the error caused by the library passing an + * inappropriate stackframe to seek() is subtle and hard to trace. + * + * @param file the file to be seeked + * @param offset an offset to seek, measured in bytes + * @param mode one of #MSPACK_SYS_SEEK_START (the offset should be + * measured from the start of the file), #MSPACK_SYS_SEEK_CUR + * (the offset should be measured from the current file offset) + * or #MSPACK_SYS_SEEK_END (the offset should be measured from + * the end of the file) + * @return zero for success, non-zero for an error + * @see open(), tell() + */ + int (*seek)(struct mspack_file *file, + off_t offset, + int mode); + + /** + * Returns the current file position (in bytes) of the given file. + * + * @param file the file whose file position is wanted + * @return the current file position of the file + * @see open(), seek() + */ + off_t (*tell)(struct mspack_file *file); + + /** + * Used to send messages from the library to the user. + * + * Occasionally, the library generates warnings or other messages in + * plain english to inform the human user. These are informational only + * and can be ignored if not wanted. + * + * @param file may be a file handle returned from open() if this message + * pertains to a specific open file, or NULL if not related to + * a specific file. + * @param format a printf() style format string. It does NOT include a + * trailing newline. + * @see open() + */ + void (*message)(struct mspack_file *file, + char *format, + ...); + + /** + * Allocates memory. + * + * @param this a self-referential pointer to the mspack_system + * structure whose alloc() method is being called. + * @param bytes the number of bytes to allocate + * @result a pointer to the requested number of bytes, or NULL if + * not enough memory is available + * @see free() + */ + void * (*alloc)(struct mspack_system *this, + size_t bytes); + + /** + * Frees memory. + * + * @param ptr the memory to be freed. + * @see alloc() + */ + void (*free)(void *ptr); + + /** + * Copies from one region of memory to another. + * + * The regions of memory are guaranteed not to overlap, are usually less + * than 256 bytes, and may not be aligned. Please note that the source + * parameter comes before the destination parameter, unlike the standard + * C function memcpy(). + * + * @param src the region of memory to copy from + * @param dest the region of memory to copy to + * @param bytes the size of the memory region, in bytes + */ + void (*copy)(void *src, + void *dest, + size_t bytes); + + /** + * A null pointer to mark the end of mspack_system. It must equal NULL. + * + * Should the mspack_system structure extend in the future, this NULL + * will be seen, rather than have an invalid method pointer called. + */ + void *null_ptr; +}; + +/** mspack_system::open() mode: open existing file for reading. */ +#define MSPACK_SYS_OPEN_READ (0) +/** mspack_system::open() mode: open new file for writing */ +#define MSPACK_SYS_OPEN_WRITE (1) +/** mspack_system::open() mode: open existing file for writing */ +#define MSPACK_SYS_OPEN_UPDATE (2) +/** mspack_system::open() mode: open existing file for writing */ +#define MSPACK_SYS_OPEN_APPEND (3) + +/** mspack_system::seek() mode: seek relative to start of file */ +#define MSPACK_SYS_SEEK_START (0) +/** mspack_system::seek() mode: seek relative to current offset */ +#define MSPACK_SYS_SEEK_CUR (1) +/** mspack_system::seek() mode: seek relative to end of file */ +#define MSPACK_SYS_SEEK_END (2) + +/** + * A structure which represents an open file handle. 
The contents of this + * structure are determined by the implementation of the + * mspack_system::open() method. + */ +struct mspack_file { + int dummy; +}; + +/* --- error codes --------------------------------------------------------- */ + +/** Error code: no error */ +#define MSPACK_ERR_OK (0) +/** Error code: bad arguments to method */ +#define MSPACK_ERR_ARGS (1) +/** Error code: error opening file */ +#define MSPACK_ERR_OPEN (2) +/** Error code: error reading file */ +#define MSPACK_ERR_READ (3) +/** Error code: error writing file */ +#define MSPACK_ERR_WRITE (4) +/** Error code: seek error */ +#define MSPACK_ERR_SEEK (5) +/** Error code: out of memory */ +#define MSPACK_ERR_NOMEMORY (6) +/** Error code: bad "magic id" in file */ +#define MSPACK_ERR_SIGNATURE (7) +/** Error code: bad or corrupt file format */ +#define MSPACK_ERR_DATAFORMAT (8) +/** Error code: bad checksum or CRC */ +#define MSPACK_ERR_CHECKSUM (9) +/** Error code: error during compression */ +#define MSPACK_ERR_CRUNCH (10) +/** Error code: error during decompression */ +#define MSPACK_ERR_DECRUNCH (11) + +/* --- functions available in library -------------------------------------- */ + +/** Creates a new CAB compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mscab_compressor or NULL + */ +extern struct mscab_compressor * + mspack_create_cab_compressor(struct mspack_system *sys); + +/** Creates a new CAB decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mscab_decompressor or NULL + */ +extern struct mscab_decompressor * + mspack_create_cab_decompressor(struct mspack_system *sys); + +/** Destroys an existing CAB compressor. + * @param this the #mscab_compressor to destroy + */ +extern void mspack_destroy_cab_compressor(struct mscab_compressor *this); + +/** Destroys an existing CAB decompressor. + * @param this the #mscab_decompressor to destroy + */ +extern void mspack_destroy_cab_decompressor(struct mscab_decompressor *this); + + +/** Creates a new CHM compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mschm_compressor or NULL + */ +extern struct mschm_compressor * + mspack_create_chm_compressor(struct mspack_system *sys); + +/** Creates a new CHM decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mschm_decompressor or NULL + */ +extern struct mschm_decompressor * + mspack_create_chm_decompressor(struct mspack_system *sys); + +/** Destroys an existing CHM compressor. + * @param this the #mschm_compressor to destroy + */ +extern void mspack_destroy_chm_compressor(struct mschm_compressor *this); + +/** Destroys an existing CHM decompressor. + * @param this the #mschm_decompressor to destroy + */ +extern void mspack_destroy_chm_decompressor(struct mschm_decompressor *this); + + +/** Creates a new LIT compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mslit_compressor or NULL + */ +extern struct mslit_compressor * + mspack_create_lit_compressor(struct mspack_system *sys); + +/** Creates a new LIT decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mslit_decompressor or NULL + */ +extern struct mslit_decompressor * + mspack_create_lit_decompressor(struct mspack_system *sys); + +/** Destroys an existing LIT compressor. 
+ * @param this the #mslit_compressor to destroy + */ +extern void mspack_destroy_lit_compressor(struct mslit_compressor *this); + +/** Destroys an existing LIT decompressor. + * @param this the #mslit_decompressor to destroy + */ +extern void mspack_destroy_lit_decompressor(struct mslit_decompressor *this); + + +/** Creates a new HLP compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mshlp_compressor or NULL + */ +extern struct mshlp_compressor * + mspack_create_hlp_compressor(struct mspack_system *sys); + +/** Creates a new HLP decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mshlp_decompressor or NULL + */ +extern struct mshlp_decompressor * + mspack_create_hlp_decompressor(struct mspack_system *sys); + +/** Destroys an existing hlp compressor. + * @param this the #mshlp_compressor to destroy + */ +extern void mspack_destroy_hlp_compressor(struct mshlp_compressor *this); + +/** Destroys an existing hlp decompressor. + * @param this the #mshlp_decompressor to destroy + */ +extern void mspack_destroy_hlp_decompressor(struct mshlp_decompressor *this); + + +/** Creates a new SZDD compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msszdd_compressor or NULL + */ +extern struct msszdd_compressor * + mspack_create_szdd_compressor(struct mspack_system *sys); + +/** Creates a new SZDD decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msszdd_decompressor or NULL + */ +extern struct msszdd_decompressor * + mspack_create_szdd_decompressor(struct mspack_system *sys); + +/** Destroys an existing SZDD compressor. + * @param this the #msszdd_compressor to destroy + */ +extern void mspack_destroy_szdd_compressor(struct msszdd_compressor *this); + +/** Destroys an existing SZDD decompressor. + * @param this the #msszdd_decompressor to destroy + */ +extern void mspack_destroy_szdd_decompressor(struct msszdd_decompressor *this); + + +/** Creates a new KWAJ compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mskwaj_compressor or NULL + */ +extern struct mskwaj_compressor * + mspack_create_kwaj_compressor(struct mspack_system *sys); + +/** Creates a new KWAJ decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mskwaj_decompressor or NULL + */ +extern struct mskwaj_decompressor * + mspack_create_kwaj_decompressor(struct mspack_system *sys); + +/** Destroys an existing KWAJ compressor. + * @param this the #mskwaj_compressor to destroy + */ +extern void mspack_destroy_kwaj_compressor(struct mskwaj_compressor *this); + +/** Destroys an existing KWAJ decompressor. + * @param this the #mskwaj_decompressor to destroy + */ +extern void mspack_destroy_kwaj_decompressor(struct mskwaj_decompressor *this); + + +/* --- support for .CAB (MS Cabinet) file format --------------------------- */ + +/** + * A structure which represents a single cabinet file. + * + * All fields are READ ONLY. + * + * If this cabinet is part of a merged cabinet set, the #files and #folders + * fields are common to all cabinets in the set, and will be identical. + * + * @see mscab_decompressor::open(), mscab_decompressor::close(), + * mscab_decompressor::search() + */ +struct mscabd_cabinet { + /** + * The next cabinet in a chained list, if this cabinet was opened with + * mscab_decompressor::search(). 
May be NULL to mark the end of the + * list. + */ + struct mscabd_cabinet *next; + + /** + * The filename of the cabinet. More correctly, the filename of the + * physical file that the cabinet resides in. This is given by the + * library user and may be in any format. + */ + char *filename; + + /** The file offset of cabinet within the physical file it resides in. */ + off_t base_offset; + + /** The length of the cabinet file in bytes. */ + unsigned int length; + + /** The previous cabinet in a cabinet set, or NULL. */ + struct mscabd_cabinet *prevcab; + + /** The next cabinet in a cabinet set, or NULL. */ + struct mscabd_cabinet *nextcab; + + /** The filename of the previous cabinet in a cabinet set, or NULL. */ + char *prevname; + + /** The filename of the next cabinet in a cabinet set, or NULL. */ + char *nextname; + + /** The name of the disk containing the previous cabinet in a cabinet + * set, or NULL. + */ + char *previnfo; + + /** The name of the disk containing the next cabinet in a cabinet set, + * or NULL. + */ + char *nextinfo; + + /** A list of all files in the cabinet or cabinet set. */ + struct mscabd_file *files; + + /** A list of all folders in the cabinet or cabinet set. */ + struct mscabd_folder *folders; + + /** + * The set ID of the cabinet. All cabinets in the same set should have + * the same set ID. + */ + unsigned short set_id; + + /** + * The index number of the cabinet within the set. Numbering should + * start from 0 for the first cabinet in the set, and increment by 1 for + * each following cabinet. + */ + unsigned short set_index; + + /** + * The number of bytes reserved in the header area of the cabinet. + * + * If this is non-zero and flags has MSCAB_HDR_RESV set, this data can + * be read by the calling application. It is of the given length, + * located at offset (base_offset + MSCAB_HDR_RESV_OFFSET) in the + * cabinet file. + * + * @see flags + */ + unsigned short header_resv; + + /** + * Header flags. + * + * - MSCAB_HDR_PREVCAB indicates the cabinet is part of a cabinet set, and + * has a predecessor cabinet. + * - MSCAB_HDR_NEXTCAB indicates the cabinet is part of a cabinet set, and + * has a successor cabinet. + * - MSCAB_HDR_RESV indicates the cabinet has reserved header space. + * + * @see prevname, previnfo, nextname, nextinfo, header_resv + */ + int flags; +}; + +/** Offset from start of cabinet to the reserved header data (if present). */ +#define MSCAB_HDR_RESV_OFFSET (0x28) + +/** Cabinet header flag: cabinet has a predecessor */ +#define MSCAB_HDR_PREVCAB (0x01) +/** Cabinet header flag: cabinet has a successor */ +#define MSCAB_HDR_NEXTCAB (0x02) +/** Cabinet header flag: cabinet has reserved header space */ +#define MSCAB_HDR_RESV (0x04) + +/** + * A structure which represents a single folder in a cabinet or cabinet set. + * + * All fields are READ ONLY. + * + * A folder is a single compressed stream of data. When uncompressed, it + * holds the data of one or more files. A folder may be split across more + * than one cabinet. + */ +struct mscabd_folder { + /** + * A pointer to the next folder in this cabinet or cabinet set, or NULL + * if this is the final folder. + */ + struct mscabd_folder *next; + + /** + * The compression format used by this folder. + * + * The macro MSCABD_COMP_METHOD() should be used on this field to get + * the algorithm used. The macro MSCABD_COMP_LEVEL() should be used to get + * the "compression level". 
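+   *
+   * For example (a short sketch; the variable name "folder" is only
+   * illustrative):
+   * @code
+   * if (MSCABD_COMP_METHOD(folder->comp_type) == MSCAB_COMP_LZX) {
+   *   int level = MSCABD_COMP_LEVEL(folder->comp_type);
+   * }
+   * @endcode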
+ * + * @see MSCABD_COMP_METHOD(), MSCABD_COMP_LEVEL() + */ + int comp_type; + + /** + * The total number of data blocks used by this folder. This includes + * data blocks present in other files, if this folder spans more than + * one cabinet. + */ + unsigned int num_blocks; +}; + +/** + * Returns the compression method used by a folder. + * + * @param comp_type a mscabd_folder::comp_type value + * @return one of #MSCAB_COMP_NONE, #MSCAB_COMP_MSZIP, #MSCAB_COMP_QUANTUM + * or #MSCAB_COMP_LZX + */ +#define MSCABD_COMP_METHOD(comp_type) ((comp_type) & 0x0F) +/** + * Returns the compression level used by a folder. + * + * @param comp_type a mscabd_folder::comp_type value + * @return the compression level. This is only defined by LZX and Quantum + * compression + */ +#define MSCABD_COMP_LEVEL(comp_type) (((comp_type) >> 8) & 0x1F) + +/** Compression mode: no compression. */ +#define MSCAB_COMP_NONE (0) +/** Compression mode: MSZIP (deflate) compression. */ +#define MSCAB_COMP_MSZIP (1) +/** Compression mode: Quantum compression */ +#define MSCAB_COMP_QUANTUM (2) +/** Compression mode: LZX compression */ +#define MSCAB_COMP_LZX (3) + +/** + * A structure which represents a single file in a cabinet or cabinet set. + * + * All fields are READ ONLY. + */ +struct mscabd_file { + /** + * The next file in the cabinet or cabinet set, or NULL if this is the + * final file. + */ + struct mscabd_file *next; + + /** + * The filename of the file. + * + * A null terminated string of up to 255 bytes in length, it may be in + * either ISO-8859-1 or UTF8 format, depending on the file attributes. + * + * @see attribs + */ + char *filename; + + /** The uncompressed length of the file, in bytes. */ + unsigned int length; + + /** + * File attributes. + * + * The following attributes are defined: + * - #MSCAB_ATTRIB_RDONLY indicates the file is write protected. + * - #MSCAB_ATTRIB_HIDDEN indicates the file is hidden. + * - #MSCAB_ATTRIB_SYSTEM indicates the file is a operating system file. + * - #MSCAB_ATTRIB_ARCH indicates the file is "archived". + * - #MSCAB_ATTRIB_EXEC indicates the file is an executable program. + * - #MSCAB_ATTRIB_UTF_NAME indicates the filename is in UTF8 format rather + * than ISO-8859-1. + */ + int attribs; + + /** File's last modified time, hour field. */ + char time_h; + /** File's last modified time, minute field. */ + char time_m; + /** File's last modified time, second field. */ + char time_s; + + /** File's last modified date, day field. */ + char date_d; + /** File's last modified date, month field. */ + char date_m; + /** File's last modified date, year field. */ + int date_y; + + /** A pointer to the folder that contains this file. */ + struct mscabd_folder *folder; + + /** The uncompressed offset of this file in its folder. */ + unsigned int offset; +}; + +/** mscabd_file::attribs attribute: file is read-only. */ +#define MSCAB_ATTRIB_RDONLY (0x01) +/** mscabd_file::attribs attribute: file is hidden. */ +#define MSCAB_ATTRIB_HIDDEN (0x02) +/** mscabd_file::attribs attribute: file is an operating system file. */ +#define MSCAB_ATTRIB_SYSTEM (0x04) +/** mscabd_file::attribs attribute: file is "archived". */ +#define MSCAB_ATTRIB_ARCH (0x20) +/** mscabd_file::attribs attribute: file is an executable program. */ +#define MSCAB_ATTRIB_EXEC (0x40) +/** mscabd_file::attribs attribute: filename is UTF8, not ISO-8859-1. */ +#define MSCAB_ATTRIB_UTF_NAME (0x80) + +/** mscab_decompressor::set_param() parameter: search buffer size. 
*/ +#define MSCABD_PARAM_SEARCHBUF (0) +/** mscab_decompressor::set_param() parameter: repair MS-ZIP streams? */ +#define MSCABD_PARAM_FIXMSZIP (1) +/** mscab_decompressor::set_param() parameter: size of decompression buffer */ +#define MSCABD_PARAM_DECOMPBUF (2) + +/** TODO */ +struct mscab_compressor { + int dummy; +}; + +/** + * A decompressor for .CAB (Microsoft Cabinet) files + * + * All fields are READ ONLY. + * + * @see mspack_create_cab_decompressor(), mspack_destroy_cab_decompressor() + */ +struct mscab_decompressor { + /** + * Opens a cabinet file and reads its contents. + * + * If the file opened is a valid cabinet file, all headers will be read + * and a mscabd_cabinet structure will be returned, with a full list of + * folders and files. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the cabinet. + * + * @param this a self-referential pointer to the mscab_decompressor + * instance being called + * @param filename the filename of the cabinet file. This is passed + * directly to mspack_system::open(). + * @return a pointer to a mscabd_cabinet structure, or NULL on failure + * @see close(), search(), last_error() + */ + struct mscabd_cabinet * (*open) (struct mscab_decompressor *this, + char *filename); + + /** + * Closes a previously opened cabinet or cabinet set. + * + * This closes a cabinet, all cabinets associated with it via the + * mscabd_cabinet::next, mscabd_cabinet::prevcab and + * mscabd_cabinet::nextcab pointers, and all folders and files. All + * memory used by these entities is freed. + * + * The cabinet pointer is now invalid and cannot be used again. All + * mscabd_folder and mscabd_file pointers from that cabinet or cabinet + * set are also now invalid, and cannot be used again. + * + * If the cabinet pointer given was created using search(), it MUST be + * the cabinet pointer returned by search() and not one of the later + * cabinet pointers further along the mscabd_cabinet::next chain. + + * If extra cabinets have been added using append() or prepend(), these + * will all be freed, even if the cabinet pointer given is not the first + * cabinet in the set. Do NOT close() more than one cabinet in the set. + * + * The mscabd_cabinet::filename is not freed by the library, as it is + * not allocated by the library. The caller should free this itself if + * necessary, before it is lost forever. + * + * @param this a self-referential pointer to the mscab_decompressor + * instance being called + * @param cab the cabinet to close + * @see open(), search(), append(), prepend() + */ + void (*close)(struct mscab_decompressor *this, + struct mscabd_cabinet *cab); + + /** + * Searches a regular file for embedded cabinets. + * + * This opens a normal file with the given filename and will search the + * entire file for embedded cabinet files + * + * If any cabinets are found, the equivalent of open() is called on each + * potential cabinet file at the offset it was found. All successfully + * open()ed cabinets are kept in a list. + * + * The first cabinet found will be returned directly as the result of + * this method. Any further cabinets found will be chained in a list + * using the mscabd_cabinet::next field. + * + * In the case of an error occuring anywhere other than the simulated + * open(), NULL is returned and the error code is available from + * last_error(). 
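+   *
+   * A minimal sketch of walking the cabinets found (cabd is assumed to be
+   * a previously created mscab_decompressor, and the filename is only an
+   * example):
+   * @code
+   * struct mscabd_cabinet *cab = cabd->search(cabd, "installer.exe");
+   * while (cab) {
+   *   ... each cab in the chain is one embedded cabinet ...
+   *   cab = cab->next;
+   * }
+   * @endcode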
+ * + * If no error occurs, but no cabinets can be found in the file, NULL is + * returned and last_error() returns MSPACK_ERR_OK. + * + * The filename pointer should be considered in use until close() is + * called on the cabinet. + * + * close() should only be called on the result of search(), not on any + * subsequent cabinets in the mscabd_cabinet::next chain. + * + * @param this a self-referential pointer to the mscab_decompressor + * instance being called + * @param filename the filename of the file to search for cabinets. This + * is passed directly to mspack_system::open(). + * @return a pointer to a mscabd_cabinet structure, or NULL + * @see close(), open(), last_error() + */ + struct mscabd_cabinet * (*search) (struct mscab_decompressor *this, + char *filename); + + /** + * Appends one mscabd_cabinet to another, forming or extending a cabinet + * set. + * + * This will attempt to append one cabinet to another such that + * (cab->nextcab == nextcab) && (nextcab->prevcab == cab) and + * any folders split between the two cabinets are merged. + * + * The cabinets MUST be part of a cabinet set -- a cabinet set is a + * cabinet that spans more than one physical cabinet file on disk -- and + * must be appropriately matched. + * + * It can be determined if a cabinet has further parts to load by + * examining the mscabd_cabinet::flags field: + * + * - if (flags & MSCAB_HDR_PREVCAB) is non-zero, there is a + * predecessor cabinet to open() and prepend(). Its MS-DOS + * case-insensitive filename is mscabd_cabinet::prevname + * - if (flags & MSCAB_HDR_NEXTCAB) is non-zero, there is a + * successor cabinet to open() and append(). Its MS-DOS case-insensitive + * filename is mscabd_cabinet::nextname + * + * If the cabinets do not match, an error code will be returned. Neither + * cabinet has been altered, and both should be closed seperately. + * + * Files and folders in a cabinet set are a single entity. All cabinets + * in a set use the same file list, which is updated as cabinets in the + * set are added. All pointers to mscabd_folder and mscabd_file + * structures in either cabinet must be discarded and re-obtained after + * merging. + * + * @param this a self-referential pointer to the mscab_decompressor + * instance being called + * @param cab the cabinet which will be appended to, + * predecessor of nextcab + * @param nextcab the cabinet which will be appended, + * successor of cab + * @return an error code, or MSPACK_ERR_OK if successful + * @see prepend(), open(), close() + */ + int (*append) (struct mscab_decompressor *this, + struct mscabd_cabinet *cab, + struct mscabd_cabinet *nextcab); + + /** + * Prepends one mscabd_cabinet to another, forming or extending a + * cabinet set. + * + * This will attempt to prepend one cabinet to another, such that + * (cab->prevcab == prevcab) && (prevcab->nextcab == cab). In + * all other respects, it is identical to append(). See append() for the + * full documentation. + * + * @param this a self-referential pointer to the mscab_decompressor + * instance being called + * @param cab the cabinet which will be prepended to, + * successor of prevcab + * @param prevcab the cabinet which will be prepended, + * predecessor of cab + * @return an error code, or MSPACK_ERR_OK if successful + * @see append(), open(), close() + */ + int (*prepend) (struct mscab_decompressor *this, + struct mscabd_cabinet *cab, + struct mscabd_cabinet *prevcab); + + /** + * Extracts a file from a cabinet or cabinet set. 
+ * + * This extracts a compressed file in a cabinet and writes it to the given + * filename. + * + * The MS-DOS filename of the file, mscabd_file::filename, is NOT USED + * by extract(). The caller must examine this MS-DOS filename, copy and + * change it as necessary, create directories as necessary, and provide + * the correct filename as a parameter, which will be passed unchanged + * to the decompressor's mspack_system::open() + * + * If the file belongs to a split folder in a multi-part cabinet set, + * and not enough parts of the cabinet set have been loaded and appended + * or prepended, an error will be returned immediately. + * + * @param this a self-referential pointer to the mscab_decompressor + * instance being called + * @param file the file to be decompressed + * @param filename the filename of the file being written to + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct mscab_decompressor *this, + struct mscabd_file *file, + char *filename); + + /** + * Sets a CAB decompression engine parameter. + * + * The following parameters are defined: + * - #MSCABD_PARAM_SEARCHBUF: How many bytes should be allocated as a + * buffer when using search()? The minimum value is 4. The default + * value is 32768. + * - #MSCABD_PARAM_FIXMSZIP: If non-zero, extract() will ignore bad + * checksums and recover from decompression errors in MS-ZIP + * compressed folders. The default value is 0 (don't recover). + * - #MSCABD_PARAM_DECOMPBUF: How many bytes should be used as an input + * bit buffer by decompressors? The minimum value is 4. The default + * value is 4096. + * + * @param this a self-referential pointer to the mscab_decompressor + * instance being called + * @param param the parameter to set + * @param value the value to set the parameter to + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if there + * is a problem with either parameter or value. + * @see search(), extract() + */ + int (*set_param)(struct mscab_decompressor *this, + int param, + int value); + + /** + * Returns the error code set by the most recently called method. + * + * This is useful for open() and search(), which do not return an error + * code directly. + * + * @param this a self-referential pointer to the mscab_decompressor + * instance being called + * @return the most recent error code + * @see open(), search() + */ + int (*last_error)(struct mscab_decompressor *); +}; + +/* --- support for .CHM (HTMLHelp) file format ----------------------------- */ + +/** + * A structure which represents a section of a CHM helpfile. + * + * All fields are READ ONLY. + * + * Not used directly, but used as a generic base type for + * mschmd_sec_uncompressed and mschmd_sec_mscompressed. + */ +struct mschmd_section { + /** A pointer to the CHM helpfile that contains this section. */ + struct mschmd_header *chm; + + /** + * The section ID. Either 0 for the uncompressed section + * mschmd_sec_uncompressed, or 1 for the LZX compressed section + * mschmd_sec_mscompressed. No other section IDs are known. + */ + unsigned int id; +}; + +/** + * A structure which represents the uncompressed section of a CHM helpfile. + * + * All fields are READ ONLY. + */ +struct mschmd_sec_uncompressed { + /** Generic section data. */ + struct mschmd_section base; + + /** The file offset of where this section begins in the CHM helpfile. */ + off_t offset; +}; + +/** + * A structure which represents the compressed section of a CHM helpfile. + * + * All fields are READ ONLY. 
+ */ +struct mschmd_sec_mscompressed { + /** Generic section data. */ + struct mschmd_section base; + + /** A pointer to the meta-file which represents all LZX compressed data. */ + struct mschmd_file *content; + + /** A pointer to the file which contains the LZX control data. */ + struct mschmd_file *control; + + /** A pointer to the file which contains the LZX reset table. */ + struct mschmd_file *rtable; +}; + +/** + * A structure which represents a CHM helpfile. + * + * All fields are READ ONLY. + */ +struct mschmd_header { + /** The version of the CHM file format used in this file. */ + unsigned int version; + + /** + * The "timestamp" of the CHM helpfile. + * + * It is the lower 32 bits of a 64-bit value representing the number of + * centiseconds since 1601-01-01 00:00:00 UTC, plus 42. It is not useful + * as a timestamp, but it is useful as a semi-unique ID. + */ + unsigned int timestamp; + + + /** + * The default Language and Country ID (LCID) of the user who ran the + * HTMLHelp Compiler. This is not the language of the CHM file itself. + */ + unsigned int language; + + /** + * The filename of the CHM helpfile. This is given by the library user + * and may be in any format. + */ + char *filename; + + /** The length of the CHM helpfile, in bytes. */ + off_t length; + + /** A list of all non-system files in the CHM helpfile. */ + struct mschmd_file *files; + + /** + * A list of all system files in the CHM helpfile. + * + * System files are files which begin with "::". They are meta-files + * generated by the CHM creation process. + */ + struct mschmd_file *sysfiles; + + /** The section 0 (uncompressed) data in this CHM helpfile. */ + struct mschmd_sec_uncompressed sec0; + + /** The section 1 (MSCompressed) data in this CHM helpfile. */ + struct mschmd_sec_mscompressed sec1; + + /** The file offset of the first PMGL/PMGI directory chunk. */ + off_t dir_offset; + + /** The number of PMGL/PMGI directory chunks in this CHM helpfile. */ + unsigned int num_chunks; + + /** The size of each PMGL/PMGI chunk, in bytes. */ + unsigned int chunk_size; + + /** The "density" of the quick-reference section in PMGL/PMGI chunks. */ + unsigned int density; + + /** The depth of the index tree. + * + * - if 1, there are no PMGI chunks, only PMGL chunks. + * - if 2, there is 1 PMGI chunk. All chunk indices point to PMGL chunks. + * - if 3, the root PMGI chunk points to secondary PMGI chunks, which in + * turn point to PMGL chunks. + * - and so on... + */ + unsigned int depth; + + /** + * The number of the root PGMI chunk. + * + * If there is no index in the CHM helpfile, this will be 0xFFFFFFFF. + */ + unsigned int index_root; +}; + +/** + * A structure which represents a file stored in a CHM helpfile. + * + * All fields are READ ONLY. + */ +struct mschmd_file { + /** + * A pointer to the next file in the list, or NULL if this is the final + * file. + */ + struct mschmd_file *next; + + /** + * A pointer to the section that this file is located in. Indirectly, + * it also points to the CHM helpfile the file is located in. + */ + struct mschmd_section *section; + + /** The offset within the section data that this file is located at. */ + off_t offset; + + /** The length of this file, in bytes */ + off_t length; + + /** The filename of this file -- a null terminated string in UTF8. */ + char *filename; +}; + +/** TODO */ +struct mschm_compressor { + int dummy; +}; + +/** + * A decompressor for .CHM (Microsoft HTMLHelp) files + * + * All fields are READ ONLY. 
+ * + * @see mspack_create_chm_decompressor(), mspack_destroy_chm_decompressor() + */ +struct mschm_decompressor { + /** + * Opens a CHM helpfile and reads its contents. + * + * If the file opened is a valid CHM helpfile, all headers will be read + * and a mschmd_header structure will be returned, with a full list of + * files. + * + * In the case of an error occurring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the CHM helpfile. + * + * @param this a self-referential pointer to the mschm_decompressor + * instance being called + * @param filename the filename of the CHM helpfile. This is passed + * directly to mspack_system::open(). + * @return a pointer to a mschmd_header structure, or NULL on failure + * @see close() + */ + struct mschmd_header *(*open)(struct mschm_decompressor *this, + char *filename); + + /** + * Closes a previously opened CHM helpfile. + * + * This closes a CHM helpfile, frees the mschmd_header and all + * mschmd_file structures associated with it (if any). This works on + * both helpfiles opened with open() and helpfiles opened with + * fast_open(). + * + * The CHM header pointer is now invalid and cannot be used again. All + * mschmd_file pointers referencing that CHM are also now invalid, and + * cannot be used again. + * + * @param this a self-referential pointer to the mschm_decompressor + * instance being called + * @param chm the CHM helpfile to close + * @see open(), fast_open() + */ + void (*close)(struct mschm_decompressor *this, + struct mschmd_header *chm); + + /** + * Extracts a file from a CHM helpfile. + * + * This extracts a file from a CHM helpfile and writes it to the given + * filename. The filename of the file, mschmd_file::filename, is not + * used by extract(), but can be used by the caller as a guide for + * constructing an appropriate filename. + * + * This method works both with files found in the mschmd_header::files + * and mschmd_header::sysfiles list and mschmd_file structures generated + * on the fly by fast_find(). + * + * @param this a self-referential pointer to the mschm_decompressor + * instance being called + * @param file the file to be decompressed + * @param filename the filename of the file being written to + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct mschm_decompressor *this, + struct mschmd_file *file, + char *filename); + + /** + * Returns the error code set by the most recently called method. + * + * This is useful for open() and fast_open(), which do not return an + * error code directly. + * + * @param this a self-referential pointer to the mschm_decompressor + * instance being called + * @return the most recent error code + * @see open(), fast_open() + */ + int (*last_error)(struct mschm_decompressor *this); + + /** + * Opens a CHM helpfile quickly. + * + * If the file opened is a valid CHM helpfile, only essential headers + * will be read. A mschmd_header structure will still be returned, as + * with open(), but the mschmd_header::files field will be NULL. No + * file details will be automatically read. The fast_find() method + * must be used to obtain file details. + * + * In the case of an error occurring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the CHM helpfile.
+ * + * @param this a self-referential pointer to the mschm_decompressor + * instance being called + * @param filename the filename of the CHM helpfile. This is passed + * directly to mspack_system::open(). + * @return a pointer to a mschmd_header structure, or NULL on failure + * @see open(), close(), fast_find(), extract() + */ + struct mschmd_header *(*fast_open)(struct mschm_decompressor *this, + char *filename); + + /** + * Finds file details quickly. + * + * Instead of reading all CHM helpfile headers and building a list of + * files, fast_open() and fast_find() are intended for finding file + * details only when they are needed. The CHM file format includes an + * on-disk file index to allow this. + * + * Given a case-sensitive filename, fast_find() will search the on-disk + * index for that file. + * + * If the file was found, the caller-provided mschmd_file structure will + * be filled out like so: + * - section: the correct value for the found file + * - offset: the correct value for the found file + * - length: the correct value for the found file + * - all other structure elements: NULL or 0 + * + * If the file was not found, MSPACK_ERR_OK will still be returned as the + * result, but the caller-provided structure will be filled out like so: + * - section: NULL + * - offset: 0 + * - length: 0 + * - all other structure elements: NULL or 0 + * + * This method is intended to be used in conjunction with CHM helpfiles + * opened with fast_open(), but it also works with helpfiles opened + * using the regular open(). + * + * @param this a self-referential pointer to the mschm_decompressor + * instance being called + * @param chm the CHM helpfile to search for the file + * @param filename the filename of the file to search for + * @param f_ptr a pointer to a caller-provded mschmd_file structure + * @param f_size sizeof(struct mschmd_file) + * @return MSPACK_ERR_OK, or an error code + * @see open(), close(), fast_find(), extract() + */ + int (*fast_find)(struct mschm_decompressor *this, + struct mschmd_header *chm, + char *filename, + struct mschmd_file *f_ptr, + int f_size); +}; + +/* --- support for .LIT (EBook) file format -------------------------------- */ + +/** TODO */ +struct mslit_compressor { + int dummy; +}; + +/** TODO */ +struct mslit_decompressor { + int dummy; +}; + + +/* --- support for .HLP (MS Help) file format ------------------------------ */ + +/** TODO */ +struct mshlp_compressor { + int dummy; +}; + +/** TODO */ +struct mshlp_decompressor { + int dummy; +}; + + +/* --- support for SZDD file format ---------------------------------------- */ + +/** TODO */ +struct msszdd_compressor { + int dummy; +}; + +/** TODO */ +struct msszdd_decompressor { + int dummy; +}; + +/* --- support for KWAJ file format ---------------------------------------- */ + +/** TODO */ +struct mskwaj_compressor { + int dummy; +}; + +/** TODO */ +struct mskwaj_decompressor { + int dummy; +}; + +#ifdef __cplusplus +}; +#endif + +#endif diff --git a/src/calibre/utils/lzx/system.h b/src/calibre/utils/lzx/system.h new file mode 100644 index 0000000000..acc7d23f56 --- /dev/null +++ b/src/calibre/utils/lzx/system.h @@ -0,0 +1,66 @@ +/* This file is part of libmspack. + * (C) 2003-2004 Stuart Caie. 
+ * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * For further details, see the file COPYING.LIB distributed with libmspack + */ + +#ifndef MSPACK_SYSTEM_H +#define MSPACK_SYSTEM_H 1 + +#ifdef _MSC_VER +#define inline +#endif + +#ifdef DEBUG +# include <stdio.h> +# define D(x) do { printf("%s:%d (%s) ",__FILE__, __LINE__, __FUNCTION__); \ + printf x ; fputc('\n', stdout); fflush(stdout);} while (0); +#else +# define D(x) +#endif + +/* endian-neutral reading of little-endian data */ +#define __egi32(a,n) ( (((a)[n+3]) << 24) | (((a)[n+2]) << 16) | \ + (((a)[n+1]) << 8) | ((a)[n+0]) ) +#define EndGetI64(a) ((((unsigned long long int) __egi32(a,4)) << 32) | \ + ((unsigned int) __egi32(a,0))) +#define EndGetI32(a) __egi32(a,0) +#define EndGetI16(a) ((((a)[1])<<8)|((a)[0])) + +/* endian-neutral reading of big-endian data */ +#define EndGetM32(a) ((((a)[0])<<24)|(((a)[1])<<16)|(((a)[2])<<8)|((a)[3])) +#define EndGetM16(a) ((((a)[0])<<8)|((a)[1])) + +extern struct mspack_system *mspack_default_system; + +/* returns the length of a file opened for reading */ +extern int mspack_sys_filelen(struct mspack_system *system, + struct mspack_file *file, off_t *length); + +/* validates a system structure */ +extern int mspack_valid_system(struct mspack_system *sys); + +/* Can't redefine intrinsics in Microsoft Visual C */ +#ifndef _MSC_VER + +/* inline memcmp() */ +static inline int memcmp(const void *s1, const void *s2, size_t n) { + unsigned char *c1 = (unsigned char *) s1; + unsigned char *c2 = (unsigned char *) s2; + if (n == 0) return 0; + while (--n && (*c1 == *c2)) c1++, c2++; + return *c1 - *c2; +} + +/* inline strlen() */ +static inline size_t strlen(const char *s) { + const char *e = s; + while (*e) e++; + return e - s; +} +#endif + +#endif From 1367ba58f3dba20a1221888af2e3912320db6a0f Mon Sep 17 00:00:00 2001 From: "Marshall T.
Vandegrift" Date: Fri, 18 Jul 2008 18:03:28 -0400 Subject: [PATCH 09/19] Section decompression working --- src/calibre/ebooks/lit/reader.py | 102 ++++++++++++++---- src/calibre/utils/lzx/lzxglue.c | 172 ------------------------------ src/calibre/utils/lzx/lzxmodule.c | 7 +- 3 files changed, 90 insertions(+), 191 deletions(-) delete mode 100644 src/calibre/utils/lzx/lzxglue.c diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 2608d63399..9963e14bf2 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -15,13 +15,14 @@ from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 import calibre.ebooks.lit.msdes as msdes +import calibre.utils.lzx as lzx OPF_DECL = """" """ -XHTML_DECL = """ +HTML_DECL = """ @@ -30,6 +31,14 @@ XHTML_DECL = """ DESENCRYPT_GUID = "{67F6E4A2-60BF-11D3-8540-00C04F58C3CF}" LZXCOMPRESS_GUID = "{0A9007C6-4076-11D3-8789-0000F8105754}" +LZXC_TAG = 0x43585a4c +CONTROL_TAG = 4 +CONTROL_WINDOW_SIZE = 12 +RESET_NENTRIES = 4 +RESET_HDRLEN = 12 +RESET_UCLENGTH = 16 +RESET_INTERVAL = 32 + def u32(bytes): return struct.unpack('= 16: - ndwords = int32(control[idx_control:]) + 1 - if (idx_control + (ndwords * 4)) > len(control) or ndwords <= 0: + while len(transform) >= 16: + csize = (int32(control) + 1) * 4 + if csize > len(control) or csize <= 0: raise LitError("ControlData is too short") - guid = msguid(transform[idx_transform:]) + guid = msguid(transform) if guid == DESENCRYPT_GUID: content = self._decrypt(content) - idx_control += ndwords * 4 + control = control[csize:] elif guid == LZXCOMPRESS_GUID: - raise LitError("LZX decompression not implemented") + content = self._decompress_section(name, control, content) + control = control[csize:] else: raise LitError("Unrecognized transform: %s." 
% repr(guid)) - idx_transform += 16 + transform = transform[16:] return content def _decrypt(self, content): @@ -685,6 +698,59 @@ class LitFile(object): raise LitError('Cannot extract content from a DRM protected ebook') return msdes.new(self.bookkey).decrypt(content) + def _decompress_section(self, name, control, content): + if len(control) < 32 or u32(control[CONTROL_TAG:]) != LZXC_TAG: + raise LitError("Invalid ControlData tag value") + result = [] + + window_size = 14 + u = u32(control[CONTROL_WINDOW_SIZE:]) + while u > 0: + u >>= 1 + window_size += 1 + if window_size < 15 or window_size > 21: + raise LitError("Invalid window in ControlData") + lzx.init(window_size) + + reset_table = self.get_file('/'.join( + ['::DataSpace/Storage', name, 'Transform', + LZXCOMPRESS_GUID, 'InstanceData/ResetTable'])) + if len(reset_table) < (RESET_INTERVAL + 8): + raise LitError("Reset table is too short") + if u32(reset_table[RESET_UCLENGTH + 4:]) != 0: + raise LitError("Reset table has 64bit value for UCLENGTH") + ofs_entry = int32(reset_table[RESET_HDRLEN:]) + 8 + uclength = int32(reset_table[RESET_UCLENGTH:]) + accum = int32(reset_table[RESET_INTERVAL:]) + bytes_remaining = uclength + window_bytes = (1 << window_size) + base = 0 + + while ofs_entry < len(reset_table): + if accum >= window_bytes: + accum = 0 + size = int32(reset_table[ofs_entry:]) + u = int32(reset_table[ofs_entry + 4:]) + if u != 0: + raise LitError("Reset table entry greater than 32 bits") + if size >= (len(content) + base): + raise("Reset table entry out of bounds") + if bytes_remaining >= window_bytes: + lzx.reset() + result.append(lzx.decompress(content, window_bytes)) + bytes_remaining -= window_bytes + content = content[size - base:] + base = size + accum += int32(reset_table[RESET_INTERVAL:]) + ofs_entry += 8 + if bytes_remaining < window_bytes and bytes_remaining > 0: + lzx.reset() + result.append(lzx.decompress(content, bytes_remaining)) + bytes_remaining = 0 + if bytes_remaining > 0: + raise LitError("Failed to completely decompress section") + return ''.join(result) + def get_metadata(stream): try: litfile = LitFile(stream) @@ -693,7 +759,7 @@ def get_metadata(stream): cover_url, cover_item = mi.cover, None if cover_url: cover_url = relpath(cover_url, os.getcwd()) - for item in litfile.manifest: + for item in litfile.manifest.values(): if item.path == cover_url: cover_item = item.internal if cover_item is not None: diff --git a/src/calibre/utils/lzx/lzxglue.c b/src/calibre/utils/lzx/lzxglue.c deleted file mode 100644 index 7820c68cbf..0000000000 --- a/src/calibre/utils/lzx/lzxglue.c +++ /dev/null @@ -1,172 +0,0 @@ -/*--[lzxglue.c]---------------------------------------------------------------- - | Copyright (C) 2004 DRS - | - | This file is part of the "openclit" library for processing .LIT files. - | - | "Openclit" is free software; you can redistribute it and/or modify - | it under the terms of the GNU General Public License as published by - | the Free Software Foundation; either version 2 of the License, or - | (at your option) any later version. - | - | This program is distributed in the hope that it will be useful, - | but WITHOUT ANY WARRANTY; without even the implied warranty of - | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - | GNU General Public License for more details. - | - | You should have received a copy of the GNU General Public License - | along with this program; if not, write to the Free Software - | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- | - | The GNU General Public License may also be available at the following - | URL: http://www.gnu.org/licenses/gpl.html -*/ - -/* This provides a "glue" between Stuart Caie's libmspack library and the - * Openclit calls to the earlier LZX library. - * - * This way, I should be able to use the files unmodified. - */ -#include -#include -#include "litlib.h" -#include "mspack.h" -#include "lzx.h" - -typedef struct memory_file -{ - unsigned int magic; /* 0xB5 */ - void * buffer; - int total_bytes; - int current_bytes; -} memory_file; - - -void * glue_alloc(struct mspack_system *this, size_t bytes) -{ - void * p; - p = (void *)malloc(bytes); - if (p == NULL) { - lit_error(ERR_R|ERR_LIBC,"Malloc(%d) failed!", bytes); - } - return p; -} - -void glue_free(void * p) -{ - free(p); -} - -void glue_copy(void *src, void *dest, size_t bytes) -{ - memcpy(dest, src, bytes); -} - -struct mspack_file * glue_open(struct mspack_system *this, char *filename, - int mode) -{ - lit_error(0,"MSPACK_OPEN unsupported!"); - return NULL; -} - -void glue_close(struct mspack_file * file) { - return; -} - - -int glue_read(struct mspack_file * file, void * buffer, int bytes) -{ - memory_file * mem; - int remaining; - - mem = (memory_file *)file; - if (mem->magic != 0xB5) return -1; - - remaining = mem->total_bytes - mem->current_bytes; - if (!remaining) return 0; - if (bytes > remaining) bytes = remaining; - memcpy(buffer, (unsigned char *)mem->buffer+mem->current_bytes, bytes); - mem->current_bytes += bytes; - return bytes; -} - -int glue_write(struct mspack_file * file, void * buffer, int bytes) -{ - memory_file * mem; - int remaining; - - mem = (memory_file *)file; - if (mem->magic != 0xB5) return -1; - - remaining = mem->total_bytes - mem->current_bytes; - if (!remaining) return 0; - if (bytes > remaining) { - lit_error(0,"MSPACK_READ tried to write %d bytes, only %d left.", - bytes, remaining); - bytes = remaining; - } - memcpy((unsigned char *)mem->buffer+mem->current_bytes, buffer, bytes); - mem->current_bytes += bytes; - return bytes; -} - -struct mspack_system lzxglue_system = -{ - glue_open, - glue_close, - glue_read, /* Read */ - glue_write, /* Write */ - NULL, /* Seek */ - NULL, /* Tell */ - NULL, /* Message */ - glue_alloc, - glue_free, - glue_copy, - NULL /* Termination */ -}; - -int LZXwindow; -struct lzxd_stream * lzx_stream = NULL; - - -/* Can't really init here,don't know enough */ -int LZXinit(int window) -{ - LZXwindow = window; - lzx_stream = NULL; - - return 0; -} - -/* Doesn't exist. 
Oh well, reinitialize state every time anyway */ -void LZXreset(void) -{ - return; -} - -int LZXdecompress(unsigned char *inbuf, unsigned char *outbuf, - unsigned int inlen, unsigned int outlen) -{ - int err; - memory_file source; - memory_file dest; - - source.magic = 0xB5; - source.buffer = inbuf; - source.current_bytes = 0; - source.total_bytes = inlen; - - dest.magic = 0xB5; - dest.buffer = outbuf; - dest.current_bytes = 0; - dest.total_bytes = outlen; - - lzx_stream = lzxd_init(&lzxglue_system, (struct mspack_file *)&source, - (struct mspack_file *)&dest, LZXwindow, - 0x7fff /* Never reset, I do it */, 4096, outlen); - err = -1; - if (lzx_stream) err = lzxd_decompress(lzx_stream, outlen); - - lzxd_free(lzx_stream); - lzx_stream = NULL; - return err; -} diff --git a/src/calibre/utils/lzx/lzxmodule.c b/src/calibre/utils/lzx/lzxmodule.c index 44cc91c11d..bf8a48a056 100644 --- a/src/calibre/utils/lzx/lzxmodule.c +++ b/src/calibre/utils/lzx/lzxmodule.c @@ -199,8 +199,13 @@ initlzx(void) PyObject *m; m = Py_InitModule3("lzx", lzx_methods, lzx_doc); - if (m == NULL) return; + if (m == NULL) { + return; + } + LzxError = PyErr_NewException("lzx.LzxError", NULL, NULL); Py_INCREF(LzxError); PyModule_AddObject(m, "LzxError", LzxError); + + return; } From 3737fd3e13c380bcfda7b9d54d7ee012547d401e Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 19 Jul 2008 14:51:31 -0400 Subject: [PATCH 10/19] Added path clean-up and basic extraction method. --- src/calibre/ebooks/lit/reader.py | 200 ++++++++++++++++++------------- 1 file changed, 114 insertions(+), 86 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 9963e14bf2..afe5d96297 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -1,8 +1,10 @@ +''' +Support for reading LIT files. +''' +from __future__ import with_statement + __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -''' -Support for reading the metadata from a lit file. 
-''' import sys, struct, cStringIO, os import functools @@ -39,6 +41,13 @@ RESET_HDRLEN = 12 RESET_UCLENGTH = 16 RESET_INTERVAL = 32 +FLAG_OPENING = 1 +FLAG_CLOSING = 2 +FLAG_BLOCK = 4 +FLAG_HEAD = 8 +FLAG_ATOM = 16 +XML_ENTITIES = ['&', ''', '<', '>', '"'] + def u32(bytes): return struct.unpack('= len(raw): - raise LitError('Truncated manifest.') + while raw: + slen, raw = ord(raw[0]), raw[1:] + if slen == 0: break + root, raw = raw[:slen].decode('utf8'), raw[slen:] + if not raw: + raise LitError('Truncated manifest') for state in ['spine', 'not spine', 'css', 'images']: - num_files = int32(raw[pos:pos+4]) - pos += 4 + num_files, raw = int32(raw), raw[4:] if num_files == 0: continue - - i = 0 - while i < num_files: - if pos+5 >= len(raw): - raise LitError('Truncated manifest.') - offset = u32(raw[pos:pos+4]) - pos += 4 - - slen = ord(raw[pos]) - pos += 1 - internal = raw[pos:pos+slen].decode('utf8') - pos += slen - - slen = ord(raw[pos]) - pos += 1 - original = raw[pos:pos+slen].decode('utf8') - pos += slen - - slen = ord(raw[pos]) - pos += 1 - mime_type = raw[pos:pos+slen].decode('utf8') - pos += slen + 1 - - self.manifest[internal] = \ - ManifestItem(original, internal, mime_type, - offset, root, state) - i += 1 + for i in xrange(num_files): + if len(raw) < 5: + raise LitError('Truncated manifest') + offset, raw = u32(raw), raw[4:] + slen, raw = ord(raw[0]), raw[1:] + internal, raw = raw[:slen].decode('utf8'), raw[slen:] + slen, raw = ord(raw[0]), raw[1:] + original, raw = raw[:slen].decode('utf8'), raw[slen:] + slen, raw = ord(raw[0]), raw[1:] + mime_type, raw = raw[:slen].decode('utf8'), raw[slen+1:] + self.manifest[internal] = ManifestItem( + original, internal, mime_type, offset, root, state) + mlist = self.manifest.values() + shared = mlist[0].path + for item in mlist[1:]: + path = item.path + while not path.startswith(shared): + shared = shared[:-1] + if shared == '': + break + else: + slen = len(shared) + for item in mlist: + item.path = item.path[slen:] def read_meta(self, entry): raw = self._read_content(entry.offset, entry.size) @@ -610,16 +606,12 @@ class LitFile(object): self.meta = xml def read_drm(self): - def exists_file(name): - try: self.get_file(name) - except KeyError: return False - return True self.drmlevel = 0 - if exists_file('/DRMStorage/Licenses/EUL'): + if '/DRMStorage/Licenses/EUL' in self.entries: self.drmlevel = 5 - elif exists_file('/DRMStorage/DRMBookplate'): + elif '/DRMStorage/DRMBookplate' in self.entries: self.drmlevel = 3 - elif exists_file('/DRMStorage/DRMSealed'): + elif '/DRMStorage/DRMSealed' in self.entries: self.drmlevel = 1 else: return @@ -686,7 +678,10 @@ class LitFile(object): content = self._decrypt(content) control = control[csize:] elif guid == LZXCOMPRESS_GUID: - content = self._decompress_section(name, control, content) + reset_table = self.get_file( + '/'.join(['::DataSpace/Storage', name, 'Transform', + LZXCOMPRESS_GUID, 'InstanceData/ResetTable'])) + content = self._decompress(content, control, reset_table) control = control[csize:] else: raise LitError("Unrecognized transform: %s." 
% repr(guid)) @@ -698,9 +693,14 @@ class LitFile(object): raise LitError('Cannot extract content from a DRM protected ebook') return msdes.new(self.bookkey).decrypt(content) - def _decompress_section(self, name, control, content): + def _decompress(self, content, control, reset_table): if len(control) < 32 or u32(control[CONTROL_TAG:]) != LZXC_TAG: raise LitError("Invalid ControlData tag value") + if len(reset_table) < (RESET_INTERVAL + 8): + raise LitError("Reset table is too short") + if u32(reset_table[RESET_UCLENGTH + 4:]) != 0: + raise LitError("Reset table has 64bit value for UCLENGTH") + result = [] window_size = 14 @@ -712,13 +712,6 @@ class LitFile(object): raise LitError("Invalid window in ControlData") lzx.init(window_size) - reset_table = self.get_file('/'.join( - ['::DataSpace/Storage', name, 'Transform', - LZXCOMPRESS_GUID, 'InstanceData/ResetTable'])) - if len(reset_table) < (RESET_INTERVAL + 8): - raise LitError("Reset table is too short") - if u32(reset_table[RESET_UCLENGTH + 4:]) != 0: - raise LitError("Reset table has 64bit value for UCLENGTH") ofs_entry = int32(reset_table[RESET_HDRLEN:]) + 8 uclength = int32(reset_table[RESET_UCLENGTH:]) accum = int32(reset_table[RESET_INTERVAL:]) @@ -749,11 +742,38 @@ class LitFile(object): bytes_remaining = 0 if bytes_remaining > 0: raise LitError("Failed to completely decompress section") - return ''.join(result) - + return ''.join(result) + + def extract_content(self, output_dir=os.getcwdu()): + output_dir = os.path.abspath(output_dir) + try: + opf_path = os.path.splitext( + os.path.basename(self._stream.name))[0] + '.opf' + except AttributeError: + opf_path = 'content.opf' + opf_path = os.path.join(output_dir, opf_path) + self._ensure_dir(opf_path) + with open(opf_path, 'w') as f: + f.write(self.get_markup_file('/meta').encode('utf-8')) + for entry in self.manifest.values(): + path = os.path.join(output_dir, entry.path) + self._ensure_dir(path) + with open(path, 'w') as f: + if 'spine' in entry.state: + name = '/'.join(['/data', entry.internal, 'content']) + f.write(self.get_markup_file(name).encode('utf-8')) + else: + name = '/'.join(['/data', entry.internal]) + f.write(self.get_file(name)) + + def _ensure_dir(self, path): + dir = os.path.dirname(path) + if not os.path.isdir(dir): + os.makedirs(dir) + def get_metadata(stream): try: - litfile = LitFile(stream) + litfile = LitReader(stream) src = litfile.meta.encode('utf-8') mi = OPFReader(cStringIO.StringIO(src), dir=os.getcwd()) cover_url, cover_item = mi.cover, None @@ -775,16 +795,24 @@ def get_metadata(stream): mi = MetaInformation(title, ['Unknown']) return mi +def option_parser(): + from calibre import OptionParser + parser = OptionParser(usage=_('%prog [options] EBOOK')) + parser.add_option('-o', '--output-dir', default='.', + help=_('Output directory. Defaults to current directory.')) + parser.add_option('--verbose', default=False, action='store_true', + help='Useful for debugging.') + return parser + def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) if len(args) != 2: - print >>sys.stderr, _('Usage: %s file.lit')%(args[0],) + parser.print_help() return 1 - mi = get_metadata(open(args[1], 'rb')) - print unicode(mi) - if mi.cover_data[1]: - cover = os.path.abspath(os.path.splitext(os.path.basename(args[1]))[0] + '.' 
+ mi.cover_data[0]) - open(cover, 'wb').write(mi.cover_data[1]) - print _('Cover saved to'), cover + lr = LitReader(args[1]) + lr.extract_content(opts.output_dir) + print _('OEB ebook created in'), opts.output_dir return 0 if __name__ == '__main__': From 731631a7d7dcc4662100caea66f841b32fae6fa6 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 19 Jul 2008 16:45:41 -0400 Subject: [PATCH 11/19] Added a few speed-ups to the DES code --- src/calibre/ebooks/lit/msdes.py | 38 ++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/lit/msdes.py b/src/calibre/ebooks/lit/msdes.py index 5bc67b09bb..de980f8c3d 100644 --- a/src/calibre/ebooks/lit/msdes.py +++ b/src/calibre/ebooks/lit/msdes.py @@ -1,6 +1,7 @@ # Re-modified for use in MS LIT decryption. Un-reversed the bytebit[] array. -# Substituted Microsoft's absurd modified S-boxes. Modified the encrypt/decrypt -# methods to handle more than one block at a time. +# Substituted Microsoft's absurd modified S-boxes. Modified the +# encrypt/decrypt methods to handle more than one block at a time. Added a few +# speed-ups supported by modern versions of Python. Added option 'psyco' use. # # And lo, all the previous notices follow: @@ -125,30 +126,30 @@ class DesCipher: pcr = [0]*56 #new int[56]; kn = [0]*32 #new int[32]; - for j in range(56): + for j in xrange(56): l = pc1[j] m = l & 07 pc1m[j] = ((keyBlock[l >> 3] & bytebit[m]) != 0) - for i in range(16): + for i in xrange(16): if encrypting: m = i << 1 else: m = (15-i) << 1 n = m + 1 kn[m] = kn[n] = 0 - for j in range(28): + for j in xrange(28): l = j + totrot[i] if l < 28: pcr[j] = pc1m[l] else: pcr[j] = pc1m[l - 28] - for j in range(28, 56): + for j in xrange(28, 56): l = j + totrot[i] if l < 56: pcr[j] = pc1m[l] else: pcr[j] = pc1m[l - 28] - for j in range(24): + for j in xrange(24): if pcr[pc2[j]] != 0: kn[m] |= bigbyte[j] if pcr[pc2[j+24]] != 0: @@ -163,7 +164,7 @@ class DesCipher: rawi = 0 KnLi = 0 - for i in range(16): + for i in xrange(16): raw0 = raw[rawi] rawi += 1 raw1 = raw[rawi] @@ -187,11 +188,10 @@ class DesCipher: if len(clearText) % 8 != 0: raise TypeError, "length must be multiple of block size" result = [] - while clearText: + for base in xrange(0, len(clearText), 8): result.append(struct.pack( - ">LL", *self.des(struct.unpack(">LL", clearText[:8]), + ">LL", *self.des(struct.unpack(">LL", clearText[base:base+8]), self.encryptKeys))) - clearText = clearText[8:] return ''.join(result) #/ Decrypt a block of eight bytes. @@ -199,11 +199,10 @@ class DesCipher: if len(cipherText) % 8 != 0: raise TypeError, "length must be multiple of block size" result = [] - while cipherText: + for base in xrange(0, len(cipherText), 8): result.append(struct.pack( - ">LL", *self.des(struct.unpack(">LL", cipherText[:8]), + ">LL", *self.des(struct.unpack(">LL", cipherText[base:base+8]), self.decryptKeys))) - cipherText = cipherText[8:] return ''.join(result) # The DES function. 
@@ -234,7 +233,7 @@ class DesCipher: right ^= work leftt = ((leftt << 1) | ((leftt >> 31) & 1)) & 0xffffffffL - for round in range(8): + for round in xrange(8): work = ((right << 28) | (right >> 4)) & 0xffffffffL work ^= keys[keysi] keysi += 1 @@ -322,6 +321,7 @@ pc2 = [ 45, 41, 49, 35, 28, 31, ] +# Microsoft's modified S-boxes for LIT file encryption SP1 = [ 0x02080800L, 0x00080000L, 0x02000002L, 0x02080802L, 0x02000000L, 0x00080802L, 0x00080002L, 0x02000002L, @@ -473,6 +473,14 @@ def new(key): block_size = 8 key_size = 8 +try: + import psyco + psyco.bind(DesCipher.deskey) + psyco.bind(DesCipher.cookey) + psyco.bind(DesCipher.des) +except ImportError: + pass + #test only: if __name__ == '__main__': des = DesCipher("\x01\x23\x45\x67\x89\xab\xcd\xef") From fb4f2f3a81c3e27837cfb47697f00cbafeac07ee Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 19 Jul 2008 16:47:14 -0400 Subject: [PATCH 12/19] Added comments for LIT-specific SHA-1 changes. --- src/calibre/ebooks/lit/mssha1.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/lit/mssha1.py b/src/calibre/ebooks/lit/mssha1.py index d61bd39094..1708c8dd8b 100644 --- a/src/calibre/ebooks/lit/mssha1.py +++ b/src/calibre/ebooks/lit/mssha1.py @@ -1,21 +1,11 @@ -#!/usr/bin/env python -# -*- coding: iso-8859-1 +""" +Modified version of SHA-1 used in Microsoft LIT files. -"""A sample implementation of SHA-1 in pure Python. - - Framework adapted from Dinu Gherman's MD5 implementation by - J. Hallén and L. Creighton. SHA-1 implementation based directly on - the text of the NIST standard FIPS PUB 180-1. +Adapted from the PyPy pure-Python SHA-1 implementation. """ - -__date__ = '2004-11-17' -__version__ = 0.91 # Modernised by J. Hallén and L. Creighton for Pypy - - import struct, copy - # ====================================================================== # Bit-Manipulation helpers # @@ -100,10 +90,13 @@ def f40_59(B, C, D): def f60_79(B, C, D): return B ^ C ^ D +# Microsoft's lovely addition... def f6_42(B, C, D): return (B + C) ^ C f = [f0_19]*20 + [f20_39]*20 + [f40_59]*20 + [f60_79]*20 + +# ...and delightful changes f[3] = f20_39 f[6] = f6_42 f[10] = f20_39 @@ -148,6 +141,7 @@ class mssha1(object): self.input = [] # Initial 160 bit message digest (5 times 32 bit). + # Also changed by Microsoft from standard. self.H0 = 0x32107654L self.H1 = 0x23016745L self.H2 = 0xC4E680A2L From 6b18c8b745cf6be6dbc463d7032942a375a2d61a Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 19 Jul 2008 16:47:46 -0400 Subject: [PATCH 13/19] Added "lit2oeb" to set of command-line tools --- src/calibre/linux.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 5d3ead778e..4d7ff9c8aa 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -46,6 +46,7 @@ entry_points = { 'librarything = calibre.ebooks.metadata.library_thing:main', 'mobi2oeb = calibre.ebooks.mobi.reader:main', 'lrf2html = calibre.ebooks.lrf.html.convert_to:main', + 'lit2oeb = calibre.ebooks.lit.reader:main', 'calibre-debug = calibre.debug:main', 'calibredb = calibre.library.cli:main', 'calibre-fontconfig = calibre.utils.fontconfig:main', From 006182e5f46ea8f5da43607ab530dff752a12d94 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 19 Jul 2008 16:50:14 -0400 Subject: [PATCH 14/19] Fixed bug in directory processing and re-named methods to reflect public/private status. 
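For orientation while reading the rewritten directory parser below: each AOLL directory entry in the IFCM piece is a variable-length integer giving the name length, the raw name bytes, and then variable-length section, offset and size values, all using the same 7-bits-per-byte encoding as reader.py's encint() helper. A rough, standalone Python sketch of that layout (illustrative only; these helper names are hypothetical and not code from this patch):

    def read_encint(data, pos=0):
        # 7 value bits per byte; the high bit is set on every byte of the
        # integer except the last one.
        value = 0
        while True:
            byte = ord(data[pos])
            pos += 1
            value = (value << 7) | (byte & 0x7f)
            if not (byte & 0x80):
                return value, pos

    def read_directory_entry(chunk, pos=0):
        # One entry: encint name length, raw name, then encint section,
        # offset and size (the same fields stored in a DirectoryEntry).
        namelen, pos = read_encint(chunk, pos)
        name, pos = chunk[pos:pos + namelen], pos + namelen
        section, pos = read_encint(chunk, pos)
        offset, pos = read_encint(chunk, pos)
        size, pos = read_encint(chunk, pos)
        return (name, section, offset, size), pos
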
--- src/calibre/ebooks/lit/reader.py | 113 ++++++++++++++----------------- 1 file changed, 51 insertions(+), 62 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index afe5d96297..0fed4aacbc 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -19,13 +19,13 @@ import calibre.ebooks.lit.mssha1 as mssha1 import calibre.ebooks.lit.msdes as msdes import calibre.utils.lzx as lzx -OPF_DECL = """" +OPF_DECL = """ """ HTML_DECL = """ - """ @@ -421,8 +421,13 @@ class LitReader(object): raise LitError('Not a valid LIT file') if self.version != 1: raise LitError('Unknown LIT version %d'%(self.version,)) - self.read_secondary_header() - self.read_header_pieces() + self.entries = {} + self._read_secondary_header() + self._read_header_pieces() + self._read_section_names() + self._read_manifest() + self._read_meta() + self._read_drm() @preserve def __len__(self): @@ -437,10 +442,9 @@ class LitReader(object): def _read_content(self, offset, size): return self._read_raw(self.content_offset + offset, size) - @preserve - def read_secondary_header(self): - self._stream.seek(self.hdr_len + self.num_pieces*self.PIECE_SIZE) - bytes = self._stream.read(self.sec_hdr_len) + def _read_secondary_header(self): + offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) + bytes = self._read_raw(offset, self.sec_hdr_len) offset = int32(bytes[4:]) while offset < len(bytes): blocktype = bytes[offset:offset+4] @@ -468,23 +472,21 @@ class LitReader(object): if not hasattr(self, 'content_offset'): raise LitError('Could not figure out the content offset') - @preserve - def read_header_pieces(self): + def _read_header_pieces(self): src = self.header[self.hdr_len:] for i in range(self.num_pieces): piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE] if u32(piece[4:]) != 0 or u32(piece[12:]) != 0: raise LitError('Piece %s has 64bit value' % repr(piece)) offset, size = u32(piece), int32(piece[8:]) - self._stream.seek(offset) - piece = self._stream.read(size) + piece = self._read_raw(offset, size) if i == 0: continue # Dont need this piece elif i == 1: if u32(piece[8:]) != self.entry_chunklen or \ u32(piece[12:]) != self.entry_unknown: raise LitError('Secondary header does not match piece') - self.read_directory(piece) + self._read_directory(piece) elif i == 2: if u32(piece[8:]) != self.count_chunklen or \ u32(piece[12:]) != self.count_unknown: @@ -495,58 +497,44 @@ class LitReader(object): elif i == 4: self.piece4_guid = piece - def read_directory(self, piece): - self.entries = {} + def _read_directory(self, piece): if not piece.startswith('IFCM'): raise LitError('Header piece #1 is not main directory.') chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28]) - if (32 + chunk_size * num_chunks) != len(piece): + if (32 + (num_chunks * chunk_size)) != len(piece): raise LitError('IFCM HEADER has incorrect length') - for chunk in range(num_chunks): - p = 32 + chunk * chunk_size - if piece[p:p+4] != 'AOLL': - continue - remaining = chunk_size - int32(piece[p+4:p+8]) - 48 - if remaining < 0: + for i in xrange(num_chunks): + offset = 32 + (i * chunk_size) + chunk = piece[offset:offset + chunk_size] + tag, chunk = chunk[:4], chunk[4:] + if tag != 'AOLL': continue + remaining, chunk = int32(chunk[:4]), chunk[4:] + if remaining >= chunk_size: raise LitError('AOLL remaining count is negative') - entries = u16(piece[p+chunk_size-2:]) - if entries <= 0: - # Hopefully everything will work even without a correct entries - # count + remaining = 
chunk_size - (remaining + 48) + entries = u16(chunk[-2:]) + if entries == 0: + # Hopefully will work even without a correct entries count entries = (2 ** 16) - 1 - piece = piece[p+48:] - i = 0 - while i < entries: + chunk = chunk[40:] + for j in xrange(entries): if remaining <= 0: break - namelen, piece, remaining = encint(piece, remaining) + namelen, chunk, remaining = encint(chunk, remaining) if namelen != (namelen & 0x7fffffff): raise LitError('Directory entry had 64bit name length.') if namelen > remaining - 3: raise LitError('Read past end of directory chunk') - name = piece[:namelen] - piece = piece[namelen:] - section, piece, remaining = encint(piece, remaining) - offset, piece, remaining = encint(piece, remaining) - size, piece, remaining = encint(piece, remaining) - + name, chunk = chunk[:namelen], chunk[namelen:] + section, chunk, remaining = encint(chunk, remaining) + offset, chunk, remaining = encint(chunk, remaining) + size, chunk, remaining = encint(chunk, remaining) entry = DirectoryEntry(name, section, offset, size) - - if name == '::DataSpace/NameList': - self.read_section_names(entry) - elif name == '/manifest': - self.read_manifest(entry) - elif name == '/meta': - self.read_meta(entry) self.entries[name] = entry - i += 1 - if not hasattr(self, 'section_names'): - raise LitError('Lit file does not have a valid NameList') - if not hasattr(self, 'manifest'): - raise LitError('Lit file does not have a valid manifest') - self.read_drm() - def read_section_names(self, entry): - raw = self._read_content(entry.offset, entry.size) + def _read_section_names(self): + if '::DataSpace/NameList' not in self.entries: + raise LitError('Lit file does not have a valid NameList') + raw = self.get_file('::DataSpace/NameList') if len(raw) < 4: raise LitError('Invalid Namelist section') pos = 4 @@ -563,9 +551,11 @@ class LitReader(object): raw[pos:pos+size].decode('utf-16-le').rstrip('\000') pos += size - def read_manifest(self, entry): + def _read_manifest(self): + if '/manifest' not in self.entries: + raise LitError('Lit file does not have a valid manifest') + raw = self.get_file('/manifest') self.manifest = {} - raw = self._read_content(entry.offset, entry.size) while raw: slen, raw = ord(raw[0]), raw[1:] if slen == 0: break @@ -600,12 +590,12 @@ class LitReader(object): for item in mlist: item.path = item.path[slen:] - def read_meta(self, entry): - raw = self._read_content(entry.offset, entry.size) + def _read_meta(self): + raw = self.get_file('/meta') xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP)) self.meta = xml - def read_drm(self): + def _read_drm(self): self.drmlevel = 0 if '/DRMStorage/Licenses/EUL' in self.entries: self.drmlevel = 5 @@ -615,13 +605,13 @@ class LitReader(object): self.drmlevel = 1 else: return - des = msdes.new(self.calculate_deskey()) + des = msdes.new(self._calculate_deskey()) bookkey = des.decrypt(self.get_file('/DRMStorage/DRMSealed')) if bookkey[0] != '\000': raise LitError('Unable to decrypt title key!') self.bookkey = bookkey[1:9] - def calculate_deskey(self): + def _calculate_deskey(self): hashfiles = ['/meta', '/DRMStorage/DRMSource'] if self.drmlevel == 3: hashfiles.append('/DRMStorage/DRMBookplate') @@ -726,19 +716,18 @@ class LitReader(object): u = int32(reset_table[ofs_entry + 4:]) if u != 0: raise LitError("Reset table entry greater than 32 bits") - if size >= (len(content) + base): + if size >= len(content): raise("Reset table entry out of bounds") if bytes_remaining >= window_bytes: lzx.reset() - result.append(lzx.decompress(content, 
window_bytes)) + result.append(lzx.decompress(content[base:size], window_bytes)) bytes_remaining -= window_bytes - content = content[size - base:] base = size accum += int32(reset_table[RESET_INTERVAL:]) ofs_entry += 8 if bytes_remaining < window_bytes and bytes_remaining > 0: lzx.reset() - result.append(lzx.decompress(content, bytes_remaining)) + result.append(lzx.decompress(content[base:], bytes_remaining)) bytes_remaining = 0 if bytes_remaining > 0: raise LitError("Failed to completely decompress section") From a349d763791c48d47cea1f32778b244aef794b5c Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 19 Jul 2008 18:24:59 -0400 Subject: [PATCH 15/19] Various encoding fix-ups. Fix for broken file(s?) from Penguin. --- src/calibre/ebooks/lit/reader.py | 40 ++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 0fed4aacbc..66d6fe9385 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -8,6 +8,7 @@ __copyright__ = '2008, Kovid Goyal ' import sys, struct, cStringIO, os import functools +import codecs from itertools import repeat from calibre import relpath @@ -33,7 +34,6 @@ HTML_DECL = """ DESENCRYPT_GUID = "{67F6E4A2-60BF-11D3-8540-00C04F58C3CF}" LZXCOMPRESS_GUID = "{0A9007C6-4076-11D3-8789-0000F8105754}" -LZXC_TAG = 0x43585a4c CONTROL_TAG = 4 CONTROL_WINDOW_SIZE = 12 RESET_NENTRIES = 4 @@ -41,11 +41,11 @@ RESET_HDRLEN = 12 RESET_UCLENGTH = 16 RESET_INTERVAL = 32 -FLAG_OPENING = 1 -FLAG_CLOSING = 2 -FLAG_BLOCK = 4 -FLAG_HEAD = 8 -FLAG_ATOM = 16 +FLAG_OPENING = (1 << 0) +FLAG_CLOSING = (1 << 1) +FLAG_BLOCK = (1 << 2) +FLAG_HEAD = (1 << 3) +FLAG_ATOM = (1 << 4) XML_ENTITIES = ['&', ''', '<', '>', '"'] def u32(bytes): @@ -202,7 +202,7 @@ class UnBinary(object): is_goingdown = False if not tag_name: raise LitError('Tag ends before it begins.') - self.buf.write('') + self.buf.write(u''.join(('')).encode('utf-8')) dynamic_tag = 0 tag_name = None state = 'text' @@ -252,7 +252,7 @@ class UnBinary(object): state = 'get attr' elif count > 0: if not in_censorship: - self.buf.write(c) + self.buf.write(unicode(c).encode('utf-8')) count -= 1 if count == 0: if not in_censorship: @@ -272,7 +272,7 @@ class UnBinary(object): tag_name += c count -= 1 if count == 0: - self.buf.write(tag_name) + self.buf.write(unicode(tag_name).encode('utf-8')) state = 'get attr' elif state == 'get attr length': @@ -283,7 +283,7 @@ class UnBinary(object): state = 'get custom attr' elif state == 'get custom attr': - self.buf.write(c) + self.buf.write(unicode(c).encode('utf-8')) count -= 1 if count == 0: self.buf.write('=') @@ -592,7 +592,13 @@ class LitReader(object): def _read_meta(self): raw = self.get_file('/meta') - xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP)) + try: + xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP)) + except LitError: + if 'PENGUIN group' not in raw: raise + print "WARNING: attempting PENGUIN malformed OPF fix" + raw = raw.replace('PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) + xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP)) self.meta = xml def _read_drm(self): @@ -669,8 +675,8 @@ class LitReader(object): control = control[csize:] elif guid == LZXCOMPRESS_GUID: reset_table = self.get_file( - '/'.join(['::DataSpace/Storage', name, 'Transform', - LZXCOMPRESS_GUID, 'InstanceData/ResetTable'])) + '/'.join(('::DataSpace/Storage', name, 'Transform', + LZXCOMPRESS_GUID, 
'InstanceData/ResetTable'))) content = self._decompress(content, control, reset_table) control = control[csize:] else: @@ -684,7 +690,7 @@ class LitReader(object): return msdes.new(self.bookkey).decrypt(content) def _decompress(self, content, control, reset_table): - if len(control) < 32 or u32(control[CONTROL_TAG:]) != LZXC_TAG: + if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC": raise LitError("Invalid ControlData tag value") if len(reset_table) < (RESET_INTERVAL + 8): raise LitError("Reset table is too short") @@ -743,16 +749,16 @@ class LitReader(object): opf_path = os.path.join(output_dir, opf_path) self._ensure_dir(opf_path) with open(opf_path, 'w') as f: - f.write(self.get_markup_file('/meta').encode('utf-8')) + f.write(self.meta.encode('utf-8')) for entry in self.manifest.values(): path = os.path.join(output_dir, entry.path) self._ensure_dir(path) with open(path, 'w') as f: if 'spine' in entry.state: - name = '/'.join(['/data', entry.internal, 'content']) + name = '/'.join(('/data', entry.internal, 'content')) f.write(self.get_markup_file(name).encode('utf-8')) else: - name = '/'.join(['/data', entry.internal]) + name = '/'.join(('/data', entry.internal)) f.write(self.get_file(name)) def _ensure_dir(self, path): From 015ca663506ee5fa930adbbcaa294e847a4ae2d8 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 20 Jul 2008 00:20:18 -0400 Subject: [PATCH 16/19] Added various copyright headers and doc strings --- src/calibre/ebooks/lit/maps/__init__.py | 7 +++++++ src/calibre/ebooks/lit/maps/html.py | 7 +++++++ src/calibre/ebooks/lit/maps/opf.py | 7 +++++++ src/calibre/ebooks/lit/reader.py | 3 ++- src/calibre/utils/lzx/lzxmodule.c | 7 +++++++ 5 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/lit/maps/__init__.py b/src/calibre/ebooks/lit/maps/__init__.py index 2abab3efe9..2235c384ff 100644 --- a/src/calibre/ebooks/lit/maps/__init__.py +++ b/src/calibre/ebooks/lit/maps/__init__.py @@ -1,2 +1,9 @@ +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +""" +Microsoft LIT tag and attribute tables. +""" + from calibre.ebooks.lit.maps.opf import MAP as OPF_MAP from calibre.ebooks.lit.maps.html import MAP as HTML_MAP diff --git a/src/calibre/ebooks/lit/maps/html.py b/src/calibre/ebooks/lit/maps/html.py index de0286c764..c0b9987f32 100644 --- a/src/calibre/ebooks/lit/maps/html.py +++ b/src/calibre/ebooks/lit/maps/html.py @@ -1,3 +1,10 @@ +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +""" +Microsoft LIT HTML tag and attribute tables, copied from ConvertLIT. +""" + TAGS = [ None, None, diff --git a/src/calibre/ebooks/lit/maps/opf.py b/src/calibre/ebooks/lit/maps/opf.py index cc1acc4dfa..f3bb7dcb89 100644 --- a/src/calibre/ebooks/lit/maps/opf.py +++ b/src/calibre/ebooks/lit/maps/opf.py @@ -1,3 +1,10 @@ +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +""" +Microsoft LIT OPF tag and attribute tables, copied from ConvertLIT. +""" + TAGS = [ None, "package", diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 66d6fe9385..c53f266e79 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -4,7 +4,8 @@ Support for reading LIT files. from __future__ import with_statement __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' +__copyright__ = '2008, Kovid Goyal ' \ + 'and Marshall T. 
Vandegrift ' import sys, struct, cStringIO, os import functools diff --git a/src/calibre/utils/lzx/lzxmodule.c b/src/calibre/utils/lzx/lzxmodule.c index bf8a48a056..a1917b5749 100644 --- a/src/calibre/utils/lzx/lzxmodule.c +++ b/src/calibre/utils/lzx/lzxmodule.c @@ -1,3 +1,10 @@ +/* __license__ = 'GPL v3' + * __copyright__ = '2008, Marshall T. Vandegrift ' + * + * Python module C glue code. + */ + + #include #include From 87ae95cc7a1caeb2f20236db2df4b124fb99cc18 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 20 Jul 2008 00:40:41 -0400 Subject: [PATCH 17/19] Removed duplicate LIT-parsing code. --- src/calibre/ebooks/lit/reader.py | 26 +- src/calibre/ebooks/metadata/lit.py | 737 +---------------------------- 2 files changed, 16 insertions(+), 747 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index c53f266e79..65fce4f3e9 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -767,33 +767,9 @@ class LitReader(object): if not os.path.isdir(dir): os.makedirs(dir) -def get_metadata(stream): - try: - litfile = LitReader(stream) - src = litfile.meta.encode('utf-8') - mi = OPFReader(cStringIO.StringIO(src), dir=os.getcwd()) - cover_url, cover_item = mi.cover, None - if cover_url: - cover_url = relpath(cover_url, os.getcwd()) - for item in litfile.manifest.values(): - if item.path == cover_url: - cover_item = item.internal - if cover_item is not None: - ext = cover_url.rpartition('.')[-1] - if not ext: - ext = 'jpg' - else: - ext = ext.lower() - cd = litfile.get_file(cover_item) - mi.cover_data = (ext, cd) if cd else (None, None) - except: - title = stream.name if hasattr(stream, 'name') and stream.name else 'Unknown' - mi = MetaInformation(title, ['Unknown']) - return mi - def option_parser(): from calibre import OptionParser - parser = OptionParser(usage=_('%prog [options] EBOOK')) + parser = OptionParser(usage=_('%prog [options] LITFILE')) parser.add_option('-o', '--output-dir', default='.', help=_('Output directory. Defaults to current directory.')) parser.add_option('--verbose', default=False, action='store_true', diff --git a/src/calibre/ebooks/metadata/lit.py b/src/calibre/ebooks/metadata/lit.py index 2b8c3a4b9f..825fe45cf4 100644 --- a/src/calibre/ebooks/metadata/lit.py +++ b/src/calibre/ebooks/metadata/lit.py @@ -1,734 +1,25 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' -Support for reading the metadata from a lit file. +Support for reading the metadata from a LIT file. 
''' -import sys, struct, cStringIO, os -from itertools import repeat +import sys, cStringIO, os from calibre import relpath from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf import OPFReader - -OPF_ATTR_MAP = [ - None, - "href", - "%never-used", - "%guid", - "%minimum_level", - "%attr5", - "id", - "href", - "media-type", - "fallback", - "idref", - "xmlns:dc", - "xmlns:oebpackage", - "role", - "file-as", - "event", - "scheme", - "title", - "type", - "unique-identifier", - "name", - "content", - "xml:lang", - ] - -OPF_TAG_MAP = [ - None, - "package", - "dc:Title", - "dc:Creator", - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - "manifest", - "item", - "spine", - "itemref", - "metadata", - "dc-metadata", - "dc:Subject", - "dc:Description", - "dc:Publisher", - "dc:Contributor", - "dc:Date", - "dc:Type", - "dc:Format", - "dc:Identifier", - "dc:Source", - "dc:Language", - "dc:Relation", - "dc:Coverage", - "dc:Rights", - "x-metadata", - "meta", - "tours", - "tour", - "site", - "guide", - "reference", - None, - ] - -class DirectoryEntry(object): - def __init__(self, name, section, offset, size): - self.name = name - self.section = section - self.offset = offset - self.size = size - - def __repr__(self): - return '%s\n\tSection: %d\n\tOffset: %d\n\tSize: %d\n'%(self.name, - self.section, self.offset, self.size) - - def __str__(self): - return repr(self) - -class LitReadError(Exception): - pass - -def u32(bytes): - b = struct.unpack('BBBB', bytes[:4]) - return b[0] + (b[1] << 8) + (b[2] << 16) + (b[3] << 32) - -def u16(bytes): - b = struct.unpack('BB', bytes[:2]) - return b[0] + (b[1] << 8) - -def int32(bytes): - return u32(bytes)&0x7FFFFFFF - -def encint(bytes, remaining): - pos, val = 0, 0 - while remaining > 0: - b = ord(bytes[pos]) - pos += 1 - remaining -= 1 - val <<= 7 - val |= (b & 0x7f) - if b & 0x80 == 0: break - return val, bytes[pos:], remaining - -def read_utf8_char(bytes, pos): - c = ord(bytes[pos]) - mask = 0x80 - if (c & mask): - elsize = 0 - while c & mask: - mask >>= 1 - elsize += 1 - if (mask <= 1) or (mask == 0x40): - raise LitReadError('Invalid UTF8 character: %s'%(repr(bytes[pos]))) - else: - elsize = 1 - - - if elsize > 1: - if elsize + pos > len(bytes): - raise LitReadError('Invalid UTF8 character: %s'%(repr(bytes[pos]))) - c &= (mask - 1) - for i in range(1, elsize): - b = ord(bytes[pos+i]) - if (b & 0xC0) != 0x80: - raise LitReadError('Invalid UTF8 character: %s'%(repr(bytes[pos:pos+i]))) - c = (c << 6) | (b & 0x3F) - return unichr(c), pos+elsize - -FLAG_OPENING = 1 -FLAG_CLOSING = 2 -FLAG_BLOCK = 4 -FLAG_HEAD = 8 -FLAG_ATOM = 16 -XML_ENTITIES = ['&', ''', '<', '>', '"'] - -class UnBinary(object): - def __init__(self, bin, manifest, attr_map=OPF_ATTR_MAP, tag_map=OPF_TAG_MAP, - tag_to_attr_map=[[] for i in range(43)]): - self.manifest = manifest - self.pending_indent = 0 - self.lingering_space = 0 - self.was_in_text = 0 - self.attr_map = attr_map - self.tag_map = tag_map - self.tag_to_attr_map = tag_to_attr_map - self.opf = self.attr_map is OPF_ATTR_MAP - self.bin = bin - self.buf = cStringIO.StringIO() - self.ampersands = [] - self.binary_to_text() - self.raw = self.buf.getvalue().lstrip().decode('utf-8') - self.escape_ampersands() - - def escape_ampersands(self): - offset = 0 - for pos in self.ampersands: - test = self.raw[pos+offset:pos+offset+6] - if test.startswith('&#') and ';' in test: - continue - escape = True - for ent in XML_ENTITIES: - if test.startswith(ent): - escape = False - 
break - if not escape: - continue - self.raw = self.raw[:pos+offset] + '&' + self.raw[pos+offset+1:] - offset += 4 - - - def write_spaces(self, depth): - self.buf.write(u''.join(repeat(' ', depth))) - - def item_path(self, internal_id): - for i in self.manifest: - if i == internal_id: - return i.path - raise LitReadError('Could not find item %s'%(internal_id,)) - - def __unicode__(self): - return self.raw - - def binary_to_text(self, base=0, depth=0): - space_enabled, saved_space_enabled = 1, 0 - was_indented, is_goingdown = 0, 0 - tag_name = current_map = None - dynamic_tag = errors = in_censorship = 0 - - state = 'text' - index = base - flags = 0 - - while index < len(self.bin): - c, index = read_utf8_char(self.bin, index) - if state == 'text': - if ord(c) == 0: - state = 'get flags' - continue - if (not self.was_in_text) or space_enabled: - space_enabled = 0; - if c in (' ', '\t', '\n', '\r'): - space_enabled += 1 - else: - self.was_in_text = 1 - if c == '\v': - c = '\n' - pending_indent = 0 - if c == '&': - self.ampersands.append(self.buf.tell()-1) - self.buf.write(c.encode('utf-8') if isinstance(c, unicode) else c) - elif state == 'get flags': - if ord(c) == 0: - state = 'text' - continue - flags = ord(c) - state = 'get tag' - elif state == 'get tag': - state = 'text' if ord(c) == 0 else 'get attr' - if flags & FLAG_OPENING: - if space_enabled and ((not self.was_in_text) or (flags &(FLAG_BLOCK|FLAG_HEAD))): - self.pending_indent += 1 - if self.pending_indent or self.opf: - was_indented += 1 - self.buf.write(u'\n') - self.write_spaces(depth) - pending_indent = 0 - if (flags & FLAG_HEAD) or (flags & FLAG_BLOCK) or \ - self.opf or depth == 0: - pending_indent = 1 - tag = ord(c) - self.buf.write('<') - if not (flags & FLAG_CLOSING): - is_goingdown = 1 - if tag == 0x8000: - state = 'get custom length' - continue - if flags & FLAG_ATOM: - raise LitReadError('TODO: Atoms not yet implemented') - elif tag < len(self.tag_map): - tag_name = self.tag_map[tag] - current_map = self.tag_to_attr_map[tag] - else: - dynamic_tag += 1 - errors += 1 - tag_name = '?'+unichr(tag)+'?' 
- current_map = self.tag_to_attr_map[tag] - print 'WARNING: tag %s unknown'%(unichr(tag),) - - self.buf.write(unicode(tag_name).encode('utf-8')) - elif flags & FLAG_CLOSING: - #if depth == 0: - # raise LitReadError('Extra closing tag') - self.lingering_space = space_enabled - return index - elif state == 'get attr': - in_censorship = 0 - if ord(c) == 0: - if not is_goingdown: - tag_name = None - dynamic_tag = 0 - self.buf.write(' />') - else: - self.buf.write('>') - if not self.opf and (flags & (FLAG_BLOCK|FLAG_HEAD)): - pending_indent += 1 - index = self.binary_to_text(base=index, depth=depth+1) - is_goingdown = 0 - if not tag_name: - raise LitReadError('Tag ends before it begins.') - saved_space_enabled = space_enabled - space_enabled = self.lingering_space - if space_enabled and was_indented and not self.was_in_text: - self.buf.write('\n') - self.write_spaces(depth) - self.buf.write('') - if (space_enabled and self.opf) or (flags & (FLAG_BLOCK|FLAG_HEAD)): - self.pending_indent += 1 - dynamic_tag = 0 - tag_name = None - space_enabled = saved_space_enabled - - self.was_in_text = 0 - state = 'text' - else: - if ord(c) == 0x8000: - state = 'get attr length' - continue - attr = None - if ord(c) < len(current_map) and current_map[ord(c)]: - attr = current_map[ord(c)] - elif ord(c) < len(self.attr_map): - attr = self.attr_map[ord(c)] - - if not attr or not isinstance(attr, basestring): - raise LitReadError('Unknown attribute %d in tag %s'%(ord(c), tag_name)) - - if attr.startswith('%'): - in_censorship = 1 - state = 'get value length' - continue - - self.buf.write(' ' + unicode(attr).encode('utf-8') + '=') - if attr in ['href', 'src']: - state = 'get href' - else: - state = 'get value length' - elif state == 'get value length': - if not in_censorship: - self.buf.write('"') - char_count = ord(c) - 1 - if not char_count: - if not in_censorship: - self.buf.write('"') - in_censorship = 0 - state = 'get attr' - state = 'get value' - if ord(c) == 0xffff: - continue - if char_count < 0 or char_count > len(self.bin)-index: - raise LitReadError('Invalid character count %d'%(char_count,)) - elif state == 'get value': - if char_count == 0xfffe: - if not in_censorship: - self.buf.write(str(ord(c)-1)) - in_censorship = 0 - state = 'get attr' - elif char_count: - if not in_censorship: - self.buf.write(c) - char_count -= 1 - if not char_count: - if not in_censorship: - self.buf.write('"') - in_censorship = 0 - state = 'get attr' - elif state == 'get custom length': - char_count = ord(c) - 1 - if char_count <= 0 or char_count > len(self.bin)-index: - raise LitReadError('Invalid character count %d'%(char_count,)) - dynamic_tag += 1 - state = 'get custom' - tag_name = '' - elif state == 'get custom': - tag += c - char_count -= 1 - if not char_count: - self.buf.write(tag_name) - state = 'get attr' - elif state == 'get attr length': - char_count = ord(c) - 1 - if char_count <= 0 or char_count > len(self.bin)-index: - raise LitReadError('Invalid character count %d'%(char_count,)) - self.buf.write(' ') - state = 'get custom attr' - elif state == 'get custom attr': - self.buf.write(c) - char_count -= 1 - if not char_count: - self.buf.write('=') - state = 'get value length' - elif state == 'get href': - char_count = ord(c) - 1 - if char_count <= 0: - raise LitReadError('Invalid character count %d'%(char_count,)) - href = self.bin[index+1:index+char_count].decode('ascii') - index += char_count - doc, m, frag = href.partition('#') - path = self.item_path(doc) - if m and frag: - path += m+frag - 
self.buf.write((u'"%s"'%(path,)).encode('utf-8')) - state = 'get attr' - - self.lingering_space = space_enabled - return index - -class ManifestItem(object): - - def __init__(self, original, internal, mime_type, offset, root, state): - self.original = original - self.internal = internal - self.mime_type = mime_type - self.offset = offset - self.root = root - self.state = state - self.prefix = 'images' if state == 'images' else 'css' if state == 'css' else '' - self.prefix = self.prefix + os.sep if self.prefix else '' - self.path = self.prefix + self.original - - def __eq__(self, other): - if hasattr(other, 'internal'): - return self.internal == other.internal - return self.internal == other - - def __repr__(self): - return self.internal + u'->' + self.path - -class LitFile(object): - - PIECE_SIZE = 16 - - @apply - def magic(): - def fget(self): - opos = self._stream.tell() - self._stream.seek(0) - val = self._stream.read(8) - self._stream.seek(opos) - return val - return property(fget=fget) - - @apply - def version(): - def fget(self): - opos = self._stream.tell() - self._stream.seek(8) - val = u32(self._stream.read(4)) - self._stream.seek(opos) - return val - return property(fget=fget) - - @apply - def hdr_len(): - def fget(self): - opos = self._stream.tell() - self._stream.seek(12) - val = int32(self._stream.read(4)) - self._stream.seek(opos) - return val - return property(fget=fget) - - @apply - def num_pieces(): - def fget(self): - opos = self._stream.tell() - self._stream.seek(16) - val = int32(self._stream.read(4)) - self._stream.seek(opos) - return val - return property(fget=fget) - - @apply - def sec_hdr_len(): - def fget(self): - opos = self._stream.tell() - self._stream.seek(20) - val = int32(self._stream.read(4)) - self._stream.seek(opos) - return val - return property(fget=fget) - - @apply - def guid(): - def fget(self): - opos = self._stream.tell() - self._stream.seek(24) - val = self._stream.read(16) - self._stream.seek(opos) - return val - return property(fget=fget) - - @apply - def header(): - def fget(self): - opos = self._stream.tell() - size = self.hdr_len + self.num_pieces*self.PIECE_SIZE + self.sec_hdr_len - self._stream.seek(0) - val = self._stream.read(size) - self._stream.seek(opos) - return val - return property(fget=fget) - - def __init__(self, stream): - self._stream = stream - if self.magic != 'ITOLITLS': - raise LitReadError('Not a valid LIT file') - if self.version != 1: - raise LitReadError('Unknown LIT version %d'%(self.version,)) - self.read_secondary_header() - self.read_header_pieces() - - - def read_secondary_header(self): - opos = self._stream.tell() - try: - self._stream.seek(self.hdr_len + self.num_pieces*self.PIECE_SIZE) - bytes = self._stream.read(self.sec_hdr_len) - offset = int32(bytes[4:]) - - while offset < len(bytes): - blocktype = bytes[offset:offset+4] - blockver = u32(bytes[offset+4:]) - - if blocktype == 'CAOL': - if blockver != 2: - raise LitReadError('Unknown CAOL block format %d'%(blockver,)) - self.creator_id = u32(bytes[offset+12:]) - self.entry_chunklen = u32(bytes[offset+20:]) - self.count_chunklen = u32(bytes[offset+24:]) - self.entry_unknown = u32(bytes[offset+28:]) - self.count_unknown = u32(bytes[offset+32:]) - offset += 48 - elif blocktype == 'ITSF': - if blockver != 4: - raise LitReadError('Unknown ITSF block format %d'%(blockver,)) - if u32(bytes[offset+4+16:]): - raise LitReadError('This file has a 64bit content offset') - self.content_offset = u32(bytes[offset+16:]) - self.timestamp = u32(bytes[offset+24:]) - 
self.language_id = u32(bytes[offset+28:]) - offset += 48 - - if not hasattr(self, 'content_offset'): - raise LitReadError('Could not figure out the content offset') - finally: - self._stream.seek(opos) - - def read_header_pieces(self): - opos = self._stream.tell() - try: - src = self.header[self.hdr_len:] - for i in range(self.num_pieces): - piece = src[i*self.PIECE_SIZE:(i+1)*self.PIECE_SIZE] - if u32(piece[4:]) != 0 or u32(piece[12:]) != 0: - raise LitReadError('Piece %s has 64bit value'%(repr(piece),)) - offset, size = u32(piece), int32(piece[8:]) - self._stream.seek(offset) - piece = self._stream.read(size) - if i == 0: - continue # Dont need this piece - elif i == 1: - if u32(piece[8:]) != self.entry_chunklen or \ - u32(piece[12:]) != self.entry_unknown: - raise LitReadError('Secondary header does not match piece') - self.read_directory(piece) - elif i == 2: - if u32(piece[8:]) != self.count_chunklen or \ - u32(piece[12:]) != self.count_unknown: - raise LitReadError('Secondary header does not match piece') - continue # No data needed from this piece - elif i == 3: - self.piece3_guid = piece - elif i == 4: - self.piece4_guid = piece - finally: - self._stream.seek(opos) - - def read_directory(self, piece): - self.entries = [] - if not piece.startswith('IFCM'): - raise LitReadError('Header piece #1 is not main directory.') - chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28]) - - if 32 + chunk_size*num_chunks != len(piece): - raise LitReadError('IFCM HEADER has incorrect length') - - for chunk in range(num_chunks): - p = 32 + chunk*chunk_size - if piece[p:p+4] != 'AOLL': - continue - remaining = chunk_size - int32(piece[p+4:p+8]) - 48 - if remaining < 0: - raise LitReadError('AOLL remaining count is negative') - - entries = u16(piece[p+chunk_size-2:]) - - if entries <= 0: # Hopefully everything will work even without a correct entries count - entries = (2**16)-1 - - piece = piece[p+48:] - i = 0 - while i < entries: - if remaining <= 0: break - namelen, piece, remaining = encint(piece, remaining) - if namelen != (namelen & 0x7fffffff): - raise LitReadError('Directory entry had 64bit name length.') - if namelen > remaining - 3: - raise LitReadError('Read past end of directory chunk') - name = piece[:namelen] - piece = piece[namelen:] - section, piece, remaining = encint(piece, remaining) - offset, piece, remaining = encint(piece, remaining) - size, piece, remaining = encint(piece, remaining) - - entry = DirectoryEntry(name, section, offset, size) - - if name == '::DataSpace/NameList': - self.read_section_names(entry) - elif name == '/manifest': - self.read_manifest(entry) - elif name == '/meta': - self.read_meta(entry) - self.entries.append(entry) - i += 1 - - if not hasattr(self, 'sections'): - raise LitReadError('Lit file does not have a valid NameList') - - if not hasattr(self, 'manifest'): - raise LitReadError('Lit file does not have a valid manifest') - - def read_section_names(self, entry): - opos = self._stream.tell() - try: - self._stream.seek(self.content_offset + entry.offset) - raw = self._stream.read(entry.size) - if len(raw) < 4: - raise LitReadError('Invalid Namelist section') - pos = 4 - self.num_sections = u16(raw[2:pos]) - - self.sections = {} - for section in range(self.num_sections): - size = u16(raw[pos:pos+2]) - pos += 2 - size = size*2 + 2 - if pos + size > len(raw): - raise LitReadError('Invalid Namelist section') - self.sections[section] = raw[pos:pos+size].decode('utf-16-le') - pos += size - finally: - self._stream.seek(opos) - - def 
read_manifest(self, entry): - opos = self._stream.tell() - try: - self.manifest = [] - self._stream.seek(self.content_offset + entry.offset) - raw = self._stream.read(entry.size) - pos = 0 - while pos < len(raw): - size = ord(raw[pos]) - if size == 0: break - pos += 1 - root = raw[pos:pos+size].decode('utf8') - pos += size - if pos >= len(raw): - raise LitReadError('Truncated manifest.') - for state in ['spine', 'not spine', 'css', 'images']: - num_files = int32(raw[pos:pos+4]) - pos += 4 - if num_files == 0: continue - - i = 0 - while i < num_files: - if pos+5 >= len(raw): - raise LitReadError('Truncated manifest.') - offset = u32(raw[pos:pos+4]) - pos += 4 - - slen = ord(raw[pos]) - pos += 1 - internal = raw[pos:pos+slen].decode('utf8') - pos += slen - - slen = ord(raw[pos]) - pos += 1 - original = raw[pos:pos+slen].decode('utf8') - pos += slen - - slen = ord(raw[pos]) - pos += 1 - mime_type = raw[pos:pos+slen].decode('utf8') - pos += slen +1 - - self.manifest.append(ManifestItem(original, internal, mime_type, offset, root, state)) - i += 1 - finally: - self._stream.seek(opos) - - def read_meta(self, entry): - opos = self._stream.tell() - try: - self._stream.seek(self.content_offset + entry.offset) - raw = self._stream.read(entry.size) - - xml = \ -'''\ - - -'''+\ - unicode(UnBinary(raw, self.manifest)) - self.meta = xml - finally: - self._stream.seek(opos) - - def read_image(self, internal_name): - cover_entry = None - for entry in self.entries: - if internal_name in entry.name: - cover_entry = entry - break - opos = self._stream.tell() - try: - self._stream.seek(self.content_offset + cover_entry.offset) - return self._stream.read(cover_entry.size) - finally: - self._stream.seek(opos) +from calibre.ebooks.lit.reader import LitReader def get_metadata(stream): try: - litfile = LitFile(stream) + litfile = LitReader(stream) src = litfile.meta.encode('utf-8') mi = OPFReader(cStringIO.StringIO(src), dir=os.getcwd()) cover_url, cover_item = mi.cover, None if cover_url: cover_url = relpath(cover_url, os.getcwd()) - for item in litfile.manifest: + for item in litfile.manifest.values(): if item.path == cover_url: cover_item = item.internal if cover_item is not None: @@ -737,26 +28,28 @@ def get_metadata(stream): ext = 'jpg' else: ext = ext.lower() - cd = litfile.read_image(cover_item) - mi.cover_data = (ext, cd) if cd else (None, None) + cd = litfile.get_file('/data/' + cover_item) + mi.cover_data = (ext, cd) if cd else (None, None) except: title = stream.name if hasattr(stream, 'name') and stream.name else 'Unknown' mi = MetaInformation(title, ['Unknown']) return mi - - def main(args=sys.argv): if len(args) != 2: - print >>sys.stderr, _('Usage: %s file.lit')%(args[0],) + print >>sys.stderr, _('Usage: %s file.lit') % args[0] return 1 - mi = get_metadata(open(args[1], 'rb')) + fname = args[1] + mi = get_metadata(open(fname, 'rb')) print unicode(mi) if mi.cover_data[1]: - cover = os.path.abspath(os.path.splitext(os.path.basename(args[1]))[0] + '.' + mi.cover_data[0]) + cover = os.path.abspath( + '.'.join((os.path.splitext(os.path.basename(fname))[0], + mi.cover_data[0]))) open(cover, 'wb').write(mi.cover_data[1]) print _('Cover saved to'), cover return 0 if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main()) + From 63b6550e21ca0e3b2d3ff5afeba479c0c3bec147 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 20 Jul 2008 01:02:14 -0400 Subject: [PATCH 18/19] Fix for LIT files with '..' 
 in filename paths
---
 src/calibre/ebooks/lit/reader.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py
index 65fce4f3e9..2a862141d1 100644
--- a/src/calibre/ebooks/lit/reader.py
+++ b/src/calibre/ebooks/lit/reader.py
@@ -331,7 +331,15 @@ class ManifestItem(object):
         self.offset = offset
         self.root = root
         self.state = state
-        self.path = self.original
+        # Some paths in Fictionwise "multiformat" LIT files contain '..' (!?)
+        nodes = original.split('/')
+        path = []
+        for node in nodes:
+            if node == '..':
+                if path: path.pop()
+                continue
+            path.append(node)
+        self.path = os.path.join(*path)
 
     def __eq__(self, other):
         if hasattr(other, 'internal'):

From 56b5b0e26c1505e16bccabeb513dc3d7f9c69241 Mon Sep 17 00:00:00 2001
From: "Marshall T. Vandegrift" 
Date: Sun, 20 Jul 2008 01:08:36 -0400
Subject: [PATCH 19/19] Fix a few lines which flow beyond 80 columns

---
 src/calibre/ebooks/lit/reader.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py
index 2a862141d1..8cef0fdd18 100644
--- a/src/calibre/ebooks/lit/reader.py
+++ b/src/calibre/ebooks/lit/reader.py
@@ -122,7 +122,8 @@ class UnBinary(object):
                     break
             if not escape:
                 continue
-            self.raw = self.raw[:pos+offset] + '&amp;' + self.raw[pos+offset+1:]
+            self.raw = '&amp;'.join(
+                (self.raw[:pos+offset], self.raw[pos+offset+1:]))
             offset += 4
 
     def item_path(self, internal_id):
@@ -203,7 +204,8 @@
                         is_goingdown = False
                         if not tag_name:
                             raise LitError('Tag ends before it begins.')
-                        self.buf.write(u''.join(('</', tag_name, '>')).encode('utf-8'))
+                        self.buf.write(u''.join(
+                            ('</', tag_name, '>')).encode('utf-8'))
                         dynamic_tag = 0
                         tag_name = None
                     state = 'text'
@@ -606,7 +608,8 @@ class LitReader(object):
         except LitError:
             if 'PENGUIN group' not in raw: raise
             print "WARNING: attempting PENGUIN malformed OPF fix"
-            raw = raw.replace('PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1)
+            raw = raw.replace(
+                'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1)
             xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP))
         self.meta = xml
 
@@ -735,7 +738,8 @@
                 raise("Reset table entry out of bounds")
             if bytes_remaining >= window_bytes:
                 lzx.reset()
-                result.append(lzx.decompress(content[base:size], window_bytes))
+                result.append(
+                    lzx.decompress(content[base:size], window_bytes))
                 bytes_remaining -= window_bytes
                 base = size
             accum += int32(reset_table[RESET_INTERVAL:])
@@ -778,10 +782,12 @@
 def option_parser():
     from calibre import OptionParser
     parser = OptionParser(usage=_('%prog [options] LITFILE'))
-    parser.add_option('-o', '--output-dir', default='.',
-        help=_('Output directory. Defaults to current directory.'))
-    parser.add_option('--verbose', default=False, action='store_true',
-        help='Useful for debugging.')
+    parser.add_option(
+        '-o', '--output-dir', default='.',
+        help=_('Output directory. Defaults to current directory.'))
+    parser.add_option(
+        '--verbose', default=False, action='store_true',
+        help='Useful for debugging.')
     return parser
 
 def main(args=sys.argv):