diff --git a/src/calibre/ebooks/lit/__init__.py b/src/calibre/ebooks/lit/__init__.py new file mode 100644 index 0000000000..412a52ab05 --- /dev/null +++ b/src/calibre/ebooks/lit/__init__.py @@ -0,0 +1,5 @@ +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +class LitError(Exception): + pass diff --git a/src/calibre/ebooks/lit/maps/__init__.py b/src/calibre/ebooks/lit/maps/__init__.py new file mode 100644 index 0000000000..eb99464d9b --- /dev/null +++ b/src/calibre/ebooks/lit/maps/__init__.py @@ -0,0 +1,5 @@ +import calibre.ebooks.maps.opf as opf +import calibre.ebooks.maps.html as html + +OPF_MAP = opf.MAP +HTML_MAP = html.MAP diff --git a/src/calibre/ebooks/lit/maps/html.py b/src/calibre/ebooks/lit/maps/html.py new file mode 100644 index 0000000000..095b0bcc3e --- /dev/null +++ b/src/calibre/ebooks/lit/maps/html.py @@ -0,0 +1,1008 @@ +ATTRS0 = { + 0x8010 => "tabindex", + 0x8046 => "title", + 0x804b => "style", + 0x804d => "disabled", + 0x83ea => "class", + 0x83eb => "id", + 0x83fe => "datafld", + 0x83ff => "datasrc", + 0x8400 => "dataformatas", + 0x87d6 => "accesskey", + 0x9392 => "lang", + 0x93ed => "language", + 0x93fe => "dir", + 0x9771 => "onmouseover", + 0x9772 => "onmouseout", + 0x9773 => "onmousedown", + 0x9774 => "onmouseup", + 0x9775 => "onmousemove", + 0x9776 => "onkeydown", + 0x9777 => "onkeyup", + 0x9778 => "onkeypress", + 0x9779 => "onclick", + 0x977a => "ondblclick", + 0x977e => "onhelp", + 0x977f => "onfocus", + 0x9780 => "onblur", + 0x9783 => "onrowexit", + 0x9784 => "onrowenter", + 0x9786 => "onbeforeupdate", + 0x9787 => "onafterupdate", + 0x978a => "onreadystatechange", + 0x9790 => "onscroll", + 0x9794 => "ondragstart", + 0x9795 => "onresize", + 0x9796 => "onselectstart", + 0x9797 => "onerrorupdate", + 0x9799 => "ondatasetchanged", + 0x979a => "ondataavailable", + 0x979b => "ondatasetcomplete", + 0x979c => "onfilterchange", + 0x979f => "onlosecapture", + 0x97a0 => "onpropertychange", + 0x97a2 => "ondrag", + 0x97a3 => 
"ondragend", + 0x97a4 => "ondragenter", + 0x97a5 => "ondragover", + 0x97a6 => "ondragleave", + 0x97a7 => "ondrop", + 0x97a8 => "oncut", + 0x97a9 => "oncopy", + 0x97aa => "onpaste", + 0x97ab => "onbeforecut", + 0x97ac => "onbeforecopy", + 0x97ad => "onbeforepaste", + 0x97af => "onrowsdelete", + 0x97b0 => "onrowsinserted", + 0x97b1 => "oncellchange", + 0x97b2 => "oncontextmenu", + 0x97b6 => "onbeforeeditfocus", + } +ATTRS3 = { + 0x0001 => "href", + 0x03ec => "target", + 0x03ee => "rel", + 0x03ef => "rev", + 0x03f0 => "urn", + 0x03f1 => "methods", + 0x8001 => "name", + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS5 = { + 0x9399 => "clear", + } +ATTRS6 = { + 0x8001 => "name", + 0x8006 => "width", + 0x8007 => "height", + 0x804a => "align", + 0x8bbb => "classid", + 0x8bbc => "data", + 0x8bbf => "codebase", + 0x8bc0 => "codetype", + 0x8bc1 => "code", + 0x8bc2 => "type", + 0x8bc5 => "vspace", + 0x8bc6 => "hspace", + 0x978e => "onerror", + } +ATTRS7 = { + 0x0001 => "href", + 0x03ea => "shape", + 0x03eb => "coords", + 0x03ed => "target", + 0x03ee => "alt", + 0x03ef => "nohref", + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS8 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS9 = { + 0x03ec => "href", + 0x03ed => "target", + } +ATTRS10 = { + 0x938b => "color", + 0x939b => "face", + 0x93a3 => "size", + } +ATTRS12 = { + 0x03ea => "src", + 0x03eb => "loop", + 0x03ec => "volume", + 0x03ed => "balance", + } +ATTRS13 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS15 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => "clear", + } +ATTRS16 = { + 0x07db => "link", + 0x07dc => "alink", + 0x07dd => "vlink", + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x938a => "background", + 0x938b => "text", + 0x938e => "nowrap", + 0x93ae 
=> "topmargin", + 0x93af => "rightmargin", + 0x93b0 => "bottommargin", + 0x93b1 => "leftmargin", + 0x93b6 => "bgproperties", + 0x93d8 => "scroll", + 0x977b => "onselect", + 0x9791 => "onload", + 0x9792 => "onunload", + 0x9798 => "onbeforeunload", + 0x97b3 => "onbeforeprint", + 0x97b4 => "onafterprint", + 0xfe0c => "bgcolor", + } +ATTRS17 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => "clear", + } +ATTRS18 = { + 0x07d1 => "type", + 0x8001 => "name", + } +ATTRS19 = { + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x93a8 => "valign", + } +ATTRS20 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => "clear", + } +ATTRS21 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS22 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS23 = { + 0x03ea => "span", + 0x8006 => "width", + 0x8049 => "align", + 0x93a8 => "valign", + 0xfe0c => "bgcolor", + } +ATTRS24 = { + 0x03ea => "span", + 0x8006 => "width", + 0x8049 => "align", + 0x93a8 => "valign", + 0xfe0c => "bgcolor", + } +ATTRS27 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x938e => "nowrap", + } +ATTRS29 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS31 = { + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x938e => "nowrap", + } +ATTRS32 = { + 0x03ea => "compact", + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS33 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x938e => "nowrap", + } +ATTRS34 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS35 = { + 0x8001 => "name", + 0x8006 => "width", + 0x8007 => "height", + 0x804a => 
"align", + 0x8bbd => "palette", + 0x8bbe => "pluginspage", + 0x8bbf => "codebase", + 0x8bbf => "src", + 0x8bc1 => "units", + 0x8bc2 => "type", + 0x8bc3 => "hidden", + } +ATTRS36 = { + 0x804a => "align", + } +ATTRS37 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x938b => "color", + 0x939b => "face", + 0x939c => "size", + } +ATTRS38 = { + 0x03ea => "action", + 0x03ec => "enctype", + 0x03ed => "method", + 0x03ef => "target", + 0x03f4 => "accept-charset", + 0x8001 => "name", + 0x977c => "onsubmit", + 0x977d => "onreset", + } +ATTRS39 = { + 0x8000 => "align", + 0x8001 => "name", + 0x8bb9 => "src", + 0x8bbb => "border", + 0x8bbc => "frameborder", + 0x8bbd => "framespacing", + 0x8bbe => "marginwidth", + 0x8bbf => "marginheight", + 0x8bc0 => "noresize", + 0x8bc1 => "scrolling", + 0x8fa2 => "bordercolor", + } +ATTRS40 = { + 0x03e9 => "rows", + 0x03ea => "cols", + 0x03eb => "border", + 0x03ec => "bordercolor", + 0x03ed => "frameborder", + 0x03ee => "framespacing", + 0x8001 => "name", + 0x9791 => "onload", + 0x9792 => "onunload", + 0x9798 => "onbeforeunload", + 0x97b3 => "onbeforeprint", + 0x97b4 => "onafterprint", + } +ATTRS42 = { + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => "clear", + } +ATTRS43 = { + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => "clear", + } +ATTRS44 = { + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => "clear", + } +ATTRS45 = { + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => "clear", + } +ATTRS46 = { + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => "clear", + } +ATTRS47 = { + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => 
"clear", + } +ATTRS49 = { + 0x03ea => "noshade", + 0x8006 => "width", + 0x8007 => "size", + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x938b => "color", + } +ATTRS51 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS52 = { + 0x8001 => "name", + 0x8006 => "width", + 0x8007 => "height", + 0x804a => "align", + 0x8bb9 => "src", + 0x8bbb => "border", + 0x8bbc => "frameborder", + 0x8bbd => "framespacing", + 0x8bbe => "marginwidth", + 0x8bbf => "marginheight", + 0x8bc0 => "noresize", + 0x8bc1 => "scrolling", + 0x8fa2 => "vspace", + 0x8fa3 => "hspace", + } +ATTRS53 = { + 0x03eb => "alt", + 0x03ec => "src", + 0x03ed => "border", + 0x03ee => "vspace", + 0x03ef => "hspace", + 0x03f0 => "lowsrc", + 0x03f1 => "vrml", + 0x03f2 => "dynsrc", + 0x03f4 => "loop", + 0x03f6 => "start", + 0x07d3 => "ismap", + 0x07d9 => "usemap", + 0x8001 => "name", + 0x8006 => "width", + 0x8007 => "height", + 0x8046 => "title", + 0x804a => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x978d => "onabort", + 0x978e => "onerror", + 0x9791 => "onload", + } +ATTRS54 = { + 0x07d1 => "type", + 0x07d3 => "size", + 0x07d4 => "maxlength", + 0x07d6 => "readonly", + 0x07d8 => "indeterminate", + 0x07da => "checked", + 0x07db => "alt", + 0x07dc => "src", + 0x07dd => "border", + 0x07de => "vspace", + 0x07df => "hspace", + 0x07e0 => "lowsrc", + 0x07e1 => "vrml", + 0x07e2 => "dynsrc", + 0x07e4 => "loop", + 0x07e5 => "start", + 0x8001 => "name", + 0x8006 => "width", + 0x8007 => "height", + 0x804a => "align", + 0x93ee => "value", + 0x977b => "onselect", + 0x978d => "onabort", + 0x978e => "onerror", + 0x978f => "onchange", + 0x9791 => "onload", + } +ATTRS56 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS57 = { + 0x03e9 => "for", + } +ATTRS58 = { + 0x804a => "align", + } +ATTRS59 = { + 0x03ea => "value", + 0x8046 => "title", + 0x804b => "style", + 
0x83ea => "class", + 0x83eb => "id", + 0x939a => "type", + } +ATTRS60 = { + 0x03ee => "href", + 0x03ef => "rel", + 0x03f0 => "rev", + 0x03f1 => "type", + 0x03f9 => "media", + 0x03fa => "target", + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x978e => "onerror", + 0x9791 => "onload", + } +ATTRS61 = { + 0x9399 => "clear", + } +ATTRS62 = { + 0x8001 => "name", + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS63 = { + 0x1771 => "scrolldelay", + 0x1772 => "direction", + 0x1773 => "behavior", + 0x1774 => "scrollamount", + 0x1775 => "loop", + 0x1776 => "vspace", + 0x1777 => "hspace", + 0x1778 => "truespeed", + 0x8006 => "width", + 0x8007 => "height", + 0x9785 => "onbounce", + 0x978b => "onfinish", + 0x978c => "onstart", + 0xfe0c => "bgcolor", + } +ATTRS65 = { + 0x03ea => "http-equiv", + 0x03eb => "content", + 0x03ec => "url", + 0x03f6 => "charset", + 0x8001 => "name", + } +ATTRS66 = { + 0x03f5 => "n", + } +ATTRS71 = { + 0x8000 => "border", + 0x8000 => "usemap", + 0x8001 => "name", + 0x8006 => "width", + 0x8007 => "height", + 0x8046 => "title", + 0x804a => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x8bbb => "classid", + 0x8bbc => "data", + 0x8bbf => "codebase", + 0x8bc0 => "codetype", + 0x8bc1 => "code", + 0x8bc2 => "type", + 0x8bc5 => "vspace", + 0x8bc6 => "hspace", + 0x978e => "onerror", + } +ATTRS72 = { + 0x03eb => "compact", + 0x03ec => "start", + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x939a => "type", + } +ATTRS73 = { + 0x03ea => "selected", + 0x03eb => "value", + } +ATTRS74 = { + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => "clear", + } +ATTRS75 = { + 0x8000 => "name", + 0x8000 => "value", + 0x8000 => "type", + } +ATTRS76 = { + 0x9399 => "clear", + } +ATTRS77 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x9399 => 
"clear", + } +ATTRS78 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS82 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS83 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS84 = { + 0x03ea => "src", + 0x03ed => "for", + 0x03ee => "event", + 0x03f0 => "defer", + 0x03f2 => "type", + 0x978e => "onerror", + } +ATTRS85 = { + 0x03eb => "size", + 0x03ec => "multiple", + 0x8000 => "align", + 0x8001 => "name", + 0x978f => "onchange", + } +ATTRS86 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS87 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS88 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS89 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS90 = { + 0x03eb => "type", + 0x03ef => "media", + 0x8046 => "title", + 0x978e => "onerror", + 0x9791 => "onload", + } +ATTRS91 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS92 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS93 = { + 0x03ea => "cols", + 0x03eb => "border", + 0x03ec => "rules", + 0x03ed => "frame", + 0x03ee => "cellspacing", + 0x03ef => "cellpadding", + 0x03fa => "datapagesize", + 0x8006 => "width", + 0x8007 => "height", + 0x8046 => "title", + 0x804a => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x938a => "background", + 0x93a5 => "bordercolor", + 0x93a6 => "bordercolorlight", + 0x93a7 => "bordercolordark", + 0xfe0c => "bgcolor", + } +ATTRS94 = { + 0x8049 => "align", + 0x93a8 => "valign", + 0xfe0c => "bgcolor", + } +ATTRS95 = { + 0x8049 => "align", + 0x93a8 => "valign", + } +ATTRS96 = { + 0x07d2 => "rowspan", + 0x07d3 => "colspan", + 0x8006 => "width", + 0x8007 => "height", + 
0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x938a => "background", + 0x938e => "nowrap", + 0x93a5 => "bordercolor", + 0x93a6 => "bordercolorlight", + 0x93a7 => "bordercolordark", + 0x93a8 => "valign", + 0xfe0c => "bgcolor", + } +ATTRS97 = { + 0x1b5a => "rows", + 0x1b5b => "cols", + 0x1b5c => "wrap", + 0x1b5d => "readonly", + 0x8001 => "name", + 0x977b => "onselect", + 0x978f => "onchange", + } +ATTRS98 = { + 0x8049 => "align", + 0x93a8 => "valign", + 0xfe0c => "bgcolor", + } +ATTRS99 = { + 0x07d2 => "rowspan", + 0x07d3 => "colspan", + 0x8006 => "width", + 0x8007 => "height", + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x938a => "background", + 0x938e => "nowrap", + 0x93a5 => "bordercolor", + 0x93a6 => "bordercolorlight", + 0x93a7 => "bordercolordark", + 0x93a8 => "valign", + 0xfe0c => "bgcolor", + } +ATTRS100 = { + 0x8049 => "align", + 0x93a8 => "valign", + 0xfe0c => "bgcolor", + } +ATTRS102 = { + 0x8007 => "height", + 0x8046 => "title", + 0x8049 => "align", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x93a5 => "bordercolor", + 0x93a6 => "bordercolorlight", + 0x93a7 => "bordercolordark", + 0x93a8 => "valign", + 0xfe0c => "bgcolor", + } +ATTRS103 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS104 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS105 = { + 0x03eb => "compact", + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + 0x939a => "type", + } +ATTRS106 = { + 0x8046 => "title", + 0x804b => "style", + 0x83ea => "class", + 0x83eb => "id", + } +ATTRS108 = { + 0x9399 => "clear", + } + +TAGS = [ + None, + None, + None, + "a", + "acronym", + "address", + "applet", + "area", + "b", + "base", + "basefont", + "bdo", + "bgsound", + "big", + "blink", + "blockquote", + "body", + "br", + "button", + "caption", + 
"center", + "cite", + "code", + "col", + "colgroup", + None, + None, + "dd", + "del", + "dfn", + "dir", + "div", + "dl", + "dt", + "em", + "embed", + "fieldset", + "font", + "form", + "frame", + "frameset", + None, + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "hr", + "html", + "i", + "iframe", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "link", + "tag61", + "map", + "tag63", + "tag64", + "meta", + "nextid", + "nobr", + "noembed", + "noframes", + "noscript", + "object", + "ol", + "option", + "p", + "param", + "plaintext", + "pre", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "select", + "small", + "span", + "strike", + "strong", + "style", + "sub", + "sup", + "table", + "tbody", + "tc", + "td", + "textarea", + "tfoot", + "th", + "thead", + "title", + "tr", + "tt", + "u", + "ul", + "var", + "wbr", + None, + ] + +TAGS_ATTRS = [ + None, + None, + None, + ATTRS3, # a + None, # acronym + ATTRS5, # address + ATTRS6, # applet + ATTRS7, # area + ATTRS8, # b + ATTRS9, # base + ATTRS10, # basefont + None, # bdo + ATTRS12, # bgsound + ATTRS13, # big + None, # blink + ATTRS15, # blockquote + ATTRS16, # body + ATTRS17, # br + ATTRS18, # button + ATTRS19, # caption + ATTRS20, # center + ATTRS21, # cite + ATTRS22, # code + ATTRS23, # col + ATTRS24, # colgroup + None, + None, + ATTRS27, # dd + None, # del + ATTRS29, # dfn + None, # dir + ATTRS31, # div + ATTRS32, # dl + ATTRS33, # dt + ATTRS34, # em + ATTRS35, # embed + ATTRS36, # fieldset + ATTRS37, # font + ATTRS38, # form + ATTRS39, # frame + ATTRS40, # frameset + None, + ATTRS42, # h1 + ATTRS43, # h2 + ATTRS44, # h3 + ATTRS45, # h4 + ATTRS46, # h5 + ATTRS47, # h6 + None, # head + ATTRS49, # hr + None, # html + ATTRS51, # i + ATTRS52, # iframe + ATTRS53, # img + ATTRS54, # input + None, # ins + ATTRS56, # kbd + ATTRS57, # label + ATTRS58, # legend + ATTRS59, # li + ATTRS60, # link + ATTRS61, # tag61 + ATTRS62, # map + ATTRS63, # tag63 + None, # tag64 + ATTRS65, # meta 
+ ATTRS66, # nextid + None, # nobr + None, # noembed + None, # noframes + None, # noscript + ATTRS71, # object + ATTRS72, # ol + ATTRS73, # option + ATTRS74, # p + ATTRS75, # param + ATTRS76, # plaintext + ATTRS77, # pre + ATTRS78, # q + None, # rp + None, # rt + None, # ruby + ATTRS82, # s + ATTRS83, # samp + ATTRS84, # script + ATTRS85, # select + ATTRS86, # small + ATTRS87, # span + ATTRS88, # strike + ATTRS89, # strong + ATTRS90, # style + ATTRS91, # sub + ATTRS92, # sup + ATTRS93, # table + ATTRS94, # tbody + ATTRS95, # tc + ATTRS96, # td + ATTRS97, # textarea + ATTRS98, # tfoot + ATTRS99, # th + ATTRS100, # thead + None, # title + ATTRS102, # tr + ATTRS103, # tt + ATTRS104, # u + ATTRS105, # ul + ATTRS106, # var + None, # wbr + None, + ] + +MAP = (TAGS, TAGS_ATTRS, ATTRS0) diff --git a/src/calibre/ebooks/lit/maps/opf.py b/src/calibre/ebooks/lit/maps/opf.py new file mode 100644 index 0000000000..a39e6bf8e8 --- /dev/null +++ b/src/calibre/ebooks/lit/maps/opf.py @@ -0,0 +1,74 @@ +ATTRS = { + 0x0001 => "href", + 0x0002 => "%never-used", + 0x0003 => "%guid", + 0x0004 => "%minimum_level", + 0x0005 => "%attr5", + 0x0006 => "id", + 0x0007 => "href", + 0x0008 => "media-type", + 0x0009 => "fallback", + 0x000A => "idref", + 0x000B => "xmlns:dc", + 0x000C => "xmlns:oebpackage", + 0x000D => "role", + 0x000E => "file-as", + 0x000F => "event", + 0x0010 => "scheme", + 0x0011 => "title", + 0x0012 => "type", + 0x0013 => "unique-identifier", + 0x0014 => "name", + 0x0015 => "content", + 0x0016 => "xml:lang", + } + +TAGS = [ + None, + "package", + "dc:Title", + "dc:Creator", + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + "manifest", + "item", + "spine", + "itemref", + "metadata", + "dc-metadata", + "dc:Subject", + "dc:Description", + "dc:Publisher", + "dc:Contributor", + "dc:Date", + "dc:Type", + "dc:Format", + "dc:Identifier", + "dc:Source", + "dc:Language", + "dc:Relation", + "dc:Coverage", + "dc:Rights", + "x-metadata", + 
"meta", + "tours", + "tour", + "site", + "guide", + "reference", + None, + ] + +TAGS_ATTR = [{} for i in xrange(43)] + +MAP = (TAGS, TAGS_ATTRS, ATTRS0) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py new file mode 100644 index 0000000000..1a0f42f8db --- /dev/null +++ b/src/calibre/ebooks/lit/reader.py @@ -0,0 +1,664 @@ +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal ' +''' +Support for reading the metadata from a lit file. +''' + +import sys, struct, cStringIO, os +from itertools import repeat + +from calibre import relpath +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.metadata.opf import OPFReader +from calibre.ebooks.lit import LitError +from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP + +def u32(bytes): + return struct.unpack(' 0: + b = ord(bytes[pos]) + pos += 1 + remaining -= 1 + val <<= 7 + val |= (b & 0x7f) + if b & 0x80 == 0: break + return val, bytes[pos:], remaining + +def read_utf8_char(bytes, pos): + c = ord(bytes[pos]) + mask = 0x80 + if (c & mask): + elsize = 0 + while c & mask: + mask >>= 1 + elsize += 1 + if (mask <= 1) or (mask == 0x40): + raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos])) + else: + elsize = 1 + if elsize > 1: + if elsize + pos > len(bytes): + raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos])) + c &= (mask - 1) + for i in range(1, elsize): + b = ord(bytes[pos+i]) + if (b & 0xC0) != 0x80: + raise LitError( + 'Invalid UTF8 character: %s' % repr(bytes[pos:pos+i])) + c = (c << 6) | (b & 0x3F) + return unichr(c), pos+elsize + +FLAG_OPENING = 1 +FLAG_CLOSING = 2 +FLAG_BLOCK = 4 +FLAG_HEAD = 8 +FLAG_ATOM = 16 +XML_ENTITIES = ['&', ''', '<', '>', '"'] + +class UnBinary(object): + def __init__(self, bin, manifest, map=OPF_MAP): + self.manifest = manifest + self.attr_map, self.tag_map, self.tag_to_attr_map = map + self.opf = map is OPF_MAP + self.bin = bin + self.buf = cStringIO.StringIO() + self.ampersands = [] + self.binary_to_text() + 
self.raw = self.buf.getvalue().lstrip().decode('utf-8') + self.escape_ampersands() + + def escape_ampersands(self): + offset = 0 + for pos in self.ampersands: + test = self.raw[pos+offset:pos+offset+6] + if test.startswith('&#') and ';' in test: + continue + escape = True + for ent in XML_ENTITIES: + if test.startswith(ent): + escape = False + break + if not escape: + continue + self.raw = self.raw[:pos+offset] + '&' + self.raw[pos+offset+1:] + offset += 4 + + def item_path(self, internal_id): + for i in self.manifest: + if i == internal_id: + return i.path + raise LitError('Could not find item %s'%(internal_id,)) + + def __unicode__(self): + return self.raw + + def binary_to_text(self, base=0, depth=0): + tag_name = current_map = None + dynamic_tag = errors = 0 + in_censorship = False + state = 'text' + index = base + flags = 0 + + while index < len(self.bin): + c, index = read_utf8_char(self.bin, index) + oc = ord(c) + + if state == 'text': + if oc == 0: + state = 'get flags' + continue + elif c == '\v': + c = '\n' + elif c == '&': + self.ampersands.append(self.buf.tell()-1) + self.buf.write(c.encode('utf-8')) + + elif state == 'get flags': + if oc == 0: + state = 'text' + continue + flags = oc + state = 'get tag' + + elif state == 'get tag': + state = 'text' if oc == 0 else 'get attr' + if flags & FLAG_OPENING: + tag = oc + self.buf.write('<') + if not (flags & FLAG_CLOSING): + is_goingdown = 1 + if tag == 0x8000: + state = 'get custom length' + continue + if flags & FLAG_ATOM: + raise LitError('TODO: Atoms not yet implemented') + elif tag < len(self.tag_map): + tag_name = self.tag_map[tag] + current_map = self.tag_to_attr_map[tag] + else: + dynamic_tag += 1 + errors += 1 + tag_name = '?'+unichr(tag)+'?' 
+ current_map = self.tag_to_attr_map[tag] + print 'WARNING: tag %s unknown' % unichr(tag) + self.buf.write(unicode(tag_name).encode('utf-8')) + elif flags & FLAG_CLOSING: + if depth == 0: + raise LitError('Extra closing tag') + return index + + elif state == 'get attr': + in_censorship = False + if oc == 0: + if not is_goingdown: + tag_name = None + dynamic_tag = 0 + self.buf.write(' />') + else: + self.buf.write('>') + index = self.binary_to_text(base=index, depth=depth+1) + is_goingdown = 0 + if not tag_name: + raise LitError('Tag ends before it begins.') + self.buf.write('') + dynamic_tag = 0 + tag_name = None + state = 'text' + else: + if oc == 0x8000: + state = 'get attr length' + continue + attr = None + if oc in current_map and current_map[oc]: + attr = current_map[oc] + elif oc in self.attr_map: + attr = self.attr_map[oc] + if not attr or not isinstance(attr, basestring): + raise LitError( + 'Unknown attribute %d in tag %s' % (oc, tag_name)) + if attr.startswith('%'): + in_censorship = True + state = 'get value length' + continue + self.buf.write(' ' + unicode(attr).encode('utf-8') + '=') + if attr in ['href', 'src']: + state = 'get href length' + else: + state = 'get value length' + + elif state == 'get value length': + if not in_censorship: + self.buf.write('"') + count = oc - 1 + if count == 0: + if not in_censorship: + self.buf.write('"') + in_censorship = False + state = 'get attr' + continue + state = 'get value' + if oc == 0xffff: + continue + if count < 0 or count > (len(self.bin) - index): + raise LitError('Invalid character count %d' % count) + + elif state == 'get value': + if count == 0xfffe: + if not in_censorship: + self.buf.write('%s"' % (oc - 1)) + in_censorship = False + state = 'get attr' + elif count > 0: + if not in_censorship: + self.buf.write(c) + count -= 1 + elif count == 0: + if not in_censorship: + self.buf.write('"') + in_censorship = False + state = 'get attr' + + elif state == 'get custom length': + count = oc - 1 + if count <= 
0 or count > len(self.bin)-index: + raise LitError('Invalid character count %d' % count) + dynamic_tag += 1 + state = 'get custom' + tag_name = '' + + elif state == 'get custom': + tag_name += c + count -= 1 + if count == 0: + self.buf.write(tag_name) + state = 'get attr' + + elif state == 'get attr length': + count = oc - 1 + if count <= 0 or count > (len(self.bin) - index): + raise LitError('Invalid character count %d' % count) + self.buf.write(' ') + state = 'get custom attr' + + elif state == 'get custom attr': + self.buf.write(c) + count -= 1 + if count == 0: + self.buf.write('=') + state = 'get value length' + + elif state == 'get href length': + count = oc - 1 + if count <= 0 or count > (len(self.bin) - index): + raise LitError('Invalid character count %d' % count) + href = '' + state = 'get href' + + elif state == 'get href': + href += c + count -= 1 + if count == 0: + doc, m, frag = href.partition('#') + path = self.item_path(doc) + if m and frag: + path += m + frag + self.buf.write((u'"%s"' % path).encode('utf-8')) + state = 'get attr' + return index + +class ManifestItem(object): + def __init__(self, original, internal, mime_type, offset, root, state): + self.original = original + self.internal = internal + self.mime_type = mime_type + self.offset = offset + self.root = root + self.state = state + self.prefix = 'images' \ + if state == 'images' else 'css' if state == 'css' else '' + self.prefix = self.prefix + os.sep if self.prefix else '' + self.path = self.prefix + self.original + + def __eq__(self, other): + if hasattr(other, 'internal'): + return self.internal == other.internal + return self.internal == other + + def __repr__(self): + return self.internal + u'->' + self.path + +class LitFile(object): + PIECE_SIZE = 16 + + def magic(): + def fget(self): + val = None + opos = self._stream.tell() + try: + self._stream.seek(0) + val = self._stream.read(8) + finally: + self._stream.seek(opos) + return val + return property(fget=fget) + magic = magic() + + 
# Members of class LitFile (the class header precedes this span).

def _read_at(self, offset, size):
    """Return ``size`` bytes read from absolute stream position ``offset``.

    The current stream position is saved first and restored afterwards,
    even if the seek or read fails.
    """
    opos = self._stream.tell()
    try:
        self._stream.seek(offset)
        return self._stream.read(size)
    finally:
        self._stream.seek(opos)

@property
def version(self):
    """Value decoded by ``u32`` from the 4 bytes at file offset 8."""
    return u32(self._read_at(8, 4))

@property
def hdr_len(self):
    """Value decoded by ``int32`` from the 4 bytes at file offset 12.

    The piece table and the secondary header are located relative to it.
    """
    return int32(self._read_at(12, 4))

@property
def num_pieces(self):
    """Value decoded by ``int32`` from the 4 bytes at file offset 16.

    Used as the number of ``PIECE_SIZE``-byte entries in the piece table.
    """
    return int32(self._read_at(16, 4))

@property
def sec_hdr_len(self):
    """Value decoded by ``int32`` from the 4 bytes at file offset 20.

    Used as the byte length of the secondary header.
    """
    return int32(self._read_at(20, 4))

@property
def guid(self):
    """The 16 raw bytes stored at file offset 24."""
    return self._read_at(24, 16)

@property
def header(self):
    """Raw bytes from the start of the file through the secondary header.

    The length read is ``hdr_len + num_pieces * PIECE_SIZE + sec_hdr_len``.
    """
    size = self.hdr_len \
        + (self.num_pieces * self.PIECE_SIZE) \
        + self.sec_hdr_len
    return self._read_at(0, size)

def __init__(self, stream):
    """Check the signature and version of ``stream``, then parse headers.

    Raises LitError when the first 8 bytes are not ``'ITOLITLS'`` or when
    the version field is not 1; errors from ``read_secondary_header`` and
    ``read_header_pieces`` propagate unchanged.
    """
    self._stream = stream
    if self.magic != 'ITOLITLS':
        raise LitError('Not a valid LIT file')
    if self.version != 1:
        raise LitError('Unknown LIT version %d'%(self.version,))
    self.read_secondary_header()
    self.read_header_pieces()

def read_secondary_header(self):
    """Parse the secondary header and record the fields it carries.

    The secondary header is the ``sec_hdr_len`` bytes that follow the
    piece table.  Scanning starts at the offset stored 4 bytes into it and
    advances 48 bytes past every recognised block:

    * ``'CAOL'`` (format 2) sets ``creator_id``, ``entry_chunklen``,
      ``count_chunklen``, ``entry_unknown`` and ``count_unknown``.
    * ``'ITSF'`` (format 4) sets ``content_offset``, ``timestamp`` and
      ``language_id``.

    Raises LitError for an unsupported block format, for a non-zero upper
    half of the content offset, or when no block supplied
    ``content_offset``.  The stream position is left unchanged.
    """
    start = self.hdr_len + self.num_pieces*self.PIECE_SIZE
    raw = self._read_at(start, self.sec_hdr_len)
    offset = int32(raw[4:])
    while offset < len(raw):
        blocktype = raw[offset:offset+4]
        blockver = u32(raw[offset+4:])
        if blocktype == 'CAOL':
            if blockver != 2:
                raise LitError(
                    'Unknown CAOL block format %d' % blockver)
            self.creator_id = u32(raw[offset+12:])
            self.entry_chunklen = u32(raw[offset+20:])
            self.count_chunklen = u32(raw[offset+24:])
            self.entry_unknown = u32(raw[offset+28:])
            self.count_unknown = u32(raw[offset+32:])
            offset += 48
        elif blocktype == 'ITSF':
            if blockver != 4:
                raise LitError(
                    'Unknown ITSF block format %d' % blockver)
            # The 4 bytes after the content offset must be zero,
            # otherwise the offset does not fit in 32 bits.
            if u32(raw[offset+4+16:]):
                raise LitError('This file has a 64bit content offset')
            self.content_offset = u32(raw[offset+16:])
            self.timestamp = u32(raw[offset+24:])
            self.language_id = u32(raw[offset+28:])
            offset += 48
        else:
            # The size of an unrecognised block is not known, so the scan
            # cannot step over it.  Stop here instead of spinning forever
            # on the same offset; the check below reports the failure if
            # the content offset has not been seen yet.
            break
    if not hasattr(self, 'content_offset'):
        raise LitError('Could not figure out the content offset')
# Member of class LitFile (the class header precedes this span).

def read_header_pieces(self):
    """Load every header piece named in the piece table and act on it.

    The piece table is the part of ``self.header`` that starts at
    ``hdr_len``; it holds ``num_pieces`` entries of ``PIECE_SIZE`` bytes.
    For each entry the piece data is read from the stream at the offset
    and size the entry gives, then handled by position:

    * piece 1 is checked against ``entry_chunklen`` / ``entry_unknown``
      and passed to ``read_directory``;
    * piece 2 is checked against ``count_chunklen`` / ``count_unknown``;
    * pieces 3 and 4 are stored as ``piece3_guid`` and ``piece4_guid``;
    * piece 0 and any later piece are read but otherwise ignored.

    Raises LitError when a table entry has a non-zero value at its bytes
    4 or 12 (as decoded by ``u32``), or when piece 1 or 2 disagrees with
    the secondary header.  The stream position is restored on exit.
    """
    saved_pos = self._stream.tell()
    try:
        table = self.header[self.hdr_len:]
        for index in range(self.num_pieces):
            start = index * self.PIECE_SIZE
            descriptor = table[start:(index + 1) * self.PIECE_SIZE]
            if u32(descriptor[4:]) != 0 or u32(descriptor[12:]) != 0:
                raise LitError('Piece %s has 64bit value' % repr(descriptor))
            offset = u32(descriptor)
            size = int32(descriptor[8:])
            self._stream.seek(offset)
            data = self._stream.read(size)
            if index == 1:
                if u32(data[8:]) != self.entry_chunklen or \
                   u32(data[12:]) != self.entry_unknown:
                    raise LitError('Secondary header does not match piece')
                self.read_directory(data)
            elif index == 2:
                # Only validated; no data is needed from this piece.
                if u32(data[8:]) != self.count_chunklen or \
                   u32(data[12:]) != self.count_unknown:
                    raise LitError('Secondary header does not match piece')
            elif index == 3:
                self.piece3_guid = data
            elif index == 4:
                self.piece4_guid = data
    finally:
        self._stream.seek(saved_pos)
# Members of class LitFile (the class header precedes this span).

def read_directory(self, piece):
    """Parse the main directory piece and fill ``self.entries``.

    ``piece`` must start with ``'IFCM'`` and be exactly 32 bytes plus
    ``num_chunks`` chunks of ``chunk_size`` bytes (both read from the
    piece).  Chunks tagged ``'AOLL'`` are scanned for entries; any other
    chunk is skipped.  Every entry becomes a
    ``DirectoryEntry(name, section, offset, size)`` appended to
    ``self.entries``; the ``'::DataSpace/NameList'``, ``'/manifest'`` and
    ``'/meta'`` entries are also handed to ``read_section_names``,
    ``read_manifest`` and ``read_meta`` as they are met.

    Raises LitError when the piece is malformed, or when no NameList or
    no manifest entry was found.
    """
    self.entries = []
    if not piece.startswith('IFCM'):
        raise LitError('Header piece #1 is not main directory.')
    chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])

    if (32 + chunk_size * num_chunks) != len(piece):
        raise LitError('IFCM HEADER has incorrect length')

    for chunk in range(num_chunks):
        p = 32 + chunk * chunk_size
        if piece[p:p+4] != 'AOLL':
            continue
        remaining = chunk_size - int32(piece[p+4:p+8]) - 48
        if remaining < 0:
            raise LitError('AOLL remaining count is negative')

        # The entry count is taken from the last two bytes of the chunk.
        entries = u16(piece[p+chunk_size-2:])
        if entries <= 0:
            # Hopefully everything will work even without a correct entries
            # count
            entries = (2 ** 16) - 1

        # Consume entries through a separate cursor.  ``piece`` itself must
        # stay intact: ``p`` for the following chunks is an offset into the
        # whole piece, not into what is left of this chunk.
        data = piece[p+48:]
        i = 0
        while i < entries:
            if remaining <= 0:
                break
            namelen, data, remaining = encint(data, remaining)
            if namelen != (namelen & 0x7fffffff):
                raise LitError('Directory entry had 64bit name length.')
            if namelen > remaining - 3:
                raise LitError('Read past end of directory chunk')
            name = data[:namelen]
            data = data[namelen:]
            section, data, remaining = encint(data, remaining)
            offset, data, remaining = encint(data, remaining)
            size, data, remaining = encint(data, remaining)

            entry = DirectoryEntry(name, section, offset, size)

            if name == '::DataSpace/NameList':
                self.read_section_names(entry)
            elif name == '/manifest':
                self.read_manifest(entry)
            elif name == '/meta':
                self.read_meta(entry)
            self.entries.append(entry)
            i += 1

    if not hasattr(self, 'sections'):
        raise LitError('Lit file does not have a valid NameList')

    if not hasattr(self, 'manifest'):
        raise LitError('Lit file does not have a valid manifest')

def read_section_names(self, entry):
    """Read the section name list that ``entry`` points at.

    Sets ``self.num_sections`` (decoded by ``u16`` from bytes 2-3 of the
    data) and ``self.sections``, a dict mapping each section index to its
    name decoded as UTF-16-LE.

    Raises LitError when the data is shorter than 4 bytes or a name runs
    past its end.  The stream position is restored on exit.
    """
    opos = self._stream.tell()
    try:
        self._stream.seek(self.content_offset + entry.offset)
        raw = self._stream.read(entry.size)
        if len(raw) < 4:
            raise LitError('Invalid Namelist section')
        pos = 4
        self.num_sections = u16(raw[2:pos])

        self.sections = {}
        for section in range(self.num_sections):
            size = u16(raw[pos:pos+2])
            pos += 2
            # The stored count is in 2-byte units; one extra unit follows
            # the counted characters (presumably the NUL terminator).
            size = size*2 + 2
            if pos + size > len(raw):
                raise LitError('Invalid Namelist section')
            name = raw[pos:pos+size].decode('utf-16-le')
            # Drop that extra unit when it decodes to NUL so the stored
            # name holds only the counted characters.
            self.sections[section] = name.rstrip(u'\x00')
            pos += size
    finally:
        self._stream.seek(opos)
# Members of class LitFile (the class header precedes this span).

def read_manifest(self, entry):
    """Parse the manifest that ``entry`` points at into ``self.manifest``.

    The data is a sequence of groups.  Each group starts with a
    length-prefixed UTF-8 root name (a zero length ends the manifest)
    followed by four file lists, one per state in the fixed order
    ``'spine'``, ``'not spine'``, ``'css'``, ``'images'``.  Every list is
    a count decoded by ``int32`` followed by that many records; a record
    is an offset decoded by ``u32`` and three length-prefixed UTF-8
    strings (internal name, original name, MIME type), the last followed
    by one extra byte that is skipped.  Each record is appended to
    ``self.manifest`` as a ``ManifestItem``.

    Raises LitError when the data ends early.  The stream position is
    restored on exit.
    """
    opos = self._stream.tell()
    try:
        self.manifest = []
        self._stream.seek(self.content_offset + entry.offset)
        raw = self._stream.read(entry.size)

        def sized_string(at):
            """Decode the one-byte-length-prefixed UTF-8 string at ``at``.

            Returns the string and the position just past it.
            """
            length = ord(raw[at])
            at += 1
            return raw[at:at+length].decode('utf8'), at + length

        pos = 0
        while pos < len(raw):
            size = ord(raw[pos])
            if size == 0:
                break
            pos += 1
            root = raw[pos:pos+size].decode('utf8')
            pos += size
            if pos >= len(raw):
                raise LitError('Truncated manifest.')
            for state in ['spine', 'not spine', 'css', 'images']:
                num_files = int32(raw[pos:pos+4])
                pos += 4
                if num_files == 0:
                    continue

                i = 0
                while i < num_files:
                    if pos+5 >= len(raw):
                        raise LitError('Truncated manifest.')
                    offset = u32(raw[pos:pos+4])
                    pos += 4

                    internal, pos = sized_string(pos)
                    original, pos = sized_string(pos)
                    mime_type, pos = sized_string(pos)
                    # One extra byte follows the MIME type string.
                    pos += 1

                    self.manifest.append(
                        ManifestItem(original, internal, mime_type,
                                     offset, root, state))
                    i += 1
    finally:
        self._stream.seek(opos)

def read_meta(self, entry):
    """Decode the metadata that ``entry`` points at into ``self.meta``.

    The raw data is converted with ``UnBinary(raw, self.manifest)`` and the
    resulting text is stored, prefixed by a fixed prologue string, as
    ``self.meta``.  ``self.manifest`` must already be set.  The stream
    position is restored on exit.
    """
    opos = self._stream.tell()
    try:
        self._stream.seek(self.content_offset + entry.offset)
        raw = self._stream.read(entry.size)

        # NOTE(review): the prologue literal below is reproduced exactly as
        # it appears in the text under review (two empty lines).  It
        # presumably held markup that was lost before review -- confirm
        # against the real file and keep the file's own literal.
        xml = \
'''\


'''+\
            unicode(UnBinary(raw, self.manifest))
        self.meta = xml
    finally:
        self._stream.seek(opos)

def read_image(self, internal_name):
    """Return the raw data of the directory entry matching ``internal_name``.

    The first entry in ``self.entries`` whose ``name`` contains
    ``internal_name`` is used.  Returns ``None`` when no entry matches,
    without touching the stream.  The stream position is restored after
    a successful read.
    """
    cover_entry = None
    for entry in self.entries:
        if internal_name in entry.name:
            cover_entry = entry
            break
    if cover_entry is None:
        # Nothing to read; callers treat a false value as "no image".
        return None
    opos = self._stream.tell()
    try:
        self._stream.seek(self.content_offset + cover_entry.offset)
        return self._stream.read(cover_entry.size)
    finally:
        self._stream.seek(opos)
finally: + self._stream.seek(opos) + +def get_metadata(stream): + try: + litfile = LitFile(stream) + src = litfile.meta.encode('utf-8') + mi = OPFReader(cStringIO.StringIO(src), dir=os.getcwd()) + cover_url, cover_item = mi.cover, None + if cover_url: + cover_url = relpath(cover_url, os.getcwd()) + for item in litfile.manifest: + if item.path == cover_url: + cover_item = item.internal + if cover_item is not None: + ext = cover_url.rpartition('.')[-1] + if not ext: + ext = 'jpg' + else: + ext = ext.lower() + cd = litfile.read_image(cover_item) + mi.cover_data = (ext, cd) if cd else (None, None) + except: + title = stream.name if hasattr(stream, 'name') and stream.name else 'Unknown' + mi = MetaInformation(title, ['Unknown']) + return mi + +def main(args=sys.argv): + if len(args) != 2: + print >>sys.stderr, _('Usage: %s file.lit')%(args[0],) + return 1 + mi = get_metadata(open(args[1], 'rb')) + print unicode(mi) + if mi.cover_data[1]: + cover = os.path.abspath(os.path.splitext(os.path.basename(args[1]))[0] + '.' + mi.cover_data[0]) + open(cover, 'wb').write(mi.cover_data[1]) + print _('Cover saved to'), cover + return 0 + +if __name__ == '__main__': + sys.exit(main())