Add option to pretty print HTML in lit2oeb

This commit is contained in:
Kovid Goyal 2008-08-13 19:13:09 -07:00
commit fab60df71d

View File

@ -10,6 +10,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
import sys, struct, cStringIO, os import sys, struct, cStringIO, os
import functools import functools
import re import re
from lxml import etree
from calibre.ebooks.lit import LitError from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1 import calibre.ebooks.lit.mssha1 as mssha1
@ -17,6 +18,8 @@ from calibre import plugins
lzx, lxzerror = plugins['lzx'] lzx, lxzerror = plugins['lzx']
msdes, msdeserror = plugins['msdes'] msdes, msdeserror = plugins['msdes']
XML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
"""
OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?> OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE package <!DOCTYPE package
PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN" PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN"
@ -367,6 +370,8 @@ def preserve(function):
class LitReader(object): class LitReader(object):
PIECE_SIZE = 16 PIECE_SIZE = 16
XML_PARSER = etree.XMLParser(
remove_blank_text=True, resolve_entities=False)
def magic(): def magic():
@preserve @preserve
@ -609,6 +614,12 @@ class LitReader(object):
if item.path[0] == '/': if item.path[0] == '/':
item.path = os.path.basename(item.path) item.path = os.path.basename(item.path)
def _pretty_print(self, xml):
f = cStringIO.StringIO(xml.encode('utf-8'))
doc = etree.parse(f, parser=self.XML_PARSER)
pretty = etree.tostring(doc, encoding='ascii', pretty_print=True)
return XML_DECL + unicode(pretty)
def _read_meta(self): def _read_meta(self):
path = 'content.opf' path = 'content.opf'
raw = self.get_file('/meta') raw = self.get_file('/meta')
@ -755,7 +766,7 @@ class LitReader(object):
raise LitError("Failed to completely decompress section") raise LitError("Failed to completely decompress section")
return ''.join(result) return ''.join(result)
def get_entry_content(self, entry): def get_entry_content(self, entry, pretty_print=False):
if 'spine' in entry.state: if 'spine' in entry.state:
name = '/'.join(('/data', entry.internal, 'content')) name = '/'.join(('/data', entry.internal, 'content'))
path = entry.path path = entry.path
@ -763,13 +774,15 @@ class LitReader(object):
decl, map = (OPF_DECL, OPF_MAP) \ decl, map = (OPF_DECL, OPF_MAP) \
if name == '/meta' else (HTML_DECL, HTML_MAP) if name == '/meta' else (HTML_DECL, HTML_MAP)
content = decl + unicode(UnBinary(raw, path, self.manifest, map)) content = decl + unicode(UnBinary(raw, path, self.manifest, map))
if pretty_print:
content = self._pretty_print(content)
content = content.encode('utf-8') content = content.encode('utf-8')
else: else:
name = '/'.join(('/data', entry.internal)) name = '/'.join(('/data', entry.internal))
content = self.get_file(name) content = self.get_file(name)
return content return content
def extract_content(self, output_dir=os.getcwdu()): def extract_content(self, output_dir=os.getcwdu(), pretty_print=False):
output_dir = os.path.abspath(output_dir) output_dir = os.path.abspath(output_dir)
try: try:
opf_path = os.path.splitext( opf_path = os.path.splitext(
@ -779,12 +792,15 @@ class LitReader(object):
opf_path = os.path.join(output_dir, opf_path) opf_path = os.path.join(output_dir, opf_path)
self._ensure_dir(opf_path) self._ensure_dir(opf_path)
with open(opf_path, 'wb') as f: with open(opf_path, 'wb') as f:
f.write(self.meta.encode('utf-8')) xml = self.meta
if pretty_print:
xml = self._pretty_print(xml)
f.write(xml.encode('utf-8'))
for entry in self.manifest.values(): for entry in self.manifest.values():
path = os.path.join(output_dir, entry.path) path = os.path.join(output_dir, entry.path)
self._ensure_dir(path) self._ensure_dir(path)
with open(path, 'wb') as f: with open(path, 'wb') as f:
f.write(self.get_entry_content(entry)) f.write(self.get_entry_content(entry, pretty_print))
def _ensure_dir(self, path): def _ensure_dir(self, path):
dir = os.path.dirname(path) dir = os.path.dirname(path)
@ -797,6 +813,9 @@ def option_parser():
parser.add_option( parser.add_option(
'-o', '--output-dir', default='.', '-o', '--output-dir', default='.',
help=_('Output directory. Defaults to current directory.')) help=_('Output directory. Defaults to current directory.'))
parser.add_option(
'-p', '--pretty-print', default=False, action='store_true',
help=_('Legibly format extracted markup. May modify meaningful whitespace.'))
parser.add_option( parser.add_option(
'--verbose', default=False, action='store_true', '--verbose', default=False, action='store_true',
help=_('Useful for debugging.')) help=_('Useful for debugging.'))
@ -809,7 +828,7 @@ def main(args=sys.argv):
parser.print_help() parser.print_help()
return 1 return 1
lr = LitReader(args[1]) lr = LitReader(args[1])
lr.extract_content(opts.output_dir) lr.extract_content(opts.output_dir, opts.pretty_print)
print _('OEB ebook created in'), opts.output_dir print _('OEB ebook created in'), opts.output_dir
return 0 return 0