PML Output: Add .png to image names. Fix removing excessive newlines from PML output. PMLZ Output: Name images correctly.

This commit is contained in:
Kovid Goyal 2009-10-23 19:29:13 -06:00
parent 4b822e5700
commit 5770808fcf
3 changed files with 32 additions and 40 deletions

View File

@ -16,11 +16,11 @@ def image_name(name, taken_names=[]):
cut = len(name) - 32 cut = len(name) - 32
names = name[:10] names = name[:10]
namee = name[10+cut:] namee = name[10+cut:]
name = names + namee name = '%s%s.png' % (names, namee)
while name in taken_names: while name in taken_names:
for i in xrange(9999999999999999999999999999999): for i in xrange(999999999999999999999999999):
name = '%s%s' % (name[:-len('%s' % i)], i) name = '%s%s.png' % (name[:-len('%s' % i)], i)
name = name.ljust(32, '\x00')[:32] name = name.ljust(32, '\x00')[:32]

View File

@ -18,7 +18,7 @@ from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import OptionRecommendation from calibre.customize.conversion import OptionRecommendation
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.ebooks.pml.pmlml import PMLMLizer from calibre.ebooks.pml.pmlml import PMLMLizer
class PMLOutput(OutputFormatPlugin): class PMLOutput(OutputFormatPlugin):
@ -40,28 +40,26 @@ class PMLOutput(OutputFormatPlugin):
def convert(self, oeb_book, output_path, input_plugin, opts, log): def convert(self, oeb_book, output_path, input_plugin, opts, log):
with TemporaryDirectory('_pmlz_output') as tdir: with TemporaryDirectory('_pmlz_output') as tdir:
pmlmlizer = PMLMLizer(log) pmlmlizer = PMLMLizer(log)
content = pmlmlizer.extract_content(oeb_book, opts) pml = unicode(pmlmlizer.extract_content(oeb_book, opts))
with open(os.path.join(tdir, 'index.pml'), 'wb') as out: with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
out.write(content.encode(opts.output_encoding, 'replace')) out.write(pml.encode(opts.output_encoding, 'replace'))
self.write_images(oeb_book.manifest, tdir) self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, tdir)
log.debug('Compressing output...') log.debug('Compressing output...')
pmlz = ZipFile(output_path, 'w') pmlz = ZipFile(output_path, 'w')
pmlz.add_dir(tdir) pmlz.add_dir(tdir)
def write_images(self, manifest, out_dir): def write_images(self, manifest, image_hrefs, out_dir):
for item in manifest: for item in manifest:
if item.media_type in OEB_IMAGES: if item.media_type in OEB_RASTER_IMAGES and item.href in image_hrefs.keys():
im = Image.open(cStringIO.StringIO(item.data)) im = Image.open(cStringIO.StringIO(item.data))
data = cStringIO.StringIO() data = cStringIO.StringIO()
im.save(data, 'PNG') im.save(data, 'PNG')
data = data.getvalue() data = data.getvalue()
name = os.path.splitext(os.path.basename(item.href))[0] + '.png' path = os.path.join(out_dir, image_hrefs[item.href])
path = os.path.join(out_dir, name)
with open(path, 'wb') as out: with open(path, 'wb') as out:
out.write(data) out.write(data)

View File

@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into PML markup Transform OEB content into PML markup
''' '''
import os
import re import re
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
@ -138,16 +137,13 @@ class PMLMLizer(object):
aid = self.link_hrefs[aid] aid = self.link_hrefs[aid]
return u'\\Q="%s"' % aid return u'\\Q="%s"' % aid
def remove_newlines(self, text):
text = text.replace('\r\n', ' ')
text = text.replace('\n', ' ')
text = text.replace('\r', ' ')
return text
def clean_text(self, text): def clean_text(self, text):
# Remove excess spaces at beginning and end of lines
text = re.sub('(?m)^[ ]+', '', text)
text = re.sub('(?m)[ ]+$', '', text)
# Remove excessive newlines
text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
text = re.sub('[ ]{2,}', ' ', text)
# Remove excessive \p tags # Remove excessive \p tags
text = re.sub(r'\\p\s*\\p', '', text) text = re.sub(r'\\p\s*\\p', '', text)
@ -166,6 +162,17 @@ class PMLMLizer(object):
# Turn all unicode characters into their PML hex equivelent # Turn all unicode characters into their PML hex equivelent
text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text) text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text)
# Remove excess spaces at beginning and end of lines
text = re.sub('(?m)^[ ]+', '', text)
text = re.sub('(?m)[ ]+$', '', text)
# Remove excessive spaces
text = re.sub('[ ]{2,}', ' ', text)
# Remove excessive newlines
text = re.sub('\n[ ]+\n', '\n\n', text)
text = re.sub('\n\n\n+', '\n\n', text)
return text return text
def dump_text(self, elem, stylizer, page, tag_stack=[]): def dump_text(self, elem, stylizer, page, tag_stack=[]):
@ -197,7 +204,7 @@ class PMLMLizer(object):
if len(self.image_hrefs.keys()) == 0: if len(self.image_hrefs.keys()) == 0:
self.image_hrefs[page.abshref(elem.attrib['src'])] = 'cover.png' self.image_hrefs[page.abshref(elem.attrib['src'])] = 'cover.png'
else: else:
self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s.png' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00')
text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])]) text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])])
if tag == 'hr': if tag == 'hr':
w = '\\w' w = '\\w'
@ -251,7 +258,7 @@ class PMLMLizer(object):
# Proccess tags that contain text. # Proccess tags that contain text.
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
text.append(self.elem_text(elem, tag_stack)) text.append(self.remove_newlines(elem.text))
for item in elem: for item in elem:
text += self.dump_text(item, stylizer, page, tag_stack) text += self.dump_text(item, stylizer, page, tag_stack)
@ -261,32 +268,19 @@ class PMLMLizer(object):
close_tag_list.insert(0, tag_stack.pop()) close_tag_list.insert(0, tag_stack.pop())
text += self.close_tags(close_tag_list) text += self.close_tags(close_tag_list)
if tag in SEPARATE_TAGS: if tag in SEPARATE_TAGS:
text.append(os.linesep + os.linesep) text.append('\n\n')
if 'block' not in tag_stack: if 'block' not in tag_stack:
text.append(os.linesep + os.linesep) text.append('\n\n')
#if style['page-break-after'] == 'always': #if style['page-break-after'] == 'always':
# text.append('\\p') # text.append('\\p')
if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '':
text.append(self.elem_tail(elem, tag_stack)) text.append(self.remove_newlines(elem.tail))
return text return text
def elem_text(self, elem, tag_stack):
return self.block_text(elem.text, 'block' in tag_stack)
def elem_tail(self, elem, tag_stack):
return self.block_text(elem.tail, 'block' in tag_stack)
def block_text(self, text, in_block):
if in_block:
text = text.replace('\n\r', ' ')
text = text.replace('\n', ' ')
text = text.replace('\r', ' ')
return text
def close_tags(self, tags): def close_tags(self, tags):
text = [u''] text = [u'']
for i in range(0, len(tags)): for i in range(0, len(tags)):