From 5770808fcf903d1caaa30fdf64873a02f83e194c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 23 Oct 2009 19:29:13 -0600 Subject: [PATCH] PML Output: Add .png to image names. Fix removing excessive newlines from PML output. PMLZ Output: Name images correctly. --- src/calibre/ebooks/pdb/ereader/__init__.py | 6 +-- src/calibre/ebooks/pml/output.py | 16 +++---- src/calibre/ebooks/pml/pmlml.py | 50 ++++++++++------------ 3 files changed, 32 insertions(+), 40 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py index 3f08b068cb..89560c9448 100644 --- a/src/calibre/ebooks/pdb/ereader/__init__.py +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -16,11 +16,11 @@ def image_name(name, taken_names=[]): cut = len(name) - 32 names = name[:10] namee = name[10+cut:] - name = names + namee + name = '%s%s.png' % (names, namee) while name in taken_names: - for i in xrange(9999999999999999999999999999999): - name = '%s%s' % (name[:-len('%s' % i)], i) + for i in xrange(999999999999999999999999999): + name = '%s%s.png' % (name[:-len('%s' % i)], i) name = name.ljust(32, '\x00')[:32] diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py index 360e63c98e..774fc4c8d1 100644 --- a/src/calibre/ebooks/pml/output.py +++ b/src/calibre/ebooks/pml/output.py @@ -18,7 +18,7 @@ from calibre.customize.conversion import OutputFormatPlugin from calibre.customize.conversion import OptionRecommendation from calibre.ptempfile import TemporaryDirectory from calibre.utils.zipfile import ZipFile -from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES from calibre.ebooks.pml.pmlml import PMLMLizer class PMLOutput(OutputFormatPlugin): @@ -40,28 +40,26 @@ class PMLOutput(OutputFormatPlugin): def convert(self, oeb_book, output_path, input_plugin, opts, log): with TemporaryDirectory('_pmlz_output') as tdir: pmlmlizer = PMLMLizer(log) - content = pmlmlizer.extract_content(oeb_book, opts) + pml = unicode(pmlmlizer.extract_content(oeb_book, opts)) with open(os.path.join(tdir, 'index.pml'), 'wb') as out: - out.write(content.encode(opts.output_encoding, 'replace')) + out.write(pml.encode(opts.output_encoding, 'replace')) - self.write_images(oeb_book.manifest, tdir) + self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, tdir) log.debug('Compressing output...') pmlz = ZipFile(output_path, 'w') pmlz.add_dir(tdir) - def write_images(self, manifest, out_dir): + def write_images(self, manifest, image_hrefs, out_dir): for item in manifest: - if item.media_type in OEB_IMAGES: + if item.media_type in OEB_RASTER_IMAGES and item.href in image_hrefs.keys(): im = Image.open(cStringIO.StringIO(item.data)) data = cStringIO.StringIO() im.save(data, 'PNG') data = data.getvalue() - name = os.path.splitext(os.path.basename(item.href))[0] + '.png' - path = os.path.join(out_dir, name) + path = os.path.join(out_dir, image_hrefs[item.href]) with open(path, 'wb') as out: out.write(data) - diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 7b1813256e..862f0ea0ae 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en' Transform OEB content into PML markup ''' -import os import re from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace @@ -138,16 +137,13 @@ class PMLMLizer(object): aid = self.link_hrefs[aid] return u'\\Q="%s"' % aid + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + return text + def clean_text(self, text): - # Remove excess spaces at beginning and end of lines - text = re.sub('(?m)^[ ]+', '', text) - text = re.sub('(?m)[ ]+$', '', text) - - # Remove excessive newlines - text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) - text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) - text = re.sub('[ ]{2,}', ' ', text) - # Remove excessive \p tags text = re.sub(r'\\p\s*\\p', '', text) @@ -166,6 +162,17 @@ class PMLMLizer(object): # Turn all unicode characters into their PML hex equivelent text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text) + # Remove excess spaces at beginning and end of lines + text = re.sub('(?m)^[ ]+', '', text) + text = re.sub('(?m)[ ]+$', '', text) + + # Remove excessive spaces + text = re.sub('[ ]{2,}', ' ', text) + + # Remove excessive newlines + text = re.sub('\n[ ]+\n', '\n\n', text) + text = re.sub('\n\n\n+', '\n\n', text) + return text def dump_text(self, elem, stylizer, page, tag_stack=[]): @@ -197,7 +204,7 @@ class PMLMLizer(object): if len(self.image_hrefs.keys()) == 0: self.image_hrefs[page.abshref(elem.attrib['src'])] = 'cover.png' else: - self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') + self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s.png' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])]) if tag == 'hr': w = '\\w' @@ -251,7 +258,7 @@ class PMLMLizer(object): # Proccess tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - text.append(self.elem_text(elem, tag_stack)) + text.append(self.remove_newlines(elem.text)) for item in elem: text += self.dump_text(item, stylizer, page, tag_stack) @@ -261,32 +268,19 @@ class PMLMLizer(object): close_tag_list.insert(0, tag_stack.pop()) text += self.close_tags(close_tag_list) if tag in SEPARATE_TAGS: - text.append(os.linesep + os.linesep) + text.append('\n\n') if 'block' not in tag_stack: - text.append(os.linesep + os.linesep) + text.append('\n\n') #if style['page-break-after'] == 'always': # text.append('\\p') if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': - text.append(self.elem_tail(elem, tag_stack)) + text.append(self.remove_newlines(elem.tail)) return text - def elem_text(self, elem, tag_stack): - return self.block_text(elem.text, 'block' in tag_stack) - - def elem_tail(self, elem, tag_stack): - return self.block_text(elem.tail, 'block' in tag_stack) - - def block_text(self, text, in_block): - if in_block: - text = text.replace('\n\r', ' ') - text = text.replace('\n', ' ') - text = text.replace('\r', ' ') - return text - def close_tags(self, tags): text = [u''] for i in range(0, len(tags)):