py3: More fixes to RTF input

Embedded WMF image processing now works
This commit is contained in:
Kovid Goyal 2019-05-20 15:50:36 +05:30
parent 037e28a442
commit cb5ac309fa
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 15 additions and 11 deletions

View File

@@ -1,4 +1,4 @@
from __future__ import with_statement from __future__ import with_statement, unicode_literals
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
@@ -118,20 +118,21 @@ class RTFInput(InputFormatPlugin):
def extract_images(self, picts): def extract_images(self, picts):
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
from binascii import unhexlify
self.log('Extracting images...') self.log('Extracting images...')
with open(picts, 'rb') as f: with open(picts, 'rb') as f:
raw = f.read() raw = f.read()
picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw)) picts = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
hex = re.compile(r'[^a-fA-F0-9]') hex_pat = re.compile(br'[^a-fA-F0-9]')
encs = [hex.sub('', pict) for pict in picts] encs = [hex_pat.sub(b'', pict) for pict in picts]
count = 0 count = 0
imap = {} imap = {}
for enc in encs: for enc in encs:
if len(enc) % 2 == 1: if len(enc) % 2 == 1:
enc = enc[:-1] enc = enc[:-1]
data = enc.decode('hex') data = unhexlify(enc)
fmt = what(None, data) fmt = what(None, data)
if fmt is None: if fmt is None:
fmt = 'wmf' fmt = 'wmf'
@@ -158,7 +159,7 @@ class RTFInput(InputFormatPlugin):
return name return name
try: try:
return self.rasterize_wmf(name) return self.rasterize_wmf(name)
except: except Exception:
self.log.exception('Failed to convert WMF image %r'%name) self.log.exception('Failed to convert WMF image %r'%name)
return self.replace_wmf(name) return self.replace_wmf(name)
@@ -168,7 +169,7 @@ class RTFInput(InputFormatPlugin):
return '__REMOVE_ME__' return '__REMOVE_ME__'
from calibre.ebooks.covers import message_image from calibre.ebooks.covers import message_image
if self.default_img is None: if self.default_img is None:
self.default_img = message_image('Conversion of WMF images is not supported.', self.default_img = message_image('Conversion of WMF images is not supported.'
' Use Microsoft Word or OpenOffice to save this RTF file' ' Use Microsoft Word or OpenOffice to save this RTF file'
' as HTML and convert that in calibre.') ' as HTML and convert that in calibre.')
name = name.replace('.wmf', '.jpg') name = name.replace('.wmf', '.jpg')
@@ -287,15 +288,15 @@ class RTFInput(InputFormatPlugin):
result = transform(doc) result = transform(doc)
html = u'index.xhtml' html = u'index.xhtml'
with open(html, 'wb') as f: with open(html, 'wb') as f:
res = transform.tostring(result) res = as_bytes(transform.tostring(result))
# res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# clean multiple \n # clean multiple \n
res = re.sub('\n+', '\n', res) res = re.sub(b'\n+', b'\n', res)
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
# res = re.sub('\s*<body>', '<body>', res) # res = re.sub('\s*<body>', '<body>', res)
# res = re.sub('(?<=\n)\n{2}', # res = re.sub('(?<=\n)\n{2}',
# u'<p>\u00a0</p>\n'.encode('utf-8'), res) # u'<p>\u00a0</p>\n'.encode('utf-8'), res)
f.write(as_bytes(res)) f.write(res)
self.write_inline_css(inline_class, border_styles) self.write_inline_css(inline_class, border_styles)
stream.seek(0) stream.seek(0)
mi = get_metadata(stream, 'rtf') mi = get_metadata(stream, 'rtf')

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -145,7 +146,7 @@ class WMF(object):
size, func = struct.unpack_from('<IH', data, offset) size, func = struct.unpack_from('<IH', data, offset)
size *= 2 # Convert to bytes size *= 2 # Convert to bytes
offset += hsize offset += hsize
params = '' params = b''
delta = size - hsize delta = size - hsize
if delta > 0: if delta > 0:
params = data[offset:offset+delta] params = data[offset:offset+delta]
@@ -158,6 +159,8 @@ class WMF(object):
self.records.append((func, params)) self.records.append((func, params))
for rec in self.records: for rec in self.records:
if not hasattr(rec[0], 'split'):
continue
f = getattr(self, rec[0], None) f = getattr(self, rec[0], None)
if callable(f): if callable(f):
f(rec[1]) f(rec[1])