RTF Input: Support extraction of images (JPEG/PNG only)

This commit is contained in:
Kovid Goyal 2009-07-30 15:04:02 -06:00
parent d0a1ce4825
commit ecfa8d8385
3 changed files with 61 additions and 27 deletions

View File

@ -2,7 +2,7 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os
import os, glob, re
from lxml import etree
@ -61,6 +61,30 @@ class RTFInput(InputFormatPlugin):
os.remove('out.xml')
return ans
def extract_images(self, picts):
    """Extract the hex-encoded images stored in an RTF picture file.

    ``picts`` is the path of a file containing ``{\\pict ...}`` groups whose
    payload is hexadecimal text. Each group is decoded and written to the
    current working directory as ``0001.jpg``, ``0002.jpg``, ... in the
    order the groups occur in the file. The ``.jpg`` extension is used
    regardless of the actual image format.

    NOTE(review): the stylesheet refers to images as ``<@num>.jpg``;
    presumably ``@num`` matches this zero-padded counter -- confirm.

    Raises whatever ``binascii.unhexlify`` raises when a payload contains
    non-hexadecimal alphanumeric characters (the caller logs and continues).
    """
    # Local import so this method does not depend on the module's import
    # block; unhexlify behaves the same on Python 2 and Python 3.
    import binascii

    self.log('Extracting images...')
    with open(picts, 'rb') as src:
        raw = src.read()

    brace_pat = re.compile(br'[{}]')
    non_alnum_pat = re.compile(br'[^a-zA-Z0-9]')

    count = 0
    for match in re.finditer(br'\{\\pict([^}]+)\}', raw):
        start = match.start(1)

        # Find the brace that closes the \pict group. We are already one
        # level deep at ``start``. If the group is never closed, use the
        # rest of the data instead of indexing past the end.
        depth, end = 1, len(raw)
        for brace in brace_pat.finditer(raw, start):
            depth += 1 if brace.group() == b'{' else -1
            if depth == 0:
                end = brace.start()
                break

        # Keep only the payload characters; whitespace, line breaks and
        # braces are dropped. ``end`` excludes the closing brace, so
        # nothing after the group leaks into the hex stream.
        enc = non_alnum_pat.sub(b'', raw[start:end])
        if len(enc) % 2 == 1:
            # A dangling nibble cannot form a byte.
            enc = enc[:-1]
        data = binascii.unhexlify(enc)

        count += 1
        with open('%04d.jpg' % count, 'wb') as out:
            out.write(data)
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.rtf.xsl import xhtml
@ -74,6 +98,12 @@ class RTFInput(InputFormatPlugin):
except RtfInvalidCodeException:
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.'))
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d:
try:
self.extract_images(d[0])
except:
self.log.exception('Failed to extract images...')
self.log('Parsing XML...')
parser = etree.XMLParser(recover=True, no_network=True)
doc = etree.fromstring(xml, parser=parser)

View File

@ -18,11 +18,11 @@
xhtml = '''\
<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:html="http://www.w3.org/1999/xhtml"
xmlns:rtf="http://rtf2xml.sourceforge.net/"
exclude-result-prefixes="rtf"
exclude-result-prefixes="rtf"
>
<xsl:template match = "rtf:para">
@ -36,7 +36,7 @@ xhtml = '''\
parent::rtf:paragraph-definition[@name='heading 7']|
parent::rtf:paragraph-definition[@name='heading 8']|
parent::rtf:paragraph-definition[@name='heading 9']
">
<xsl:variable name="head-number" select="substring(parent::rtf:paragraph-definition/@name, 9)"/>
<xsl:element name="h{$head-number}">
@ -64,7 +64,7 @@ xhtml = '''\
parent::rtf:paragraph-definition[@name='heading 7']|
parent::rtf:paragraph-definition[@name='heading 8']|
parent::rtf:paragraph-definition[@name='heading 9']
">
<xsl:apply-templates/>
</xsl:when>
@ -108,17 +108,17 @@ xhtml = '''\
<xsl:when test = "@italics = 'true' ">
<emph rend = "paragraph-emph-italics">
<xsl:apply-templates/>
</emph>
</emph>
</xsl:when>
<xsl:when test = "@bold = 'true' ">
<emph rend = "paragraph-emph-bold">
<xsl:apply-templates/>
</emph>
</emph>
</xsl:when>
<xsl:when test = "@underlined">
<emph rend = "paragraph-emph-underlined">
<xsl:apply-templates/>
</emph>
</emph>
</xsl:when>
<xsl:when test = "(@strike-through = 'true')
or (@double-strike-through = 'true')
@ -128,18 +128,18 @@ xhtml = '''\
or (@shadow = 'true')
or (@hidden = 'true')
or (@outline = 'true')
">
<emph rend = "paragraph-emph">
<xsl:apply-templates/>
</emph>
</emph>
</xsl:when>
<xsl:otherwise>
<xsl:apply-templates/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template name="make-header">
<head>
<xsl:element name="meta">
@ -150,7 +150,7 @@ xhtml = '''\
<xsl:text>http://rtf2xml.sourceforge.net/</xsl:text>
</xsl:attribute>
</xsl:element>
<xsl:choose>
<xsl:when test="/rtf:doc/rtf:preamble/rtf:doc-information">
<xsl:apply-templates select="/rtf:doc/rtf:preamble/rtf:doc-information" mode="header"/>
@ -333,7 +333,7 @@ xhtml = '''\
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template match="rtf:inline">
<xsl:variable name="num-attrs" select="count(@*)"/>
<xsl:choose>
@ -387,7 +387,7 @@ xhtml = '''\
</xsl:attribute>
<xsl:apply-templates/>
</xsl:element>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
@ -401,9 +401,9 @@ xhtml = '''\
</xsl:attribute>
<xsl:apply-templates/>
</xsl:element>
</xsl:template>
<xsl:template match="rtf:list[@list-type='unordered']">
<xsl:element name="ul">
<xsl:apply-templates/>
@ -479,13 +479,13 @@ xhtml = '''\
<xsl:template match="rtf:preamble">
<xsl:apply-templates/>
</xsl:template>
<xsl:template match="rtf:page-break">
<xsl:element name="br">
<xsl:attribute name="style">page-break-after:always</xsl:attribute>
</xsl:element>
</xsl:template>
<xsl:template match="rtf:rtf-definition|rtf:font-table|rtf:color-table|rtf:style-table|rtf:page-definition|rtf:list-table|rtf:override-table|rtf:override-list|rtf:list-text"/>
<xsl:template match="rtf:body">
@ -505,11 +505,11 @@ xhtml = '''\
<xsl:apply-templates/>
</xsl:element>
</xsl:template>
<xsl:template match = "rtf:field-block">
<xsl:apply-templates/>
</xsl:template>
<xsl:template match = "rtf:field[@type='hyperlink']">
<xsl:element name ="a">
<xsl:attribute name = "href">
@ -522,9 +522,13 @@ xhtml = '''\
<xsl:template match = "rtf:field">
<xsl:apply-templates/>
</xsl:template>
<xsl:template match="rtf:pict" />
<xsl:template match="rtf:pict">
<xsl:element name="img">
<xsl:attribute name="src"><xsl:value-of select="@num" />.jpg</xsl:attribute>
</xsl:element>
</xsl:template>
<xsl:template match="*">
<xsl:message>
<xsl:text>no match for element: "</xsl:text>
@ -533,6 +537,6 @@ xhtml = '''\
</xsl:message>
<xsl:apply-templates/>
</xsl:template>
</xsl:stylesheet>
'''
'''

View File

@ -55,7 +55,7 @@ class Pict:
return "}\n"
def __text_func(self, line):
# Return the payload of a text token line with its fixed prefix removed.
# Text lines have the form shown below:
#tx<nu<__________<true text
# The prefix "tx<nu<__________<" is 17 characters long (6 + 10 underscores
# + 1), so the payload starts at index 17.
# Offset 18 (first return below) skips the first payload character.
return line[18:]
# Offset 17 keeps the whole payload.
return line[17:]
def __make_dir(self):
""" Make a dirctory to put the image data in"""
base_name = os.path.basename(getattr(self.__orig_file, 'name',