Plugin for RTF input

This commit is contained in:
Kovid Goyal 2009-04-19 18:20:26 -07:00
parent c7498b0d50
commit 5c5a4d8676
8 changed files with 110 additions and 206 deletions

View File

@ -283,6 +283,7 @@ from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.fb2.input import FB2Input
from calibre.ebooks.odt.input import ODTInput
from calibre.ebooks.rtf.input import RTFInput
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput
@ -291,7 +292,7 @@ from calibre.customize.profiles import input_profiles, output_profiles
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput,
FB2Input, ODTInput]
FB2Input, ODTInput, RTFInput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View File

@ -19,11 +19,6 @@ from calibre.utils.zipfile import ZipFile
from calibre.customize.ui import run_plugins_on_preprocess
def rtf2opf(path, tdir, opts):
from calibre.ebooks.lrf.rtf.convert_from import generate_html
generate_html(path, tdir)
return os.path.join(tdir, 'metadata.opf')
def epub2opf(path, tdir, opts):
zf = ZipFile(path)
zf.extractall(tdir)
@ -42,11 +37,6 @@ def epub2opf(path, tdir, opts):
raise ValueError('%s is not a valid EPUB file'%path)
return opf
def odt2epub(path, tdir, opts):
from calibre.ebooks.odt.to_oeb import Extract
opts.encoding = 'utf-8'
return Extract()(path, tdir)
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf',
'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub']

View File

@ -1,190 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, shutil, logging, glob
from lxml import etree
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre import setup_cli_handlers
from calibre.libwand import convert, WandException
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.lrf.rtf.xsl import xhtml
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def option_parser():
parser = lrf_option_parser(
_('''%prog [options] mybook.rtf
%prog converts mybook.rtf to mybook.lrf''')
)
parser.add_option('--keep-intermediate-files', action='store_true', default=False)
return parser
def convert_images(html, logger):
wmfs = glob.glob('*.wmf') + glob.glob('*.WMF')
for wmf in wmfs:
target = os.path.join(os.path.dirname(wmf), os.path.splitext(os.path.basename(wmf))[0]+'.jpg')
try:
convert(wmf, target)
html = html.replace(os.path.basename(wmf), os.path.basename(target))
except WandException, err:
logger.warning(u'Unable to convert image %s with error: %s'%(wmf, unicode(err)))
continue
return html
def process_file(path, options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('rtf2lrf')
setup_cli_handlers(logger, level)
rtf = os.path.abspath(os.path.expanduser(path))
f = open(rtf, 'rb')
mi = get_metadata(f, 'rtf')
f.close()
tdir = PersistentTemporaryDirectory('_rtf2lrf')
html = generate_html(rtf, tdir)
cwd = os.getcwdu()
try:
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
options.output = os.path.abspath(os.path.expanduser(options.output))
if not mi.title:
mi.title = os.path.splitext(os.path.basename(rtf))[0]
if (not options.title or options.title == 'Unknown'):
options.title = mi.title
if (not options.author or options.author == 'Unknown') and mi.author:
options.author = mi.author
if (not options.category or options.category == 'Unknown') and mi.category:
options.category = mi.category
if (not options.freetext or options.freetext == 'Unknown') and mi.comments:
options.freetext = mi.comments
os.chdir(tdir)
html_process_file(html, options, logger)
finally:
os.chdir(cwd)
if hasattr(options, 'keep_intermediate_files') and options.keep_intermediate_files:
logger.debug('Intermediate files in '+ tdir)
else:
shutil.rmtree(tdir)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No rtf file specified'
return 1
process_file(args[1], options, logger)
return 0
def generate_xml(rtfpath, tdir):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
ofile = os.path.join(tdir, 'index.xml')
cwd = os.getcwdu()
os.chdir(tdir)
rtfpath = os.path.abspath(rtfpath)
try:
parser = ParseRtf(
in_file = rtfpath,
out_file = ofile,
# Convert symbol fonts to unicode equivelents. Default
# is 1
convert_symbol = 1,
# Convert Zapf fonts to unicode equivelents. Default
# is 1.
convert_zapf = 1,
# Convert Wingding fonts to unicode equivelents.
# Default is 1.
convert_wingdings = 1,
# Convert RTF caps to real caps.
# Default is 1.
convert_caps = 1,
# Indent resulting XML.
# Default is 0 (no indent).
indent = 1,
# Form lists from RTF. Default is 1.
form_lists = 1,
# Convert headings to sections. Default is 0.
headings_to_sections = 1,
# Group paragraphs with the same style name. Default is 1.
group_styles = 1,
# Group borders. Default is 1.
group_borders = 1,
# Write or do not write paragraphs. Default is 0.
empty_paragraphs = 0,
)
parser.parse_rtf()
finally:
os.chdir(cwd)
return ofile
def generate_html(rtfpath, tdir):
print 'Converting RTF to XML...'
rtfpath = os.path.abspath(rtfpath)
try:
xml = generate_xml(rtfpath, tdir)
except RtfInvalidCodeException:
raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.'))
tdir = os.path.dirname(xml)
cwd = os.getcwdu()
os.chdir(tdir)
try:
print 'Parsing XML...'
parser = etree.XMLParser(recover=True, no_network=True)
try:
doc = etree.parse(xml, parser)
except:
raise
print 'Parsing failed. Trying to clean up XML...'
soup = BeautifulStoneSoup(open(xml, 'rb').read())
doc = etree.fromstring(str(soup))
print 'Converting XML to HTML...'
styledoc = etree.fromstring(xhtml)
transform = etree.XSLT(styledoc)
result = transform(doc)
tdir = os.path.dirname(xml)
html = os.path.join(tdir, 'index.html')
f = open(html, 'wb')
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
f.write(res)
f.close()
try:
mi = get_metadata(open(rtfpath, 'rb'), 'rtf')
except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(rtfpath))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
finally:
os.chdir(cwd)
return html
if __name__ == '__main__':
sys.exit(main())

View File

@ -0,0 +1,101 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
class RTFInput(InputFormatPlugin):
name = 'RTF Input'
author = 'Kovid Goyal'
description = 'Convert RTF files to HTML'
file_types = set(['rtf'])
def generate_xml(self, stream):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
ofile = 'out.xml'
parser = ParseRtf(
in_file = stream,
out_file = ofile,
# Convert symbol fonts to unicode equivelents. Default
# is 1
convert_symbol = 1,
# Convert Zapf fonts to unicode equivelents. Default
# is 1.
convert_zapf = 1,
# Convert Wingding fonts to unicode equivelents.
# Default is 1.
convert_wingdings = 1,
# Convert RTF caps to real caps.
# Default is 1.
convert_caps = 1,
# Indent resulting XML.
# Default is 0 (no indent).
indent = 1,
# Form lists from RTF. Default is 1.
form_lists = 1,
# Convert headings to sections. Default is 0.
headings_to_sections = 1,
# Group paragraphs with the same style name. Default is 1.
group_styles = 1,
# Group borders. Default is 1.
group_borders = 1,
# Write or do not write paragraphs. Default is 0.
empty_paragraphs = 0,
)
parser.parse_rtf()
ans = open('out.xml').read()
os.remove('out.xml')
return ans
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.rtf.xsl import xhtml
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
self.log = log
self.log('Converting RTF to XML...')
try:
xml = self.generate_xml(stream)
except RtfInvalidCodeException:
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.'))
self.log('Parsing XML...')
parser = etree.XMLParser(recover=True, no_network=True)
doc = etree.fromstring(xml, parser=parser)
self.log('Converting XML to HTML...')
styledoc = etree.fromstring(xhtml)
transform = etree.XSLT(styledoc)
result = transform(doc)
html = 'index.xhtml'
with open(html, 'wb') as f:
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
f.write(res)
stream.seek(0)
mi = get_metadata(stream, 'rtf')
if not mi.title:
mi.title = _('Unknown')
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(os.getcwd(), mi)
opf.create_manifest([('index.xhtml', None)])
opf.create_spine(['index.xhtml'])
opf.render(open('metadata.opf', 'wb'))
return os.path.abspath('metadata.opf')

View File

@ -149,9 +149,10 @@ class ParseRtf:
self.__group_borders = group_borders
self.__empty_paragraphs = empty_paragraphs
self.__no_dtd = no_dtd
def __check_file(self, the_file, type):
"""Check to see if files exist"""
if hasattr(the_file, 'read'): return
if the_file == None:
if type == "file_to_parse":
message = "You must provide a file for the script to work"
@ -545,13 +546,12 @@ class ParseRtf:
def __make_temp_file(self,file):
"""Make a temporary file to parse"""
write_file="rtf_write_file"
read_obj = open(file,'r')
read_obj = file if hasattr(file, 'read') else open(file,'r')
write_obj = open(write_file, 'w')
line = "dummy"
while line:
line = read_obj.read(1000)
write_obj.write(line )
read_obj.close()
write_obj.close()
return write_file
"""

View File

@ -58,10 +58,12 @@ class Pict:
return line[18:]
def __make_dir(self):
""" Make a dirctory to put the image data in"""
base_name = os.path.basename(self.__orig_file)
base_name = os.path.basename(getattr(self.__orig_file, 'name',
self.__orig_file))
base_name = os.path.splitext(base_name)[0]
if self.__out_file:
dir_name = os.path.dirname(self.__out_file)
dir_name = os.path.dirname(getattr(self.__out_file, 'name',
self.__out_file))
else:
dir_name = os.path.dirname(self.__orig_file)
# self.__output_to_file_func()