From 5c5a4d867662e088c42fc75a8e54b397479215f7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 19 Apr 2009 18:20:26 -0700 Subject: [PATCH] Plugin for RTF input --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/epub/from_any.py | 10 - src/calibre/ebooks/lrf/rtf/convert_from.py | 190 ------------------- src/calibre/ebooks/{lrf => }/rtf/__init__.py | 0 src/calibre/ebooks/rtf/input.py | 101 ++++++++++ src/calibre/ebooks/{lrf => }/rtf/xsl.py | 0 src/calibre/ebooks/rtf2xml/ParseRtf.py | 6 +- src/calibre/ebooks/rtf2xml/pict.py | 6 +- 8 files changed, 110 insertions(+), 206 deletions(-) delete mode 100644 src/calibre/ebooks/lrf/rtf/convert_from.py rename src/calibre/ebooks/{lrf => }/rtf/__init__.py (100%) create mode 100644 src/calibre/ebooks/rtf/input.py rename src/calibre/ebooks/{lrf => }/rtf/xsl.py (100%) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index a67224872b..51a0e4c75f 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -283,6 +283,7 @@ from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.lit.input import LITInput from calibre.ebooks.fb2.input import FB2Input from calibre.ebooks.odt.input import ODTInput +from calibre.ebooks.rtf.input import RTFInput from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput @@ -291,7 +292,7 @@ from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, - FB2Input, ODTInput] + FB2Input, ODTInput, RTFInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index 196ed59646..68112592d2 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -19,11 +19,6 @@ from calibre.utils.zipfile import ZipFile from calibre.customize.ui import run_plugins_on_preprocess -def rtf2opf(path, tdir, opts): - from calibre.ebooks.lrf.rtf.convert_from import generate_html - generate_html(path, tdir) - return os.path.join(tdir, 'metadata.opf') - def epub2opf(path, tdir, opts): zf = ZipFile(path) zf.extractall(tdir) @@ -42,11 +37,6 @@ def epub2opf(path, tdir, opts): raise ValueError('%s is not a valid EPUB file'%path) return opf -def odt2epub(path, tdir, opts): - from calibre.ebooks.odt.to_oeb import Extract - opts.encoding = 'utf-8' - return Extract()(path, tdir) - SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub'] diff --git a/src/calibre/ebooks/lrf/rtf/convert_from.py b/src/calibre/ebooks/lrf/rtf/convert_from.py deleted file mode 100644 index e4dd153d2a..0000000000 --- a/src/calibre/ebooks/lrf/rtf/convert_from.py +++ /dev/null @@ -1,190 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -import os, sys, shutil, logging, glob - -from lxml import etree - -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.metadata.meta import get_metadata -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre import setup_cli_handlers -from calibre.libwand import convert, WandException -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup -from calibre.ebooks.lrf.rtf.xsl import xhtml -from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf import OPFCreator - -def option_parser(): - parser = lrf_option_parser( -_('''%prog [options] mybook.rtf - - -%prog converts mybook.rtf to mybook.lrf''') - ) - parser.add_option('--keep-intermediate-files', action='store_true', default=False) - return parser - -def convert_images(html, logger): - wmfs = glob.glob('*.wmf') + glob.glob('*.WMF') - for wmf in wmfs: - target = os.path.join(os.path.dirname(wmf), os.path.splitext(os.path.basename(wmf))[0]+'.jpg') - try: - convert(wmf, target) - html = html.replace(os.path.basename(wmf), os.path.basename(target)) - except WandException, err: - logger.warning(u'Unable to convert image %s with error: %s'%(wmf, unicode(err))) - continue - return html - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('rtf2lrf') - setup_cli_handlers(logger, level) - rtf = os.path.abspath(os.path.expanduser(path)) - f = open(rtf, 'rb') - mi = get_metadata(f, 'rtf') - f.close() - tdir = PersistentTemporaryDirectory('_rtf2lrf') - html = generate_html(rtf, tdir) - cwd = os.getcwdu() - try: - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(rtf))[0] - if (not options.title or options.title == 'Unknown'): - options.title = mi.title - if (not options.author or options.author == 'Unknown') and mi.author: - options.author = mi.author - if (not options.category or options.category == 'Unknown') and mi.category: - options.category = mi.category - if (not options.freetext or options.freetext == 'Unknown') and mi.comments: - options.freetext = mi.comments - os.chdir(tdir) - html_process_file(html, options, logger) - finally: - os.chdir(cwd) - if hasattr(options, 'keep_intermediate_files') and options.keep_intermediate_files: - logger.debug('Intermediate files in '+ tdir) - else: - shutil.rmtree(tdir) - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No rtf file specified' - return 1 - process_file(args[1], options, logger) - return 0 - - -def generate_xml(rtfpath, tdir): - from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf - ofile = os.path.join(tdir, 'index.xml') - cwd = os.getcwdu() - os.chdir(tdir) - rtfpath = os.path.abspath(rtfpath) - try: - parser = ParseRtf( - in_file = rtfpath, - out_file = ofile, - # Convert symbol fonts to unicode equivelents. Default - # is 1 - convert_symbol = 1, - - # Convert Zapf fonts to unicode equivelents. Default - # is 1. - convert_zapf = 1, - - # Convert Wingding fonts to unicode equivelents. - # Default is 1. - convert_wingdings = 1, - - # Convert RTF caps to real caps. - # Default is 1. - convert_caps = 1, - - # Indent resulting XML. - # Default is 0 (no indent). - indent = 1, - - # Form lists from RTF. Default is 1. - form_lists = 1, - - # Convert headings to sections. Default is 0. - headings_to_sections = 1, - - # Group paragraphs with the same style name. Default is 1. - group_styles = 1, - - # Group borders. Default is 1. - group_borders = 1, - - # Write or do not write paragraphs. Default is 0. - empty_paragraphs = 0, - ) - parser.parse_rtf() - finally: - os.chdir(cwd) - return ofile - - -def generate_html(rtfpath, tdir): - print 'Converting RTF to XML...' - rtfpath = os.path.abspath(rtfpath) - try: - xml = generate_xml(rtfpath, tdir) - except RtfInvalidCodeException: - raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.')) - tdir = os.path.dirname(xml) - cwd = os.getcwdu() - os.chdir(tdir) - try: - print 'Parsing XML...' - parser = etree.XMLParser(recover=True, no_network=True) - try: - doc = etree.parse(xml, parser) - except: - raise - print 'Parsing failed. Trying to clean up XML...' - soup = BeautifulStoneSoup(open(xml, 'rb').read()) - doc = etree.fromstring(str(soup)) - print 'Converting XML to HTML...' - styledoc = etree.fromstring(xhtml) - - transform = etree.XSLT(styledoc) - result = transform(doc) - tdir = os.path.dirname(xml) - html = os.path.join(tdir, 'index.html') - f = open(html, 'wb') - res = transform.tostring(result) - res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] - f.write(res) - f.close() - try: - mi = get_metadata(open(rtfpath, 'rb'), 'rtf') - except: - mi = MetaInformation(None, None) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(rtfpath))[0] - if not mi.authors: - mi.authors = [_('Unknown')] - opf = OPFCreator(tdir, mi) - opf.create_manifest([('index.html', None)]) - opf.create_spine(['index.html']) - opf.render(open('metadata.opf', 'wb')) - finally: - os.chdir(cwd) - return html - -if __name__ == '__main__': - sys.exit(main()) - \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/rtf/__init__.py b/src/calibre/ebooks/rtf/__init__.py similarity index 100% rename from src/calibre/ebooks/lrf/rtf/__init__.py rename to src/calibre/ebooks/rtf/__init__.py diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py new file mode 100644 index 0000000000..764d47ff41 --- /dev/null +++ b/src/calibre/ebooks/rtf/input.py @@ -0,0 +1,101 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal ' + +import os + +from lxml import etree + +from calibre.customize.conversion import InputFormatPlugin + +class RTFInput(InputFormatPlugin): + + name = 'RTF Input' + author = 'Kovid Goyal' + description = 'Convert RTF files to HTML' + file_types = set(['rtf']) + + def generate_xml(self, stream): + from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf + ofile = 'out.xml' + parser = ParseRtf( + in_file = stream, + out_file = ofile, + # Convert symbol fonts to unicode equivelents. Default + # is 1 + convert_symbol = 1, + + # Convert Zapf fonts to unicode equivelents. Default + # is 1. + convert_zapf = 1, + + # Convert Wingding fonts to unicode equivelents. + # Default is 1. + convert_wingdings = 1, + + # Convert RTF caps to real caps. + # Default is 1. + convert_caps = 1, + + # Indent resulting XML. + # Default is 0 (no indent). + indent = 1, + + # Form lists from RTF. Default is 1. + form_lists = 1, + + # Convert headings to sections. Default is 0. + headings_to_sections = 1, + + # Group paragraphs with the same style name. Default is 1. + group_styles = 1, + + # Group borders. Default is 1. + group_borders = 1, + + # Write or do not write paragraphs. Default is 0. + empty_paragraphs = 0, + ) + parser.parse_rtf() + ans = open('out.xml').read() + os.remove('out.xml') + return ans + + def convert(self, stream, options, file_ext, log, + accelerators): + from calibre.ebooks.rtf.xsl import xhtml + from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.metadata.opf import OPFCreator + from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException + self.log = log + self.log('Converting RTF to XML...') + try: + xml = self.generate_xml(stream) + except RtfInvalidCodeException: + raise ValueError(_('This RTF file has a feature calibre does not ' + 'support. Convert it to HTML first and then try it.')) + self.log('Parsing XML...') + parser = etree.XMLParser(recover=True, no_network=True) + doc = etree.fromstring(xml, parser=parser) + self.log('Converting XML to HTML...') + styledoc = etree.fromstring(xhtml) + + transform = etree.XSLT(styledoc) + result = transform(doc) + html = 'index.xhtml' + with open(html, 'wb') as f: + res = transform.tostring(result) + res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] + f.write(res) + stream.seek(0) + mi = get_metadata(stream, 'rtf') + if not mi.title: + mi.title = _('Unknown') + if not mi.authors: + mi.authors = [_('Unknown')] + opf = OPFCreator(os.getcwd(), mi) + opf.create_manifest([('index.xhtml', None)]) + opf.create_spine(['index.xhtml']) + opf.render(open('metadata.opf', 'wb')) + return os.path.abspath('metadata.opf') + diff --git a/src/calibre/ebooks/lrf/rtf/xsl.py b/src/calibre/ebooks/rtf/xsl.py similarity index 100% rename from src/calibre/ebooks/lrf/rtf/xsl.py rename to src/calibre/ebooks/rtf/xsl.py diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 5b008df615..cba0f900db 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -149,9 +149,10 @@ class ParseRtf: self.__group_borders = group_borders self.__empty_paragraphs = empty_paragraphs self.__no_dtd = no_dtd - + def __check_file(self, the_file, type): """Check to see if files exist""" + if hasattr(the_file, 'read'): return if the_file == None: if type == "file_to_parse": message = "You must provide a file for the script to work" @@ -545,13 +546,12 @@ class ParseRtf: def __make_temp_file(self,file): """Make a temporary file to parse""" write_file="rtf_write_file" - read_obj = open(file,'r') + read_obj = file if hasattr(file, 'read') else open(file,'r') write_obj = open(write_file, 'w') line = "dummy" while line: line = read_obj.read(1000) write_obj.write(line ) - read_obj.close() write_obj.close() return write_file """ diff --git a/src/calibre/ebooks/rtf2xml/pict.py b/src/calibre/ebooks/rtf2xml/pict.py index b1931b8c2e..6c88dd54e4 100755 --- a/src/calibre/ebooks/rtf2xml/pict.py +++ b/src/calibre/ebooks/rtf2xml/pict.py @@ -58,10 +58,12 @@ class Pict: return line[18:] def __make_dir(self): """ Make a dirctory to put the image data in""" - base_name = os.path.basename(self.__orig_file) + base_name = os.path.basename(getattr(self.__orig_file, 'name', + self.__orig_file)) base_name = os.path.splitext(base_name)[0] if self.__out_file: - dir_name = os.path.dirname(self.__out_file) + dir_name = os.path.dirname(getattr(self.__out_file, 'name', + self.__out_file)) else: dir_name = os.path.dirname(self.__orig_file) # self.__output_to_file_func()