Plugin for RTF input

2025-07-09 03:04:10 -04:00 · 2009-04-19 18:20:26 -07:00 · 2009-04-19 18:20:26 -07:00 · 5c5a4d8676
commit 5c5a4d8676
parent c7498b0d50
8 changed files with 110 additions and 206 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -283,6 +283,7 @@ from calibre.ebooks.txt.input import TXTInput
 from calibre.ebooks.lit.input import LITInput
 from calibre.ebooks.fb2.input import FB2Input
 from calibre.ebooks.odt.input import ODTInput
+from calibre.ebooks.rtf.input import RTFInput
 from calibre.ebooks.html.input import HTMLInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.txt.output import TXTOutput
@@ -291,7 +292,7 @@ from calibre.customize.profiles import input_profiles, output_profiles

 plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
        TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput,
-        FB2Input, ODTInput]
+        FB2Input, ODTInput, RTFInput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
--- a/src/calibre/ebooks/epub/from_any.py
+++ b/src/calibre/ebooks/epub/from_any.py
@@ -19,11 +19,6 @@ from calibre.utils.zipfile import ZipFile
 from calibre.customize.ui import run_plugins_on_preprocess


-def rtf2opf(path, tdir, opts):
-    from calibre.ebooks.lrf.rtf.convert_from import generate_html
-    generate_html(path, tdir)
-    return os.path.join(tdir, 'metadata.opf')
-
 def epub2opf(path, tdir, opts):
    zf = ZipFile(path)
    zf.extractall(tdir)
@@ -42,11 +37,6 @@ def epub2opf(path, tdir, opts):
        raise ValueError('%s is not a valid EPUB file'%path)
    return opf

-def odt2epub(path, tdir, opts):
-    from calibre.ebooks.odt.to_oeb import Extract
-    opts.encoding = 'utf-8'
-    return Extract()(path, tdir)
-
 SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf',
                  'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub']

--- a/src/calibre/ebooks/lrf/rtf/convert_from.py
+++ b/src/calibre/ebooks/lrf/rtf/convert_from.py
@@ -1,190 +0,0 @@
-__license__   = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-import os, sys, shutil, logging, glob
-
-from lxml import etree
-
-from calibre.ebooks.lrf import option_parser as lrf_option_parser
-from calibre.ebooks.metadata.meta import get_metadata
-from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
-from calibre import setup_cli_handlers
-from calibre.libwand import convert, WandException
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
-from calibre.ebooks.lrf.rtf.xsl import xhtml
-from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
-from calibre.ptempfile import PersistentTemporaryDirectory
-from calibre.ebooks.metadata import MetaInformation
-from calibre.ebooks.metadata.opf import OPFCreator 
-
-def option_parser():
-    parser = lrf_option_parser(
-_('''%prog [options] mybook.rtf
-
-
-%prog converts mybook.rtf to mybook.lrf''')
-        )
-    parser.add_option('--keep-intermediate-files', action='store_true', default=False)
-    return parser
-
-def convert_images(html, logger):
-    wmfs = glob.glob('*.wmf') + glob.glob('*.WMF')
-    for wmf in wmfs:
-        target = os.path.join(os.path.dirname(wmf), os.path.splitext(os.path.basename(wmf))[0]+'.jpg')
-        try:
-            convert(wmf, target)
-            html = html.replace(os.path.basename(wmf), os.path.basename(target))
-        except WandException, err:
-            logger.warning(u'Unable to convert image %s with error: %s'%(wmf, unicode(err)))
-            continue
-    return html
-
-def process_file(path, options, logger=None):
-    if logger is None:
-        level = logging.DEBUG if options.verbose else logging.INFO
-        logger = logging.getLogger('rtf2lrf')
-        setup_cli_handlers(logger, level)
-    rtf = os.path.abspath(os.path.expanduser(path))
-    f = open(rtf, 'rb')
-    mi = get_metadata(f, 'rtf')
-    f.close()
-    tdir = PersistentTemporaryDirectory('_rtf2lrf')
-    html = generate_html(rtf, tdir)
-    cwd = os.getcwdu()
-    try:
-        if not options.output:
-            ext = '.lrs' if options.lrs else '.lrf'
-            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
-        options.output = os.path.abspath(os.path.expanduser(options.output))
-        if not mi.title:
-            mi.title = os.path.splitext(os.path.basename(rtf))[0]
-        if (not options.title or options.title == 'Unknown'):
-            options.title = mi.title
-        if (not options.author or options.author == 'Unknown') and mi.author:
-            options.author = mi.author
-        if (not options.category or options.category == 'Unknown') and mi.category:
-            options.category = mi.category
-        if (not options.freetext or options.freetext == 'Unknown') and mi.comments:
-            options.freetext = mi.comments
-        os.chdir(tdir)
-        html_process_file(html, options, logger)
-    finally:
-        os.chdir(cwd)
-        if hasattr(options, 'keep_intermediate_files') and options.keep_intermediate_files:
-            logger.debug('Intermediate files in '+ tdir)
-        else:
-            shutil.rmtree(tdir)
-
-def main(args=sys.argv, logger=None):
-    parser = option_parser()
-    options, args = parser.parse_args(args)
-    if len(args) != 2:
-        parser.print_help()
-        print
-        print 'No rtf file specified'
-        return 1
-    process_file(args[1], options, logger)
-    return 0
-    
-
-def generate_xml(rtfpath, tdir):
-    from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
-    ofile = os.path.join(tdir, 'index.xml')
-    cwd = os.getcwdu()
-    os.chdir(tdir)
-    rtfpath = os.path.abspath(rtfpath)
-    try:
-        parser = ParseRtf(
-            in_file    = rtfpath,
-            out_file   = ofile,
-            # Convert symbol fonts to unicode equivelents. Default
-            # is 1
-            convert_symbol = 1,
-    
-            # Convert Zapf fonts to unicode equivelents. Default
-            # is 1.
-            convert_zapf = 1,
-    
-            # Convert Wingding fonts to unicode equivelents.
-            # Default is 1.
-            convert_wingdings = 1,
-    
-            # Convert RTF caps to real caps.
-            # Default is 1.
-            convert_caps = 1,
-    
-            # Indent resulting XML.
-            # Default is 0 (no indent).
-            indent = 1,
-    
-            # Form lists from RTF. Default is 1.
-            form_lists = 1,
-    
-            # Convert headings to sections. Default is 0.
-            headings_to_sections = 1,
-    
-            # Group paragraphs with the same style name. Default is 1.
-            group_styles = 1,
-    
-            # Group borders. Default is 1.
-            group_borders = 1,
-    
-            # Write or do not write paragraphs. Default is 0.
-            empty_paragraphs = 0,
-        )
-        parser.parse_rtf()
-    finally:
-        os.chdir(cwd)
-    return ofile
-
-
-def generate_html(rtfpath, tdir):
-    print 'Converting RTF to XML...'
-    rtfpath = os.path.abspath(rtfpath)
-    try:
-        xml = generate_xml(rtfpath, tdir)
-    except RtfInvalidCodeException:
-        raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.'))
-    tdir = os.path.dirname(xml)
-    cwd = os.getcwdu()
-    os.chdir(tdir)
-    try:
-        print 'Parsing XML...'
-        parser = etree.XMLParser(recover=True, no_network=True)
-        try:
-            doc = etree.parse(xml, parser)
-        except:
-            raise
-            print 'Parsing failed. Trying to clean up XML...'
-            soup = BeautifulStoneSoup(open(xml, 'rb').read())
-            doc = etree.fromstring(str(soup))
-        print 'Converting XML to HTML...'
-        styledoc = etree.fromstring(xhtml)
-        
-        transform = etree.XSLT(styledoc)
-        result = transform(doc)
-        tdir = os.path.dirname(xml)
-        html = os.path.join(tdir, 'index.html')
-        f = open(html, 'wb')
-        res = transform.tostring(result)
-        res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
-        f.write(res)
-        f.close()
-        try:
-            mi = get_metadata(open(rtfpath, 'rb'), 'rtf')
-        except:
-            mi = MetaInformation(None, None)
-        if not mi.title:
-            mi.title = os.path.splitext(os.path.basename(rtfpath))[0]
-        if not mi.authors:
-            mi.authors = [_('Unknown')]
-        opf = OPFCreator(tdir, mi)
-        opf.create_manifest([('index.html', None)])
-        opf.create_spine(['index.html'])
-        opf.render(open('metadata.opf', 'wb'))
-    finally:
-        os.chdir(cwd)
-    return html
-            
-if __name__ == '__main__':
-    sys.exit(main())    
-        
--- a/src/calibre/ebooks/lrf/rtf/__init__.py
+++ b/src/calibre/ebooks/lrf/rtf/__init__.py
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -0,0 +1,137 @@
+from __future__ import with_statement
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+
+from lxml import etree
+
+from calibre.customize.conversion import InputFormatPlugin
+
+class RTFInput(InputFormatPlugin):
+    '''
+    Input plugin that converts RTF files to XHTML.
+
+    The RTF is turned into XML by rtf2xml, the XML is transformed to XHTML
+    with an XSLT stylesheet, and an OPF describing the result is written
+    alongside it in the current working directory.
+    '''
+
+    name        = 'RTF Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert RTF files to HTML'
+    file_types  = set(['rtf'])
+
+    def generate_xml(self, stream):
+        '''
+        Run rtf2xml over `stream` and return the XML it produces as a string.
+
+        ParseRtf writes its result to a file, so a scratch file named
+        out.xml is created in the current working directory. It is removed
+        again before returning, even if parsing fails.
+        '''
+        from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
+        ofile = 'out.xml'
+        parser = ParseRtf(
+            in_file    = stream,
+            out_file   = ofile,
+            # Convert symbol fonts to unicode equivalents. Default
+            # is 1
+            convert_symbol = 1,
+
+            # Convert Zapf fonts to unicode equivalents. Default
+            # is 1.
+            convert_zapf = 1,
+
+            # Convert Wingding fonts to unicode equivalents.
+            # Default is 1.
+            convert_wingdings = 1,
+
+            # Convert RTF caps to real caps.
+            # Default is 1.
+            convert_caps = 1,
+
+            # Indent resulting XML.
+            # Default is 0 (no indent).
+            indent = 1,
+
+            # Form lists from RTF. Default is 1.
+            form_lists = 1,
+
+            # Convert headings to sections. Default is 0.
+            headings_to_sections = 1,
+
+            # Group paragraphs with the same style name. Default is 1.
+            group_styles = 1,
+
+            # Group borders. Default is 1.
+            group_borders = 1,
+
+            # Write or do not write paragraphs. Default is 0.
+            empty_paragraphs = 0,
+        )
+        try:
+            parser.parse_rtf()
+            # Close the handle explicitly instead of relying on the
+            # garbage collector to do it.
+            with open(ofile) as f:
+                return f.read()
+        finally:
+            # Never leave the scratch file behind. It may not exist if
+            # parsing failed early, so check before removing it.
+            if os.path.exists(ofile):
+                os.remove(ofile)
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        '''
+        Convert the RTF in `stream` to index.xhtml and metadata.opf.
+
+        Both files are written to the current working directory.
+
+        Returns the absolute path to metadata.opf. Raises ValueError if
+        rtf2xml raises RtfInvalidCodeException for the file.
+        '''
+        from calibre.ebooks.rtf.xsl import xhtml
+        from calibre.ebooks.metadata.meta import get_metadata
+        from calibre.ebooks.metadata.opf import OPFCreator
+        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
+        self.log = log
+        self.log('Converting RTF to XML...')
+        try:
+            xml = self.generate_xml(stream)
+        except RtfInvalidCodeException:
+            raise ValueError(_('This RTF file has a feature calibre does not '
+            'support. Convert it to HTML first and then try it.'))
+        self.log('Parsing XML...')
+        parser = etree.XMLParser(recover=True, no_network=True)
+        doc = etree.fromstring(xml, parser=parser)
+        self.log('Converting XML to HTML...')
+        styledoc = etree.fromstring(xhtml)
+
+        transform = etree.XSLT(styledoc)
+        result = transform(doc)
+        html = 'index.xhtml'
+        with open(html, 'wb') as f:
+            res = transform.tostring(result)
+            # Only the first 100 characters are searched for the
+            # xmlns:html declaration; the rest is left untouched.
+            res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+            f.write(res)
+        # The stream was handed to ParseRtf in generate_xml(), so rewind
+        # it before reading the metadata from it.
+        stream.seek(0)
+        mi = get_metadata(stream, 'rtf')
+        if not mi.title:
+            mi.title = _('Unknown')
+        if not mi.authors:
+            mi.authors = [_('Unknown')]
+        opf = OPFCreator(os.getcwd(), mi)
+        opf.create_manifest([('index.xhtml', None)])
+        opf.create_spine(['index.xhtml'])
+        # Close the OPF before handing its path back so that the caller is
+        # guaranteed to see the complete file.
+        with open('metadata.opf', 'wb') as f:
+            opf.render(f)
+        return os.path.abspath('metadata.opf')
+
--- a/src/calibre/ebooks/lrf/rtf/xsl.py
+++ b/src/calibre/ebooks/lrf/rtf/xsl.py
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -149,9 +149,10 @@ class ParseRtf:
        self.__group_borders = group_borders
        self.__empty_paragraphs = empty_paragraphs
        self.__no_dtd = no_dtd
-        
+
    def __check_file(self, the_file, type):
        """Check to see if files exist"""
+        if hasattr(the_file, 'read'): return
        if the_file == None:
            if type == "file_to_parse":
                message = "You must provide a file for the script to work"
@@ -545,13 +546,12 @@ class ParseRtf:
    def __make_temp_file(self,file):
        """Make a temporary file to parse"""
        write_file="rtf_write_file"
-        read_obj = open(file,'r')
+        read_obj = file if hasattr(file, 'read') else open(file,'r')
        write_obj = open(write_file, 'w')
        line = "dummy"
        while line:
            line = read_obj.read(1000)
            write_obj.write(line )
-        read_obj.close()
        write_obj.close()
        return write_file
    """
--- a/src/calibre/ebooks/rtf2xml/pict.py
+++ b/src/calibre/ebooks/rtf2xml/pict.py
@@ -58,10 +58,12 @@ class Pict:
        return line[18:]
    def __make_dir(self):
        """ Make a dirctory to put the image data in"""
-        base_name = os.path.basename(self.__orig_file)
+        base_name = os.path.basename(getattr(self.__orig_file, 'name',
+            self.__orig_file))
        base_name = os.path.splitext(base_name)[0]
        if self.__out_file:
-            dir_name = os.path.dirname(self.__out_file)
+            dir_name = os.path.dirname(getattr(self.__out_file, 'name',
+                self.__out_file))
        else:
            dir_name = os.path.dirname(self.__orig_file)
        # self.__output_to_file_func()