Plugin for RTF input

2025-07-09 03:04:10 -04:00 · 2009-04-19 18:20:26 -07:00 · 2009-04-19 18:20:26 -07:00 · 5c5a4d8676
commit 5c5a4d8676
parent c7498b0d50
8 changed files with 110 additions and 206 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -283,6 +283,7 @@ from calibre.ebooks.txt.input import TXTInput
 from calibre.ebooks.lit.input import LITInput
 from calibre.ebooks.fb2.input import FB2Input
 from calibre.ebooks.odt.input import ODTInput
 from calibre.ebooks.rtf.input import RTFInput
 from calibre.ebooks.html.input import HTMLInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.txt.output import TXTOutput
@ -291,7 +292,7 @@ from calibre.customize.profiles import input_profiles, output_profiles
 plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
        TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput,
-        FB2Input, ODTInput]
+        FB2Input, ODTInput, RTFInput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
--- a/src/calibre/ebooks/epub/from_any.py
+++ b/src/calibre/ebooks/epub/from_any.py
@ -19,11 +19,6 @@ from calibre.utils.zipfile import ZipFile
 from calibre.customize.ui import run_plugins_on_preprocess
 def rtf2opf(path, tdir, opts):
    """
    Convert the RTF file at `path` to HTML inside the directory `tdir`.

    The work is delegated to generate_html(). The return value is the path
    tdir/metadata.opf; `opts` is accepted but not used by this function.
    """
    from calibre.ebooks.lrf.rtf.convert_from import generate_html
    generate_html(path, tdir)
    return os.path.join(tdir, 'metadata.opf')
 def epub2opf(path, tdir, opts):
    zf = ZipFile(path)
    zf.extractall(tdir)
@ -42,11 +37,6 @@ def epub2opf(path, tdir, opts):
        raise ValueError('%s is not a valid EPUB file'%path)
    return opf
 def odt2epub(path, tdir, opts):
    """
    Run the ODT Extract callable on `path` with `tdir` as its second
    argument and return whatever it returns.

    Side effect: opts.encoding is overwritten with 'utf-8'.
    """
    from calibre.ebooks.odt.to_oeb import Extract
    # presumably the extracted markup is always UTF-8 -- TODO confirm
    opts.encoding = 'utf-8'
    return Extract()(path, tdir)
 # File extensions (lower case, no leading dot) of the source formats listed
 # for this module; presumably the inputs it can convert -- verify against
 # the code that consumes SOURCE_FORMATS.
 SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf',
                  'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub']
--- a/src/calibre/ebooks/lrf/rtf/convert_from.py
+++ b/src/calibre/ebooks/lrf/rtf/convert_from.py
@ -1,190 +0,0 @@
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 import os, sys, shutil, logging, glob
 from lxml import etree
 from calibre.ebooks.lrf import option_parser as lrf_option_parser
 from calibre.ebooks.metadata.meta import get_metadata
 from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
 from calibre import setup_cli_handlers
 from calibre.libwand import convert, WandException
 from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
 from calibre.ebooks.lrf.rtf.xsl import xhtml
 from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.opf import OPFCreator 
 def option_parser():
    """
    Build the command line option parser for the RTF to LRF converter.

    Starts from the generic LRF option parser (with an RTF specific usage
    string) and adds the boolean flag --keep-intermediate-files, which
    defaults to False. Returns the parser.
    """
    parser = lrf_option_parser(
 _('''%prog [options] mybook.rtf
 %prog converts mybook.rtf to mybook.lrf''')
        )
    parser.add_option('--keep-intermediate-files', action='store_true', default=False)
    return parser
 def convert_images(html, logger):
    """
    Convert the WMF images found in the current working directory to JPEG.

    For every file matching *.wmf or *.WMF, a .jpg file with the same base
    name is written next to it and every occurrence of the WMF file name in
    `html` is replaced by the JPEG file name.

    :param html: Markup (a string) that may reference the WMF files by name.
    :param logger: Object with a warning() method, used to report failures.
    :return: `html` with the names of successfully converted images replaced.
    """
    # NOTE(review): on a case-insensitive filesystem both patterns may match
    # the same file, so it could be processed twice -- confirm.
    wmfs = glob.glob('*.wmf') + glob.glob('*.WMF')
    for wmf in wmfs:
        target = os.path.join(os.path.dirname(wmf), os.path.splitext(os.path.basename(wmf))[0]+'.jpg')
        try:
            convert(wmf, target)
            # Only rewrite the reference once the conversion has succeeded
            html = html.replace(os.path.basename(wmf), os.path.basename(target))
        except WandException, err:
            # Best effort: a failed image is logged and its reference in
            # html is left untouched.
            logger.warning(u'Unable to convert image %s with error: %s'%(wmf, unicode(err)))
            continue
    return html
 def process_file(path, options, logger=None):
    """
    Convert the RTF file at `path` to LRF (or LRS) via intermediate HTML.

    :param path: Path to the RTF file; ~ is expanded and the path is made
                 absolute.
    :param options: Parsed options object. The attributes verbose, output,
                    lrs, title, author, category and freetext are read, and
                    output/title/author/category/freetext may be overwritten.
    :param logger: Optional logger. When None, the 'rtf2lrf' logger is set up
                   at DEBUG level if options.verbose is true, else INFO.

    The intermediate files live in a temporary directory that is deleted at
    the end unless options.keep_intermediate_files is set.
    """
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('rtf2lrf')
        setup_cli_handlers(logger, level)
    rtf = os.path.abspath(os.path.expanduser(path))
    f = open(rtf, 'rb')
    mi = get_metadata(f, 'rtf')
    f.close()
    tdir = PersistentTemporaryDirectory('_rtf2lrf')
    # NOTE(review): this call is outside the try block below, so tdir is not
    # removed if generate_html() raises.
    html = generate_html(rtf, tdir)
    cwd = os.getcwdu()
    try:
        # Default output name: input base name with .lrs/.lrf, placed in the
        # current working directory.
        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))
        # Fall back to the file name when the RTF carries no title
        if not mi.title:
            mi.title = os.path.splitext(os.path.basename(rtf))[0]
        # Metadata read from the file only fills in options that are unset
        # or still 'Unknown'.
        if (not options.title or options.title == 'Unknown'):
            options.title = mi.title
        if (not options.author or options.author == 'Unknown') and mi.author:
            options.author = mi.author
        if (not options.category or options.category == 'Unknown') and mi.category:
            options.category = mi.category
        if (not options.freetext or options.freetext == 'Unknown') and mi.comments:
            options.freetext = mi.comments
        # The HTML conversion runs with the temporary directory as the
        # current directory; options.output was made absolute above.
        os.chdir(tdir)
        html_process_file(html, options, logger)
    finally:
        # Always restore the caller's working directory
        os.chdir(cwd)
        if hasattr(options, 'keep_intermediate_files') and options.keep_intermediate_files:
            logger.debug('Intermediate files in '+ tdir)
        else:
            shutil.rmtree(tdir)
 def main(args=sys.argv, logger=None):
    """
    Command line entry point.

    :param args: Argument list including the program name; after option
                 parsing exactly one positional argument (the RTF file) must
                 remain besides the program name.
    :param logger: Optional logger passed through to process_file().
    :return: 1 after printing the help text when the argument count is
             wrong, otherwise 0 once process_file() has returned.
    """
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print
        print 'No rtf file specified'
        return 1
    process_file(args[1], options, logger)
    return 0
 def generate_xml(rtfpath, tdir):
    """
    Run the rtf2xml parser on `rtfpath`, writing index.xml into `tdir`.

    The current working directory is switched to `tdir` while parsing and is
    restored afterwards, even if parsing raises.

    :return: The path of the generated XML file (tdir/index.xml).
    """
    from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
    ofile = os.path.join(tdir, 'index.xml')
    cwd = os.getcwdu()
    os.chdir(tdir)
    # NOTE(review): abspath() runs after the chdir above, so a relative
    # rtfpath would be resolved against tdir. generate_html() passes an
    # already absolute path, which is unaffected.
    rtfpath = os.path.abspath(rtfpath)
    try:
        parser = ParseRtf(
            in_file    = rtfpath,
            out_file   = ofile,
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol = 1,
            # Convert Zapf fonts to unicode equivalents. Default
            # is 1.
            convert_zapf = 1,
            # Convert Wingding fonts to unicode equivalents.
            # Default is 1.
            convert_wingdings = 1,
            # Convert RTF caps to real caps.
            # Default is 1.
            convert_caps = 1,
            # Indent resulting XML.
            # Default is 0 (no indent).
            indent = 1,
            # Form lists from RTF. Default is 1.
            form_lists = 1,
            # Convert headings to sections. Default is 0.
            headings_to_sections = 1,
            # Group paragraphs with the same style name. Default is 1.
            group_styles = 1,
            # Group borders. Default is 1.
            group_borders = 1,
            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs = 0,
        )
        parser.parse_rtf()
    finally:
        os.chdir(cwd)
    return ofile
 def generate_html(rtfpath, tdir):
    """
    Convert the RTF file `rtfpath` to index.html plus metadata.opf in `tdir`.

    Steps: RTF -> XML via generate_xml(), XML -> HTML via the XSLT
    stylesheet `xhtml`, then an OPF file whose manifest and spine contain
    only index.html. Progress messages are printed to stdout.

    :return: The path of the generated index.html.
    :raises Exception: with a user facing message when the RTF parser raises
                       RtfInvalidCodeException.
    """
    print 'Converting RTF to XML...'
    rtfpath = os.path.abspath(rtfpath)
    try:
        xml = generate_xml(rtfpath, tdir)
    except RtfInvalidCodeException:
        raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.'))
    tdir = os.path.dirname(xml)
    cwd = os.getcwdu()
    # Relative paths used below (metadata.opf) resolve against tdir
    os.chdir(tdir)
    try:
        print 'Parsing XML...'
        parser = etree.XMLParser(recover=True, no_network=True)
        try:
            doc = etree.parse(xml, parser)
        except:
            # NOTE(review): the bare raise re-raises immediately, so the
            # BeautifulStoneSoup clean-up fallback below is unreachable.
            raise
            print 'Parsing failed. Trying to clean up XML...'
            soup = BeautifulStoneSoup(open(xml, 'rb').read())
            doc = etree.fromstring(str(soup))
        print 'Converting XML to HTML...'
        styledoc = etree.fromstring(xhtml)
        transform = etree.XSLT(styledoc)
        result = transform(doc)
        tdir = os.path.dirname(xml)
        html = os.path.join(tdir, 'index.html')
        f = open(html, 'wb')
        res = transform.tostring(result)
        # Only the first 100 characters are searched for 'xmlns:html';
        # presumably this turns the prefixed namespace declaration on the
        # root element into the default namespace -- TODO confirm.
        res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
        f.write(res)
        f.close()
        # Metadata is best effort: any failure yields empty metadata that is
        # filled with defaults below.
        try:
            mi = get_metadata(open(rtfpath, 'rb'), 'rtf')
        except:
            mi = MetaInformation(None, None)
        if not mi.title:
            mi.title = os.path.splitext(os.path.basename(rtfpath))[0]
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(tdir, mi)
        opf.create_manifest([('index.html', None)])
        opf.create_spine(['index.html'])
        opf.render(open('metadata.opf', 'wb'))
    finally:
        os.chdir(cwd)
    return html
 # When executed as a script, run main() and use its return value as the
 # process exit status.
 if __name__ == '__main__':
    sys.exit(main())    
--- a/src/calibre/ebooks/lrf/rtf/init.py
+++ b/src/calibre/ebooks/lrf/rtf/init.py
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -0,0 +1,101 @@
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 import os
 from lxml import etree
 from calibre.customize.conversion import InputFormatPlugin
 class RTFInput(InputFormatPlugin):
    """
    Input plugin that turns an RTF document into XHTML plus an OPF file.

    The conversion goes RTF -> XML (rtf2xml) -> XHTML (XSLT). All files
    (out.xml, index.xhtml, metadata.opf) are created relative to the current
    working directory.
    """

    name        = 'RTF Input'
    author      = 'Kovid Goyal'
    description = 'Convert RTF files to HTML'
    file_types  = set(['rtf'])

    def generate_xml(self, stream):
        """
        Run the rtf2xml parser on `stream` and return the generated XML.

        The parser writes to the intermediate file out.xml in the current
        working directory; that file is read back and then deleted.

        :param stream: The RTF input, handed to ParseRtf as in_file.
        :return: The contents of the generated XML file as a string.
        """
        from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
        ofile = 'out.xml'
        parser = ParseRtf(
            in_file    = stream,
            out_file   = ofile,
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol = 1,
            # Convert Zapf fonts to unicode equivalents. Default
            # is 1.
            convert_zapf = 1,
            # Convert Wingding fonts to unicode equivalents.
            # Default is 1.
            convert_wingdings = 1,
            # Convert RTF caps to real caps.
            # Default is 1.
            convert_caps = 1,
            # Indent resulting XML.
            # Default is 0 (no indent).
            indent = 1,
            # Form lists from RTF. Default is 1.
            form_lists = 1,
            # Convert headings to sections. Default is 0.
            headings_to_sections = 1,
            # Group paragraphs with the same style name. Default is 1.
            group_styles = 1,
            # Group borders. Default is 1.
            group_borders = 1,
            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs = 0,
        )
        parser.parse_rtf()
        # Close the handle deterministically and remove the intermediate
        # file even when reading it fails. The file is closed by the with
        # block before the finally clause deletes it.
        try:
            with open(ofile) as f:
                return f.read()
        finally:
            os.remove(ofile)

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """
        Convert the RTF document in `stream` to index.xhtml and metadata.opf.

        :param stream: Seekable RTF input; it is rewound after parsing so the
                       metadata can be read from it.
        :param options: Not used by this method.
        :param file_ext: Not used by this method.
        :param log: Callable used for progress messages; stored as self.log.
        :param accelerators: Not used by this method.
        :return: Absolute path of the generated metadata.opf.
        :raises ValueError: when the RTF parser raises
                            RtfInvalidCodeException.
        """
        from calibre.ebooks.rtf.xsl import xhtml
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf import OPFCreator
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream)
        except RtfInvalidCodeException:
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.'))
        self.log('Parsing XML...')
        parser = etree.XMLParser(recover=True, no_network=True)
        doc = etree.fromstring(xml, parser=parser)
        self.log('Converting XML to HTML...')
        styledoc = etree.fromstring(xhtml)
        transform = etree.XSLT(styledoc)
        result = transform(doc)
        html = 'index.xhtml'
        with open(html, 'wb') as f:
            res = transform.tostring(result)
            # Only the first 100 characters are searched for 'xmlns:html';
            # presumably this turns the prefixed namespace declaration on
            # the root element into the default namespace -- TODO confirm.
            res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
            f.write(res)
        # The parser consumed the stream; rewind before reading metadata
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(os.getcwd(), mi)
        opf.create_manifest([('index.xhtml', None)])
        opf.create_spine(['index.xhtml'])
        # Make sure the OPF is flushed and closed before its path is
        # handed back to the caller.
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath('metadata.opf')
--- a/src/calibre/ebooks/lrf/rtf/xsl.py
+++ b/src/calibre/ebooks/lrf/rtf/xsl.py
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -152,6 +152,7 @@ class ParseRtf:
    def __check_file(self, the_file, type):
        """Check to see if files exist"""
        if hasattr(the_file, 'read'): return
        if the_file == None:
            if type == "file_to_parse":
                message = "You must provide a file for the script to work"
@ -545,13 +546,12 @@ class ParseRtf:
    def __make_temp_file(self,file):
        """Make a temporary file to parse

        Copies the contents of `file` (a path, or an object with a read()
        method) into the file "rtf_write_file" in the current working
        directory and returns that file name.
        """
        write_file="rtf_write_file"
-        read_obj = open(file,'r')
+        read_obj = file if hasattr(file, 'read') else open(file,'r')
        write_obj = open(write_file, 'w')
        # Non-empty sentinel so that the loop body runs at least once
        line = "dummy"
        # Copy in chunks of up to 1000; the loop stops once read() returns
        # an empty result.
        while line:
            line = read_obj.read(1000)
            write_obj.write(line )
        # NOTE(review): when a file-like object was passed in, this also
        # closes the caller's object -- confirm that no caller needs to
        # reuse the stream afterwards.
        read_obj.close()
        write_obj.close()
        return write_file
    """
--- a/src/calibre/ebooks/rtf2xml/pict.py
+++ b/src/calibre/ebooks/rtf2xml/pict.py
@ -58,10 +58,12 @@ class Pict:
        return line[18:]
    def __make_dir(self):
        """ Make a dirctory to put the image data in"""
-        base_name = os.path.basename(self.__orig_file)
+        base_name = os.path.basename(getattr(self.__orig_file, 'name',
            self.__orig_file))
        base_name = os.path.splitext(base_name)[0]
        if self.__out_file:
-            dir_name = os.path.dirname(self.__out_file)
+            dir_name = os.path.dirname(getattr(self.__out_file, 'name',
                self.__out_file))
        else:
            dir_name = os.path.dirname(self.__orig_file)
        # self.__output_to_file_func()