Pull from driver-dev

2025-07-09 03:04:10 -04:00 · 2009-05-29 00:13:53 -07:00 · 2009-05-29 00:13:53 -07:00 · eb2d348103
commit eb2d348103
parent 8072415891 08af3996bf
10 changed files with 341 additions and 49 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -337,6 +337,7 @@ from calibre.ebooks.pdb.output import PDBOutput
 from calibre.ebooks.pdf.output import PDFOutput
 from calibre.ebooks.pml.output import PMLOutput
 from calibre.ebooks.rb.output import RBOutput
 from calibre.ebooks.rtf.output import RTFOutput
 from calibre.ebooks.txt.output import TXTOutput
 from calibre.customize.profiles import input_profiles, output_profiles
@ -382,6 +383,7 @@ plugins += [
    PDFOutput,
    PMLOutput,
    RBOutput,
    RTFOutput,
    TXTOutput,
 ]
 plugins += [
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -176,7 +176,7 @@ class HTMLPreProcessor(object):
        elif self.is_pdftohtml(html):
            line_length_rules = [
                # Un wrap using punctuation
-                (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
+                (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .3), re.UNICODE), wrap_lines),
            ]
            rules = self.PDFTOHTML + line_length_rules
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@ -44,6 +44,12 @@ class FB2MLizer(object):
    def fb2mlize_spine(self):
        output = self.fb2_header()
        if 'titlepage' in self.oeb_book.guide:
            href = self.oeb_book.guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            if item.spine_position is None:
                stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
                output += self.dump_text(item.data.find(XHTML('body')), stylizer)
        for item in self.oeb_book.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer)
@ -98,25 +104,27 @@ class FB2MLizer(object):
            return u''
        tag = barename(elem.tag)
        tag_count = 0
        if tag == 'img':
            fb2_text += '<image xlink:herf="#%s" />' % os.path.basename(elem.attrib['src'])
-        tag_count = 0
+
-        if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
+        fb2_tag = TAG_MAP.get(tag, 'p')
-            fb2_tag = TAG_MAP.get(tag, 'p')
+        if fb2_tag and fb2_tag not in tag_stack:
-            if fb2_tag and fb2_tag not in tag_stack:
+            tag_count += 1
            fb2_text += '<%s>' % fb2_tag
            tag_stack.append(fb2_tag)
        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag:
                tag_count += 1
-                fb2_text += '<%s>' % fb2_tag
+                fb2_text += '<%s>' % style_tag
-                tag_stack.append(fb2_tag)
+                tag_stack.append(style_tag)
            # Processes style information
            for s in STYLES:
                style_tag = s[1].get(style[s[0]], None)
                if style_tag:
                    tag_count += 1
                    fb2_text += '<%s>' % style_tag
                    tag_stack.append(style_tag)
        if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
            fb2_text += elem.text
        for item in elem:
--- a/src/calibre/ebooks/pml/output.py
+++ b/src/calibre/ebooks/pml/output.py
@ -6,7 +6,8 @@ __docformat__ = 'restructuredtext en'
 import os
-import Image, cStringIO
+import Image
 import cStringIO
 from calibre.customize.conversion import OutputFormatPlugin
 from calibre.ptempfile import TemporaryDirectory
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@ -78,6 +78,12 @@ class PMLMLizer(object):
    def pmlmlize_spine(self):
        output = u''
        if 'titlepage' in self.oeb_book.guide:
            href = self.oeb_book.guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            if item.spine_position is None:
                stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
                output += self.dump_text(item.data.find(XHTML('body')), stylizer)
        for item in self.oeb_book.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            output += self.add_page_anchor(item.href)
@ -153,39 +159,39 @@ class PMLMLizer(object):
        #if style['page-break-before'] == 'always':
        #    text += '\\p'
        pml_tag = TAG_MAP.get(tag, None)
        if pml_tag and pml_tag not in tag_stack:
            tag_count += 1
            text += '\\%s' % pml_tag
            tag_stack.append(pml_tag)
        # Special processing of tags that require an argument.
        # Anchors links
        if tag in LINK_TAGS and 'q' not in tag_stack:
            href = elem.get('href')
            if href and '://' not in href:
                if '#' in href:
                    href = href.partition('#')[2]
                href = os.path.splitext(os.path.basename(href))[0]
                tag_count += 1
                text += '\\q="#%s"' % href
                tag_stack.append('q')
        # Anchor ids
        id_name = elem.get('id')
        if id_name:
            text += '\\Q="%s"' % os.path.splitext(id_name)[0]
        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack:
                tag_count += 1
                text += '\\%s' % style_tag
                tag_stack.append(style_tag)
        # margin
        # Proccess tags that contain text.
        if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
            pml_tag = TAG_MAP.get(tag, None)
            if pml_tag and pml_tag not in tag_stack:
                tag_count += 1
                text += '\\%s' % pml_tag
                tag_stack.append(pml_tag)
            # Special processing of tags that require an argument.
            # Anchors links
            if tag in LINK_TAGS and 'q' not in tag_stack:
                href = elem.get('href')
                if href and '://' not in href:
                    if '#' in href:
                        href = href.partition('#')[2]
                    href = os.path.splitext(os.path.basename(href))[0]
                    tag_count += 1
                    text += '\\q="#%s"' % href
                    tag_stack.append('q')
            # Anchor ids
            id_name = elem.get('id')
            if id_name:
                text += '\\Q="%s"' % os.path.splitext(id_name)[0]
            # Processes style information
            for s in STYLES:
                style_tag = s[1].get(style[s[0]], None)
                if style_tag and style_tag not in tag_stack:
                    tag_count += 1
                    text += '\\%s' % style_tag
                    tag_stack.append(style_tag)
            # margin
            text += self.elem_text(elem, tag_stack)
        for item in elem:
--- a/src/calibre/ebooks/rb/rbml.py
+++ b/src/calibre/ebooks/rb/rbml.py
@ -65,6 +65,12 @@ class RBMLizer(object):
    def mlize_spine(self):
        output = u'<HTML><HEAD><TITLE></TITLE></HEAD><BODY>'
        if 'titlepage' in self.oeb_book.guide:
            href = self.oeb_book.guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            if item.spine_position is None:
                stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
                output += self.dump_text(item.data.find(XHTML('body')), stylizer)
        for item in self.oeb_book.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            output += self.add_page_anchor(item.href)
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -70,7 +70,7 @@ class RTFInput(InputFormatPlugin):
        self.log = log
        self.log('Converting RTF to XML...')
        try:
-            xml = self.generate_xml(stream)
+            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException:
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.'))
--- a/src/calibre/ebooks/rtf/output.py
+++ b/src/calibre/ebooks/rtf/output.py
@ -0,0 +1,36 @@
 # -*- coding: utf-8 -*-
 __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import os
 from calibre.ebooks.rtf.rtfml import RTFMLizer
 from calibre.customize.conversion import OutputFormatPlugin
 class RTFOutput(OutputFormatPlugin):
    '''
    Output format plugin that serialises an OEB book to an RTF file.
    '''

    name = 'RTF Output'
    author = 'John Schember'
    file_type = 'rtf'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` as RTF markup and write it to ``output_path``.

        :param oeb_book: book handed to ``RTFMLizer.extract_content``.
        :param output_path: either a filesystem path, or an object with a
            ``write`` method that is used as the destination stream as-is.
        :param input_plugin: unused by this method.
        :param opts: conversion options; ``linearize_tables`` is read here
            and the whole object is passed on to the mlizer.
        :param log: unused by this method.
        '''
        rtfmlizer = RTFMLizer(ignore_tables=opts.linearize_tables)
        content = rtfmlizer.extract_content(oeb_book, opts)

        # Only a stream opened here is closed here; a caller-supplied stream
        # stays open.
        owns_stream = not hasattr(output_path, 'write')
        if owns_stream:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            # Discard whatever the destination already holds.
            out_stream.seek(0)
            out_stream.truncate()
            # Characters outside ASCII are replaced by '?'.
            out_stream.write(content.encode('ascii', 'replace'))
        finally:
            # Release the file handle even if writing fails part-way.
            if owns_stream:
                out_stream.close()
--- a/src/calibre/ebooks/rtf/rtfml.py
+++ b/src/calibre/ebooks/rtf/rtfml.py
@ -0,0 +1,234 @@
 # -*- coding: utf-8 -*-
 __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 '''
 Transform OEB content into RTF markup
 '''
 import os
 import re
 import Image
 import cStringIO
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, \
    OEB_IMAGES
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks.metadata import authors_to_string
 # XHTML tag name -> RTF control words written when the element opens.
 # dump_text emits them as the start of a '{...' group and closes the group
 # with '}' once the element's children have been processed.
 TAGS = {
    'b': '\\b',
    'del': '\\deleted',
    'h1': '\\b \\par \\pard \\hyphpar',
    'h2': '\\b \\par \\pard \\hyphpar',
    'h3': '\\b \\par \\pard \\hyphpar',
    'h4': '\\b \\par \\pard \\hyphpar',
    'h5': '\\b \\par \\pard \\hyphpar',
    'h6': '\\b \\par \\pard \\hyphpar',
    'li': '\\par \\pard \\hyphpar \t',
    'p': '\\par \\pard \\hyphpar \t',
    'sub': '\\sub',
    'sup': '\\super',
    'u': '\\ul',
 }
 # Tag name -> markup appended verbatim (no group is opened) as soon as the
 # element is encountered.
 SINGLE_TAGS = {
    'br': '\n{\\line }\n',
    'div': '\n{\\line }\n',
 }
 # Tag name -> markup appended verbatim after the element's children, before
 # its tail text. 'div' appears in both tables, so it gets a line break on
 # each side.
 SINGLE_TAGS_END = {
    'div': '\n{\\line }\n',
 }
 # (CSS property, {computed value: RTF control words}) pairs. dump_text walks
 # this list in order and opens one group for every property whose computed
 # value has an entry, unless that control word is already on the tag stack.
 STYLES = [
    ('display', {'block': '\\par \\pard \\hyphpar'}),
    ('font-weight', {'bold': '\\b', 'bolder': '\\b'}),
    ('font-style', {'italic': '\\i'}),
    ('text-align', {'center': '\\qc', 'left': '\\ql', 'right': '\\qr'}),
    ('text-decoration', {'line-through': '\\strike', 'underline': '\\ul'}),
 ]
 # Tags that make dump_text consider itself "inside a paragraph block"
 # (it pushes the marker 'block' on the tag stack).
 BLOCK_TAGS = [
    'p',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'li',
 ]
 # Values of the CSS 'display' property that have the same effect as a tag
 # listed in BLOCK_TAGS.
 BLOCK_STYLES = [
    'block'
 ]
 '''
 TODO:
    * Tables
    * Fonts
 '''
 class RTFMLizer(object):
    '''
    Serialise the XHTML documents of an OEB book into one RTF markup string.

    ``extract_content`` is the entry point; the other methods are the steps
    it runs.
    '''

    def __init__(self, ignore_tables=False):
        # Only stored: no method of this class reads the flag.
        self.ignore_tables = ignore_tables

    def extract_content(self, oeb_book, opts):
        '''
        Convert ``oeb_book`` to RTF markup and return it as a string.

        :param oeb_book: book whose logger, guide, manifest, spine and
            metadata are read.
        :param opts: conversion options; ``output_profile`` is the only
            attribute used by this class.
        '''
        oeb_book.logger.info('Converting XHTML to RTF markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Assemble the whole document: header, optional title page, every
        spine item and the footer, then embed images and tidy whitespace.
        '''
        output = self.header()
        if 'titlepage' in self.oeb_book.guide:
            href = self.oeb_book.guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            # A title page that has a spine position is rendered by the loop
            # below; render it here only when it has none.
            if item.spine_position is None:
                output += self._render_item(item)
                output += '{\\page } '
        for item in self.oeb_book.spine:
            output += self._render_item(item)
        output += self.footer()
        output = self.insert_images(output)
        output = self.clean_text(output)
        return output

    def _render_item(self, item):
        # Render the <body> of one manifest item with its own stylizer.
        stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
        return self.dump_text(item.data.find(XHTML('body')), stylizer)

    def header(self):
        '''
        Return the opening of the RTF document, including an info group with
        the first title and all creators of the book.
        '''
        metadata = self.oeb_book.metadata
        title = metadata.title[0].value
        authors = authors_to_string([x.value for x in metadata.creator])
        return u'{\\rtf1{\\info{\\title %s}{\\author %s}}\\ansi\\ansicpg1252\\deff0\\deflang1033' % (title, authors)

    def footer(self):
        '''
        Return the markup that closes the group opened by ``header``.
        '''
        return ' }'

    def insert_images(self, text):
        '''
        Replace each ``SPECIAL_IMAGE-<name>-REPLACE_ME`` placeholder written
        by ``dump_text`` with the hex-encoded picture group of the manifest
        image whose file name is ``<name>``.
        '''
        for item in self.oeb_book.manifest:
            if item.media_type not in OEB_IMAGES:
                continue
            placeholder = 'SPECIAL_IMAGE-%s-REPLACE_ME' % os.path.basename(item.href)
            if placeholder not in text:
                # Not referenced by any rendered <img>: skip the re-encode.
                continue
            data, width, height = self.image_to_hexstring(item.data)
            text = text.replace(placeholder, '\n\n{\\*\\shppict{\\pict\\picw%i\\pich%i\\jpegblip \n%s\n}}\n\n' % (width, height, data))
        return text

    def image_to_hexstring(self, data):
        '''
        Re-encode raw image bytes as JPEG.

        :return: ``(hex_string, width, height)`` where ``hex_string`` is the
            JPEG data as lowercase hex digits broken into lines.
        '''
        im = Image.open(cStringIO.StringIO(data))
        # The JPEG writer rejects some modes (e.g. palette images), so
        # convert anything it may not accept to RGB first.
        if im.mode not in ('1', 'L', 'RGB', 'CMYK', 'YCbCr'):
            im = im.convert('RGB')
        jpeg = cStringIO.StringIO()
        im.save(jpeg, 'JPEG')

        raw_hex = ''.join(['%02x' % ord(char) for char in jpeg.getvalue()])
        return (self._wrap_hex(raw_hex), im.size[0], im.size[1])

    @staticmethod
    def _wrap_hex(raw_hex, width=128):
        # Break the hex dump into lines of at most ``width`` characters so
        # that no line of image data exceeds the 129 character limit.
        return '\n'.join([raw_hex[i:i + width] for i in range(0, len(raw_hex), width)])

    def clean_text(self, text):
        '''
        Tidy the generated markup: trim spaces at line edges, collapse runs
        of blank lines, spaces and ``{\\line }`` breaks, and replace
        non-breaking spaces with plain ones.
        '''
        # Remove excess spaces at beginning and end of lines
        text = re.sub('(?m)^[ ]+', '', text)
        text = re.sub('(?m)[ ]+$', '', text)

        # Remove excessive newlines. The separator is grouped so the
        # quantifier applies to all of it, not just to its last character
        # when os.linesep is two characters long.
        text = re.sub('(?:%s){3,}' % re.escape(os.linesep), '%s%s' % (os.linesep, os.linesep), text)

        # Remove excessive spaces
        text = re.sub('[ ]{2,}', ' ', text)

        # Never keep more than two consecutive line breaks
        text = re.sub(r'(\{\\line \}\s*){3,}', r'{\\line }{\\line }', text)

        # Remove non-breaking spaces
        text = text.replace(u'\xa0', ' ')
        text = text.replace('\n\r', '\n')

        return text

    def dump_text(self, elem, stylizer, tag_stack=None):
        '''
        Return the RTF markup for ``elem``, its children and its tail text.

        :param elem: element to render; anything that is not an XHTML
            element, is hidden or has a non-rendered display value yields
            an empty string.
        :param stylizer: object whose ``style(elem)`` gives the computed
            style of an element.
        :param tag_stack: control words of the groups currently open plus
            the marker ``'block'``; shared with recursive calls. Omit it
            for a top-level call to start with an empty stack.
        '''
        # A fresh list per top-level call: a list literal as default value
        # would be shared by every call.
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return u''

        text = u''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return u''

        tag = barename(elem.tag)
        # Number of entries this call pushed on tag_stack; exactly that many
        # are popped again below.
        tag_count = 0

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if 'block' not in tag_stack:
                tag_count += 1
                tag_stack.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = elem.get('src')
            # An <img> without a usable src cannot be matched to a manifest
            # image, so no placeholder is written for it.
            if src:
                src = os.path.basename(src)
                block_start = ''
                block_end = ''
                if 'block' not in tag_stack:
                    block_start = '{\\par \\pard \\hyphpar '
                    block_end = '}'
                text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end)

        single_tag = SINGLE_TAGS.get(tag, None)
        if single_tag:
            text += single_tag

        rtf_tag = TAGS.get(tag, None)
        if rtf_tag and rtf_tag not in tag_stack:
            tag_count += 1
            text += '{%s\n' % rtf_tag
            tag_stack.append(rtf_tag)

        # Process style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack:
                tag_count += 1
                text += '{%s\n' % style_tag
                tag_stack.append(style_tag)

        # Process tags that contain text.
        elem_text = getattr(elem, 'text', None)
        if elem_text is not None and elem_text.strip() != '':
            text += '%s' % elem_text

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        # Close every group opened above; the 'block' marker opened none.
        for _ in range(tag_count):
            end_tag = tag_stack.pop()
            if end_tag != 'block':
                text += u'}'

        single_tag_end = SINGLE_TAGS_END.get(tag, None)
        if single_tag_end:
            text += single_tag_end

        elem_tail = getattr(elem, 'tail', None)
        if elem_tail is not None and elem_tail.strip() != '':
            if 'block' in tag_stack:
                text += '%s ' % elem_tail
            else:
                # Tail text outside any block gets a paragraph of its own.
                text += '{\\par \\pard \\hyphpar %s}' % elem_tail

        return text
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@ -9,7 +9,6 @@ import os
 from calibre.customize.conversion import OutputFormatPlugin, \
    OptionRecommendation
 from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
 from calibre.ebooks.metadata import authors_to_string
 class TXTOutput(OutputFormatPlugin):