Pull from driver-dev

This commit is contained in:
Kovid Goyal 2009-05-29 00:13:53 -07:00
commit eb2d348103
10 changed files with 341 additions and 49 deletions

View File

@ -337,6 +337,7 @@ from calibre.ebooks.pdb.output import PDBOutput
from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.output import PMLOutput from calibre.ebooks.pml.output import PMLOutput
from calibre.ebooks.rb.output import RBOutput from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.txt.output import TXTOutput
from calibre.customize.profiles import input_profiles, output_profiles from calibre.customize.profiles import input_profiles, output_profiles
@ -382,6 +383,7 @@ plugins += [
PDFOutput, PDFOutput,
PMLOutput, PMLOutput,
RBOutput, RBOutput,
RTFOutput,
TXTOutput, TXTOutput,
] ]
plugins += [ plugins += [

View File

@ -176,7 +176,7 @@ class HTMLPreProcessor(object):
elif self.is_pdftohtml(html): elif self.is_pdftohtml(html):
line_length_rules = [ line_length_rules = [
# Un wrap using punctuation # Un wrap using punctuation
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines), (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .3), re.UNICODE), wrap_lines),
] ]
rules = self.PDFTOHTML + line_length_rules rules = self.PDFTOHTML + line_length_rules

View File

@ -44,6 +44,12 @@ class FB2MLizer(object):
def fb2mlize_spine(self): def fb2mlize_spine(self):
output = self.fb2_header() output = self.fb2_header()
if 'titlepage' in self.oeb_book.guide:
href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer) output += self.dump_text(item.data.find(XHTML('body')), stylizer)
@ -98,25 +104,27 @@ class FB2MLizer(object):
return u'' return u''
tag = barename(elem.tag) tag = barename(elem.tag)
tag_count = 0
if tag == 'img': if tag == 'img':
fb2_text += '<image xlink:herf="#%s" />' % os.path.basename(elem.attrib['src']) fb2_text += '<image xlink:herf="#%s" />' % os.path.basename(elem.attrib['src'])
tag_count = 0
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': fb2_tag = TAG_MAP.get(tag, 'p')
fb2_tag = TAG_MAP.get(tag, 'p') if fb2_tag and fb2_tag not in tag_stack:
if fb2_tag and fb2_tag not in tag_stack: tag_count += 1
fb2_text += '<%s>' % fb2_tag
tag_stack.append(fb2_tag)
# Processes style information
for s in STYLES:
style_tag = s[1].get(style[s[0]], None)
if style_tag:
tag_count += 1 tag_count += 1
fb2_text += '<%s>' % fb2_tag fb2_text += '<%s>' % style_tag
tag_stack.append(fb2_tag) tag_stack.append(style_tag)
# Processes style information
for s in STYLES:
style_tag = s[1].get(style[s[0]], None)
if style_tag:
tag_count += 1
fb2_text += '<%s>' % style_tag
tag_stack.append(style_tag)
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
fb2_text += elem.text fb2_text += elem.text
for item in elem: for item in elem:

View File

@ -6,7 +6,8 @@ __docformat__ = 'restructuredtext en'
import os import os
import Image, cStringIO import Image
import cStringIO
from calibre.customize.conversion import OutputFormatPlugin from calibre.customize.conversion import OutputFormatPlugin
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory

View File

@ -78,6 +78,12 @@ class PMLMLizer(object):
def pmlmlize_spine(self): def pmlmlize_spine(self):
output = u'' output = u''
if 'titlepage' in self.oeb_book.guide:
href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
output += self.add_page_anchor(item.href) output += self.add_page_anchor(item.href)
@ -153,39 +159,39 @@ class PMLMLizer(object):
#if style['page-break-before'] == 'always': #if style['page-break-before'] == 'always':
# text += '\\p' # text += '\\p'
pml_tag = TAG_MAP.get(tag, None)
if pml_tag and pml_tag not in tag_stack:
tag_count += 1
text += '\\%s' % pml_tag
tag_stack.append(pml_tag)
# Special processing of tags that require an argument.
# Anchors links
if tag in LINK_TAGS and 'q' not in tag_stack:
href = elem.get('href')
if href and '://' not in href:
if '#' in href:
href = href.partition('#')[2]
href = os.path.splitext(os.path.basename(href))[0]
tag_count += 1
text += '\\q="#%s"' % href
tag_stack.append('q')
# Anchor ids
id_name = elem.get('id')
if id_name:
text += '\\Q="%s"' % os.path.splitext(id_name)[0]
# Processes style information
for s in STYLES:
style_tag = s[1].get(style[s[0]], None)
if style_tag and style_tag not in tag_stack:
tag_count += 1
text += '\\%s' % style_tag
tag_stack.append(style_tag)
# margin
# Proccess tags that contain text. # Proccess tags that contain text.
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
pml_tag = TAG_MAP.get(tag, None)
if pml_tag and pml_tag not in tag_stack:
tag_count += 1
text += '\\%s' % pml_tag
tag_stack.append(pml_tag)
# Special processing of tags that require an argument.
# Anchors links
if tag in LINK_TAGS and 'q' not in tag_stack:
href = elem.get('href')
if href and '://' not in href:
if '#' in href:
href = href.partition('#')[2]
href = os.path.splitext(os.path.basename(href))[0]
tag_count += 1
text += '\\q="#%s"' % href
tag_stack.append('q')
# Anchor ids
id_name = elem.get('id')
if id_name:
text += '\\Q="%s"' % os.path.splitext(id_name)[0]
# Processes style information
for s in STYLES:
style_tag = s[1].get(style[s[0]], None)
if style_tag and style_tag not in tag_stack:
tag_count += 1
text += '\\%s' % style_tag
tag_stack.append(style_tag)
# margin
text += self.elem_text(elem, tag_stack) text += self.elem_text(elem, tag_stack)
for item in elem: for item in elem:

View File

@ -65,6 +65,12 @@ class RBMLizer(object):
def mlize_spine(self): def mlize_spine(self):
output = u'<HTML><HEAD><TITLE></TITLE></HEAD><BODY>' output = u'<HTML><HEAD><TITLE></TITLE></HEAD><BODY>'
if 'titlepage' in self.oeb_book.guide:
href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
output += self.add_page_anchor(item.href) output += self.add_page_anchor(item.href)

View File

@ -70,7 +70,7 @@ class RTFInput(InputFormatPlugin):
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
try: try:
xml = self.generate_xml(stream) xml = self.generate_xml(stream.name)
except RtfInvalidCodeException: except RtfInvalidCodeException:
raise ValueError(_('This RTF file has a feature calibre does not ' raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.')) 'support. Convert it to HTML first and then try it.'))

View File

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.ebooks.rtf.rtfml import RTFMLizer
from calibre.customize.conversion import OutputFormatPlugin
class RTFOutput(OutputFormatPlugin):
    '''
    Conversion output plugin that serialises an OEB book as an RTF file.
    '''

    name = 'RTF Output'
    author = 'John Schember'
    file_type = 'rtf'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` as RTF markup and write it to ``output_path``.

        :param oeb_book: The book to convert; handed to ``RTFMLizer``.
        :param output_path: Either a filesystem path or an object with a
            ``write`` method. A path is opened in binary mode (creating the
            parent directory if needed) and closed again before returning;
            a file-like object is rewound, truncated and left open.
        :param input_plugin: Unused here.
        :param opts: Conversion options; ``opts.linearize_tables`` is passed
            to the markup generator as ``ignore_tables``.
        :param log: Unused here.
        '''
        mlizer = RTFMLizer(ignore_tables=opts.linearize_tables)
        content = mlizer.extract_content(oeb_book, opts)

        # Anything without a write() method is treated as a path that this
        # method is responsible for opening and closing.
        opened_here = not hasattr(output_path, 'write')
        if opened_here:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            # The markup is written as 7-bit ASCII; characters outside that
            # range are substituted with '?' by the 'replace' error handler.
            out_stream.write(content.encode('ascii', 'replace'))
        finally:
            # Release the handle even when writing fails, but never close a
            # stream that belongs to the caller.
            if opened_here:
                out_stream.close()

View File

@ -0,0 +1,234 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into RTF markup
'''
import os
import re
import Image
import cStringIO
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, \
OEB_IMAGES
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.metadata import authors_to_string
# Control words that start a fresh, hyphenated paragraph; shared by several of
# the tables below.
_PARAGRAPH = '\\par \\pard \\hyphpar'
# Headings are rendered as a bold paragraph.
_HEADING = '\\b ' + _PARAGRAPH
# A forced line break wrapped in its own group.
_LINE_BREAK = '\n{\\line }\n'

# XHTML tag name -> RTF control words emitted when a group is opened for it.
TAGS = {
    'b': '\\b',
    'u': '\\ul',
    'del': '\\deleted',
    'sub': '\\sub',
    'sup': '\\super',
    'p': _PARAGRAPH + ' \t',
    'li': _PARAGRAPH + ' \t',
}
# h1 .. h6 all share the same markup.
TAGS.update(('h%d' % level, _HEADING) for level in range(1, 7))

# XHTML tag name -> markup emitted once when the element starts.
SINGLE_TAGS = {'br': _LINE_BREAK, 'div': _LINE_BREAK}

# XHTML tag name -> markup emitted once after the element's content.
SINGLE_TAGS_END = {'div': _LINE_BREAK}

# (CSS property, {property value: RTF control words}) pairs, consulted in
# this order for every element.
STYLES = [
    ('display', {'block': _PARAGRAPH}),
    ('font-weight', {'bold': '\\b', 'bolder': '\\b'}),
    ('font-style', {'italic': '\\i'}),
    ('text-align', {'center': '\\qc', 'left': '\\ql', 'right': '\\qr'}),
    ('text-decoration', {'line-through': '\\strike', 'underline': '\\ul'}),
]

# Tags whose content counts as being inside a paragraph block.
BLOCK_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']

# Values of the CSS ``display`` property that count as a paragraph block.
BLOCK_STYLES = ['block']
'''
TODO:
* Tables
* Fonts
'''
class RTFMLizer(object):
    '''
    Serialise the XHTML content of an OEB book as a single RTF document.

    Call :meth:`extract_content` with the book and the conversion options;
    the return value is the complete RTF markup as one string.
    '''

    def __init__(self, ignore_tables=False):
        # Only stored; none of the methods in this class read it yet.
        self.ignore_tables = ignore_tables

    def extract_content(self, oeb_book, opts):
        '''
        Convert ``oeb_book`` to RTF markup and return it.

        ``opts.output_profile`` is used when computing element styles.
        '''
        oeb_book.logger.info('Converting XHTML to RTF markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Build the whole document: header, optional title page, every spine
        item, footer, then embed the images and tidy up the whitespace.
        '''
        output = self.header()
        if 'titlepage' in self.oeb_book.guide:
            href = self.oeb_book.guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            # A title page that is already part of the spine is emitted by
            # the loop below, so only add it here when it is not.
            if item.spine_position is None:
                stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
                output += self.dump_text(item.data.find(XHTML('body')), stylizer)
                output += '{\\page } '
        for item in self.oeb_book.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer)
        output += self.footer()
        output = self.insert_images(output)
        output = self.clean_text(output)
        return output

    def header(self):
        '''
        Return the RTF prologue, including the title and authors taken from
        the book metadata.
        '''
        return u'{\\rtf1{\\info{\\title %s}{\\author %s}}\\ansi\\ansicpg1252\\deff0\\deflang1033' % (self.oeb_book.metadata.title[0].value, authors_to_string([x.value for x in self.oeb_book.metadata.creator]))

    def footer(self):
        '''
        Return the markup that closes the group opened by :meth:`header`.
        '''
        return ' }'

    def insert_images(self, text):
        '''
        Replace every ``SPECIAL_IMAGE-<name>-REPLACE_ME`` placeholder written
        by :meth:`dump_text` with the embedded picture data of the manifest
        image that has the same base name.
        '''
        for item in self.oeb_book.manifest:
            if item.media_type not in OEB_IMAGES:
                continue
            placeholder = 'SPECIAL_IMAGE-%s-REPLACE_ME' % os.path.basename(item.href)
            # Re-encoding an image is expensive; skip the ones the text
            # never refers to.
            if placeholder not in text:
                continue
            data, width, height = self.image_to_hexstring(item.data)
            text = text.replace(placeholder, '\n\n{\\*\\shppict{\\pict\\picw%i\\pich%i\\jpegblip \n%s\n}}\n\n' % (width, height, data))
        return text

    def image_to_hexstring(self, data):
        '''
        Re-encode raw image ``data`` as JPEG.

        :return: ``(hex_string, width, height)`` where ``hex_string`` is the
            JPEG data as line-wrapped hexadecimal text and the dimensions
            are those reported by the imaging library.
        '''
        im = Image.open(cStringIO.StringIO(data))
        jpeg = cStringIO.StringIO()
        im.save(jpeg, 'JPEG')
        return (self._hex_encode(jpeg.getvalue()), im.size[0], im.size[1])

    def _hex_encode(self, data):
        '''
        Return ``data`` (a byte string) as lowercase hex, two digits per
        byte, broken into lines of 128 characters without a trailing newline.
        '''
        # bytearray yields integers for both Python 2 str and Python 3 bytes.
        raw_hex = ''.join(['%02x' % byte for byte in bytearray(data)])
        return '\n'.join([raw_hex[pos:pos + 128] for pos in range(0, len(raw_hex), 128)])

    def clean_text(self, text):
        '''
        Normalise the whitespace of the generated markup and return it.
        '''
        # Remove excess spaces at beginning and end of lines
        text = re.sub('(?m)^[ ]+', '', text)
        text = re.sub('(?m)[ ]+$', '', text)

        # Remove excessive newlines. The separator is escaped and grouped so
        # that the quantifier applies to the whole of a multi-character
        # separator such as '\r\n' and not just to its last character.
        text = re.sub('(?:%s){3,}' % re.escape(os.linesep), '%s%s' % (os.linesep, os.linesep), text)

        # Remove excessive spaces
        text = re.sub('[ ]{2,}', ' ', text)

        # Collapse runs of three or more line breaks into two
        text = re.sub(r'(\{\\line \}\s*){3,}', r'{\\line }{\\line }', text)

        # Remove non-breaking spaces
        text = text.replace(u'\xa0', ' ')
        text = text.replace('\n\r', '\n')

        return text

    def dump_text(self, elem, stylizer, tag_stack=None):
        '''
        Recursively convert ``elem`` and its descendants to RTF markup.

        :param elem: Element to convert. Anything that is not an XHTML
            element, or that is hidden by its style, yields ``u''``.
        :param stylizer: Object whose ``style(elem)`` gives the computed
            style of an element.
        :param tag_stack: Markup groups currently open in the enclosing
            elements. Leave it unset for a top level call; the recursion
            passes it along. Every entry this call pushes is popped again
            before it returns.
        :return: The markup for ``elem`` followed by its tail text.
        '''
        # A fresh list per top level call: a list used as the default value
        # would be shared by all calls and stay polluted after an exception.
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return u''

        text = u''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return u''

        tag = barename(elem.tag)
        # Number of entries this call pushes onto tag_stack
        tag_count = 0

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if 'block' not in tag_stack:
                tag_count += 1
                tag_stack.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = elem.get('src')
            # An image without a source cannot be matched to a manifest item
            # later on, so no placeholder is written for it.
            if src:
                src = os.path.basename(src)
                block_start = ''
                block_end = ''
                if 'block' not in tag_stack:
                    block_start = '{\\par \\pard \\hyphpar '
                    block_end = '}'
                text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end)

        single_tag = SINGLE_TAGS.get(tag, None)
        if single_tag:
            text += single_tag

        rtf_tag = TAGS.get(tag, None)
        if rtf_tag and rtf_tag not in tag_stack:
            tag_count += 1
            text += '{%s\n' % rtf_tag
            tag_stack.append(rtf_tag)

        # Process style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack:
                tag_count += 1
                text += '{%s\n' % style_tag
                tag_stack.append(style_tag)

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text is not None and elem.text.strip() != '':
            text += '%s' % elem.text

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        # Close the groups opened above; 'block' is only a marker and never
        # produced an opening brace.
        for i in range(0, tag_count):
            end_tag = tag_stack.pop()
            if end_tag != 'block':
                text += u'}'

        single_tag_end = SINGLE_TAGS_END.get(tag, None)
        if single_tag_end:
            text += single_tag_end

        if hasattr(elem, 'tail') and elem.tail is not None and elem.tail.strip() != '':
            if 'block' in tag_stack:
                text += '%s ' % elem.tail
            else:
                # Tail text outside of any paragraph gets one of its own
                text += '{\\par \\pard \\hyphpar %s}' % elem.tail

        return text

View File

@ -9,7 +9,6 @@ import os
from calibre.customize.conversion import OutputFormatPlugin, \ from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation OptionRecommendation
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
from calibre.ebooks.metadata import authors_to_string
class TXTOutput(OutputFormatPlugin): class TXTOutput(OutputFormatPlugin):