mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Pull from driver-dev
This commit is contained in:
commit
eb2d348103
@ -337,6 +337,7 @@ from calibre.ebooks.pdb.output import PDBOutput
|
||||
from calibre.ebooks.pdf.output import PDFOutput
|
||||
from calibre.ebooks.pml.output import PMLOutput
|
||||
from calibre.ebooks.rb.output import RBOutput
|
||||
from calibre.ebooks.rtf.output import RTFOutput
|
||||
from calibre.ebooks.txt.output import TXTOutput
|
||||
|
||||
from calibre.customize.profiles import input_profiles, output_profiles
|
||||
@ -382,6 +383,7 @@ plugins += [
|
||||
PDFOutput,
|
||||
PMLOutput,
|
||||
RBOutput,
|
||||
RTFOutput,
|
||||
TXTOutput,
|
||||
]
|
||||
plugins += [
|
||||
|
@ -176,7 +176,7 @@ class HTMLPreProcessor(object):
|
||||
elif self.is_pdftohtml(html):
|
||||
line_length_rules = [
|
||||
# Un wrap using punctuation
|
||||
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
|
||||
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .3), re.UNICODE), wrap_lines),
|
||||
]
|
||||
|
||||
rules = self.PDFTOHTML + line_length_rules
|
||||
|
@ -44,6 +44,12 @@ class FB2MLizer(object):
|
||||
|
||||
def fb2mlize_spine(self):
|
||||
output = self.fb2_header()
|
||||
if 'titlepage' in self.oeb_book.guide:
|
||||
href = self.oeb_book.guide['titlepage'].href
|
||||
item = self.oeb_book.manifest.hrefs[href]
|
||||
if item.spine_position is None:
|
||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
|
||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
||||
for item in self.oeb_book.spine:
|
||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
|
||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
||||
@ -98,25 +104,27 @@ class FB2MLizer(object):
|
||||
return u''
|
||||
|
||||
tag = barename(elem.tag)
|
||||
tag_count = 0
|
||||
|
||||
if tag == 'img':
|
||||
fb2_text += '<image xlink:herf="#%s" />' % os.path.basename(elem.attrib['src'])
|
||||
|
||||
tag_count = 0
|
||||
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
|
||||
fb2_tag = TAG_MAP.get(tag, 'p')
|
||||
if fb2_tag and fb2_tag not in tag_stack:
|
||||
|
||||
fb2_tag = TAG_MAP.get(tag, 'p')
|
||||
if fb2_tag and fb2_tag not in tag_stack:
|
||||
tag_count += 1
|
||||
fb2_text += '<%s>' % fb2_tag
|
||||
tag_stack.append(fb2_tag)
|
||||
|
||||
# Processes style information
|
||||
for s in STYLES:
|
||||
style_tag = s[1].get(style[s[0]], None)
|
||||
if style_tag:
|
||||
tag_count += 1
|
||||
fb2_text += '<%s>' % fb2_tag
|
||||
tag_stack.append(fb2_tag)
|
||||
|
||||
# Processes style information
|
||||
for s in STYLES:
|
||||
style_tag = s[1].get(style[s[0]], None)
|
||||
if style_tag:
|
||||
tag_count += 1
|
||||
fb2_text += '<%s>' % style_tag
|
||||
tag_stack.append(style_tag)
|
||||
fb2_text += '<%s>' % style_tag
|
||||
tag_stack.append(style_tag)
|
||||
|
||||
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
|
||||
fb2_text += elem.text
|
||||
|
||||
for item in elem:
|
||||
|
@ -6,7 +6,8 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
import Image, cStringIO
|
||||
import Image
|
||||
import cStringIO
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
|
@ -78,6 +78,12 @@ class PMLMLizer(object):
|
||||
|
||||
def pmlmlize_spine(self):
|
||||
output = u''
|
||||
if 'titlepage' in self.oeb_book.guide:
|
||||
href = self.oeb_book.guide['titlepage'].href
|
||||
item = self.oeb_book.manifest.hrefs[href]
|
||||
if item.spine_position is None:
|
||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
|
||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
||||
for item in self.oeb_book.spine:
|
||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
|
||||
output += self.add_page_anchor(item.href)
|
||||
@ -153,39 +159,39 @@ class PMLMLizer(object):
|
||||
#if style['page-break-before'] == 'always':
|
||||
# text += '\\p'
|
||||
|
||||
pml_tag = TAG_MAP.get(tag, None)
|
||||
if pml_tag and pml_tag not in tag_stack:
|
||||
tag_count += 1
|
||||
text += '\\%s' % pml_tag
|
||||
tag_stack.append(pml_tag)
|
||||
|
||||
# Special processing of tags that require an argument.
|
||||
# Anchors links
|
||||
if tag in LINK_TAGS and 'q' not in tag_stack:
|
||||
href = elem.get('href')
|
||||
if href and '://' not in href:
|
||||
if '#' in href:
|
||||
href = href.partition('#')[2]
|
||||
href = os.path.splitext(os.path.basename(href))[0]
|
||||
tag_count += 1
|
||||
text += '\\q="#%s"' % href
|
||||
tag_stack.append('q')
|
||||
# Anchor ids
|
||||
id_name = elem.get('id')
|
||||
if id_name:
|
||||
text += '\\Q="%s"' % os.path.splitext(id_name)[0]
|
||||
|
||||
# Processes style information
|
||||
for s in STYLES:
|
||||
style_tag = s[1].get(style[s[0]], None)
|
||||
if style_tag and style_tag not in tag_stack:
|
||||
tag_count += 1
|
||||
text += '\\%s' % style_tag
|
||||
tag_stack.append(style_tag)
|
||||
# margin
|
||||
|
||||
# Process tags that contain text.
|
||||
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
|
||||
pml_tag = TAG_MAP.get(tag, None)
|
||||
if pml_tag and pml_tag not in tag_stack:
|
||||
tag_count += 1
|
||||
text += '\\%s' % pml_tag
|
||||
tag_stack.append(pml_tag)
|
||||
|
||||
# Special processing of tags that require an argument.
|
||||
# Anchors links
|
||||
if tag in LINK_TAGS and 'q' not in tag_stack:
|
||||
href = elem.get('href')
|
||||
if href and '://' not in href:
|
||||
if '#' in href:
|
||||
href = href.partition('#')[2]
|
||||
href = os.path.splitext(os.path.basename(href))[0]
|
||||
tag_count += 1
|
||||
text += '\\q="#%s"' % href
|
||||
tag_stack.append('q')
|
||||
# Anchor ids
|
||||
id_name = elem.get('id')
|
||||
if id_name:
|
||||
text += '\\Q="%s"' % os.path.splitext(id_name)[0]
|
||||
|
||||
# Processes style information
|
||||
for s in STYLES:
|
||||
style_tag = s[1].get(style[s[0]], None)
|
||||
if style_tag and style_tag not in tag_stack:
|
||||
tag_count += 1
|
||||
text += '\\%s' % style_tag
|
||||
tag_stack.append(style_tag)
|
||||
# margin
|
||||
|
||||
text += self.elem_text(elem, tag_stack)
|
||||
|
||||
for item in elem:
|
||||
|
@ -65,6 +65,12 @@ class RBMLizer(object):
|
||||
|
||||
def mlize_spine(self):
|
||||
output = u'<HTML><HEAD><TITLE></TITLE></HEAD><BODY>'
|
||||
if 'titlepage' in self.oeb_book.guide:
|
||||
href = self.oeb_book.guide['titlepage'].href
|
||||
item = self.oeb_book.manifest.hrefs[href]
|
||||
if item.spine_position is None:
|
||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
|
||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
||||
for item in self.oeb_book.spine:
|
||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
|
||||
output += self.add_page_anchor(item.href)
|
||||
|
@ -70,7 +70,7 @@ class RTFInput(InputFormatPlugin):
|
||||
self.log = log
|
||||
self.log('Converting RTF to XML...')
|
||||
try:
|
||||
xml = self.generate_xml(stream)
|
||||
xml = self.generate_xml(stream.name)
|
||||
except RtfInvalidCodeException:
|
||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||
'support. Convert it to HTML first and then try it.'))
|
||||
|
36
src/calibre/ebooks/rtf/output.py
Normal file
36
src/calibre/ebooks/rtf/output.py
Normal file
@ -0,0 +1,36 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.ebooks.rtf.rtfml import RTFMLizer
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
|
||||
class RTFOutput(OutputFormatPlugin):
    '''
    Output format plugin that writes an OEB book as an RTF document.

    The RTF markup itself is produced by :class:`RTFMLizer`; this class only
    takes care of getting that markup onto disk (or into a caller supplied
    stream).
    '''

    name = 'RTF Output'
    author = 'John Schember'
    file_type = 'rtf'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Convert ``oeb_book`` to RTF and write it to ``output_path``.

        :param oeb_book: The book to convert; handed to
                         ``RTFMLizer.extract_content``.
        :param output_path: Either a filesystem path or an object with a
                            ``write`` method (it must then also support
                            ``seek`` and ``truncate``).
        :param input_plugin: Unused here.
        :param opts: Conversion options; ``opts.linearize_tables`` is passed
                     to the RTFMLizer as ``ignore_tables``.
        :param log: Unused here.
        '''
        rtfmlitzer = RTFMLizer(ignore_tables=opts.linearize_tables)
        content = rtfmlitzer.extract_content(oeb_book, opts)

        # Only streams we open ourselves are ours to close.
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            # An empty dirname means "current directory": nothing to create.
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            # Discard anything already present in the target stream.
            out_stream.seek(0)
            out_stream.truncate()
            # Characters that cannot be encoded as ASCII are replaced by '?'.
            out_stream.write(content.encode('ascii', 'replace'))
        finally:
            # Release the file handle even if writing failed part way.
            if close:
                out_stream.close()
|
234
src/calibre/ebooks/rtf/rtfml.py
Normal file
234
src/calibre/ebooks/rtf/rtfml.py
Normal file
@ -0,0 +1,234 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Transform OEB content into RTF markup
|
||||
'''
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
import Image
|
||||
import cStringIO
|
||||
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, \
|
||||
OEB_IMAGES
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.metadata import authors_to_string
|
||||
|
||||
# XHTML tag name -> RTF control words emitted when a group is opened for
# that tag. All heading levels share one style, as do paragraphs and list
# items.
TAGS = dict.fromkeys(
    ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'),
    '\\b \\par \\pard \\hyphpar',
)
TAGS.update(dict.fromkeys(('li', 'p'), '\\par \\pard \\hyphpar \t'))
TAGS.update({
    'b': '\\b',
    'del': '\\deleted',
    'sub': '\\sub',
    'sup': '\\super',
    'u': '\\ul',
})

# Tags that emit a stand-alone snippet (a line break) when encountered.
SINGLE_TAGS = dict.fromkeys(('br', 'div'), '\n{\\line }\n')

# Tags that emit a stand-alone snippet once their content has been written.
SINGLE_TAGS_END = {'div': SINGLE_TAGS['div']}

# (CSS property, {property value: RTF control words}) pairs, checked in
# this order for every element.
STYLES = [
    ('display', {'block': '\\par \\pard \\hyphpar'}),
    ('font-weight', {'bold': '\\b', 'bolder': '\\b'}),
    ('font-style', {'italic': '\\i'}),
    ('text-align', {'center': '\\qc', 'left': '\\ql', 'right': '\\qr'}),
    ('text-decoration', {'line-through': '\\strike', 'underline': '\\ul'}),
]

# Tags that start a paragraph level block.
BLOCK_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']

# Values of the CSS ``display`` property that start a paragraph level block.
BLOCK_STYLES = ['block']
|
||||
|
||||
'''
|
||||
TODO:
|
||||
* Tables
|
||||
* Fonts
|
||||
'''
|
||||
class RTFMLizer(object):
    '''
    Transform the XHTML content of an OEB book into RTF markup.

    Usage: create an instance and call :meth:`extract_content`, which returns
    the complete RTF document as a single string.
    '''

    # Number of hex digits written per line of embedded image data.
    HEX_LINE_WIDTH = 128

    def __init__(self, ignore_tables=False):
        # Stored for callers; not consulted by the conversion code in this
        # class.
        self.ignore_tables = ignore_tables

    def extract_content(self, oeb_book, opts):
        '''
        Convert ``oeb_book`` to RTF markup and return it.

        :param oeb_book: Book to convert. Its logger, guide, manifest, spine
                         and metadata are used.
        :param opts: Conversion options; ``opts.output_profile`` is passed to
                     the Stylizer.
        :return: The RTF document as a string.
        '''
        oeb_book.logger.info('Converting XHTML to RTF markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Build the whole document: header, optional title page, every spine
        item, footer; then embed the images and tidy the whitespace.
        '''
        output = self.header()
        if 'titlepage' in self.oeb_book.guide:
            href = self.oeb_book.guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            # A title page that is part of the spine is emitted by the loop
            # below; only add it here when it is not.
            if item.spine_position is None:
                stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
                output += self.dump_text(item.data.find(XHTML('body')), stylizer)
                output += '{\\page } '
        for item in self.oeb_book.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer)
        output += self.footer()
        output = self.insert_images(output)
        output = self.clean_text(output)

        return output

    def _escape(self, text):
        '''
        Escape the characters that have a special meaning in RTF (backslash
        and both braces) so that literal book text cannot open or close
        groups or be read as a control word.
        '''
        # The backslash must be handled first, otherwise the backslashes
        # added for the braces would be escaped a second time.
        return text.replace('\\', '\\\\').replace('{', '\\{').replace('}', '\\}')

    def header(self):
        '''
        Return the opening of the RTF document, including an info group
        holding the (escaped) title and authors of the book.
        '''
        title = self._escape(self.oeb_book.metadata.title[0].value)
        authors = self._escape(authors_to_string(
            [x.value for x in self.oeb_book.metadata.creator]))
        return u'{\\rtf1{\\info{\\title %s}{\\author %s}}\\ansi\\ansicpg1252\\deff0\\deflang1033' % (title, authors)

    def footer(self):
        '''
        Return the text that closes the group opened by :meth:`header`.
        '''
        return ' }'

    def insert_images(self, text):
        '''
        Replace every ``SPECIAL_IMAGE-<name>-REPLACE_ME`` placeholder written
        by :meth:`dump_text` with the hex encoded JPEG data of the manifest
        image that has the same base name.
        '''
        for item in self.oeb_book.manifest:
            if item.media_type in OEB_IMAGES:
                src = os.path.basename(item.href)
                data, width, height = self.image_to_hexstring(item.data)
                text = text.replace('SPECIAL_IMAGE-%s-REPLACE_ME' % src, '\n\n{\\*\\shppict{\\pict\\picw%i\\pich%i\\jpegblip \n%s\n}}\n\n' % (width, height, data))
        return text

    def _wrap_hex(self, raw_hex, width=None):
        '''
        Split ``raw_hex`` into lines of at most ``width`` characters
        (default :attr:`HEX_LINE_WIDTH`) joined by newlines. No trailing
        newline is added.
        '''
        if width is None:
            width = self.HEX_LINE_WIDTH
        return '\n'.join(
            [raw_hex[pos:pos + width] for pos in range(0, len(raw_hex), width)])

    def image_to_hexstring(self, data):
        '''
        Re-encode raw image ``data`` as JPEG.

        :return: ``(hex_string, width, height)`` where ``hex_string`` is the
                 JPEG data as lower case hex digits, broken into lines.
        '''
        from binascii import hexlify

        im = Image.open(cStringIO.StringIO(data))
        # The JPEG writer only handles a few pixel modes; anything else
        # (palette images for example) is converted so saving cannot fail.
        if im.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
            im = im.convert('RGB')
        jpeg = cStringIO.StringIO()
        im.save(jpeg, 'JPEG')

        # Image data must be broken up into short lines.
        hex_string = self._wrap_hex(hexlify(jpeg.getvalue()))

        return (hex_string, im.size[0], im.size[1])

    def clean_text(self, text):
        '''
        Normalise the whitespace of the generated markup.
        '''
        # Remove excess spaces at beginning and end of lines
        text = re.sub('(?m)^[ ]+', '', text)
        text = re.sub('(?m)[ ]+$', '', text)

        # Remove excessive newlines. The markup is always generated with
        # '\n', so match that rather than the platform line separator.
        text = re.sub('\n{3,}', '\n\n', text)

        # Remove excessive spaces
        text = re.sub('[ ]{2,}', ' ', text)

        # Never keep more than two consecutive forced line breaks
        text = re.sub(r'(\{\\line \}\s*){3,}', r'{\\line }{\\line }', text)

        # Remove non-breaking spaces
        text = text.replace(u'\xa0', ' ')
        # NOTE(review): this matches LF followed by CR; '\r\n' may have been
        # intended -- confirm before changing.
        text = text.replace('\n\r', '\n')

        return text

    def dump_text(self, elem, stylizer, tag_stack=None):
        '''
        Recursively convert ``elem`` and its children to RTF markup.

        :param elem: Element to convert. Anything that is not an XHTML
                     element, or that is hidden by its style, yields ``u''``.
        :param stylizer: Object whose ``style(elem)`` gives the computed
                         style of an element.
        :param tag_stack: Control words (plus the marker ``'block'``) opened
                          by the ancestors of ``elem``. Callers normally omit
                          it; it is passed along during recursion.
        :return: The RTF markup for ``elem``, including its tail text.
        '''
        # A fresh list per top level call: a shared default list would leak
        # state into later conversions if an exception interrupted one.
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
            return u''

        text = u''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return u''

        tag = barename(elem.tag)
        # Number of entries this element pushed onto tag_stack; exactly that
        # many are popped again once its children have been processed.
        tag_count = 0

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if 'block' not in tag_stack:
                tag_count += 1
                tag_stack.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = elem.get('src')
            # An image without a source cannot be resolved later on.
            if src:
                src = os.path.basename(src)
                block_start = ''
                block_end = ''
                # Outside of a paragraph the image gets one of its own.
                if 'block' not in tag_stack:
                    block_start = '{\\par \\pard \\hyphpar '
                    block_end = '}'
                # Placeholder, swapped for the image data by insert_images.
                text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end)

        single_tag = SINGLE_TAGS.get(tag, None)
        if single_tag:
            text += single_tag

        rtf_tag = TAGS.get(tag, None)
        if rtf_tag and rtf_tag not in tag_stack:
            tag_count += 1
            text += '{%s\n' % rtf_tag
            tag_stack.append(rtf_tag)

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack:
                tag_count += 1
                text += '{%s\n' % style_tag
                tag_stack.append(style_tag)

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text is not None and elem.text.strip() != '':
            text += '%s' % self._escape(elem.text)

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        # Close every group opened above; 'block' is only a marker and has
        # no opening brace of its own.
        for _ in range(tag_count):
            end_tag = tag_stack.pop()
            if end_tag != 'block':
                text += u'}'

        single_tag_end = SINGLE_TAGS_END.get(tag, None)
        if single_tag_end:
            text += single_tag_end

        if hasattr(elem, 'tail') and elem.tail is not None and elem.tail.strip() != '':
            tail = self._escape(elem.tail)
            if 'block' in tag_stack:
                text += '%s ' % tail
            else:
                # Tail text outside of any paragraph gets its own paragraph.
                text += '{\\par \\pard \\hyphpar %s}' % tail

        return text
|
@ -9,7 +9,6 @@ import os
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
|
||||
from calibre.ebooks.metadata import authors_to_string
|
||||
|
||||
class TXTOutput(OutputFormatPlugin):
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user