Pull from driver-dev

This commit is contained in:
Kovid Goyal 2009-07-14 10:11:15 -06:00
commit d73f000a10
7 changed files with 188 additions and 163 deletions

View File

@ -3,6 +3,7 @@
''' '''
Write content to palmdoc pdb file. Write content to palmdoc pdb file.
''' '''
import os
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@ -13,8 +14,8 @@ import struct
from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.txt.writer import TxtNewlines from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.txt.writer import TxtWriter from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
MAX_RECORD_SIZE = 4096 MAX_RECORD_SIZE = 4096
@ -27,7 +28,7 @@ class Writer(FormatWriter):
def write_content(self, oeb_book, out_stream, metadata=None): def write_content(self, oeb_book, out_stream, metadata=None):
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
txt_records, txt_length = self._generate_text(oeb_book.spine) txt_records, txt_length = self._generate_text(oeb_book)
header_record = self._header_record(txt_length, len(txt_records)) header_record = self._header_record(txt_length, len(txt_records))
section_lengths = [len(header_record)] section_lengths = [len(header_record)]
@ -44,9 +45,12 @@ class Writer(FormatWriter):
for record in [header_record] + txt_records: for record in [header_record] + txt_records:
out_stream.write(record) out_stream.write(record)
def _generate_text(self, spine): def _generate_text(self, oeb_book):
txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) writer = TXTMLizer(self.log)
txt = txt_writer.dump(spine).encode(self.opts.output_encoding, 'replace') txt = writer.extract_content(oeb_book, self.opts)
self.log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace')
txt_length = len(txt) txt_length = len(txt)

View File

@ -11,8 +11,9 @@ __docformat__ = 'restructuredtext en'
import struct, zlib import struct, zlib
from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
MAX_RECORD_SIZE = 8192 MAX_RECORD_SIZE = 8192
@ -25,7 +26,7 @@ class Writer(FormatWriter):
def write_content(self, oeb_book, out_stream, metadata=None): def write_content(self, oeb_book, out_stream, metadata=None):
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
txt_records, txt_length = self._generate_text(oeb_book.spine) txt_records, txt_length = self._generate_text(oeb_book)
crc32 = 0 crc32 = 0
section_lengths = [] section_lengths = []
@ -33,7 +34,7 @@ class Writer(FormatWriter):
self.log.info('Compressing data...') self.log.info('Compressing data...')
for i in range(0, len(txt_records)): for i in range(0, len(txt_records)):
self.log.debug('\tCompressing record %i' % i) self.log.debug('\tCompressing record %i' % i)
txt_records[i] = compressor.compress(txt_records[i].encode('cp1252', 'replace')) txt_records[i] = compressor.compress(txt_records[i])
txt_records[i] = txt_records[i] + compressor.flush(zlib.Z_FULL_FLUSH) txt_records[i] = txt_records[i] + compressor.flush(zlib.Z_FULL_FLUSH)
section_lengths.append(len(txt_records[i])) section_lengths.append(len(txt_records[i]))
crc32 = zlib.crc32(txt_records[i], crc32) & 0xffffffff crc32 = zlib.crc32(txt_records[i], crc32) & 0xffffffff
@ -48,10 +49,13 @@ class Writer(FormatWriter):
for record in [header_record]+txt_records: for record in [header_record]+txt_records:
out_stream.write(record) out_stream.write(record)
def _generate_text(self, spine): def _generate_text(self, oeb_book):
txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) writer = TXTMLizer(self.log)
txt = txt_writer.dump(spine).encode(self.opts.output_encoding, 'replace') txt = writer.extract_content(oeb_book, self.opts)
self.log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace')
txt_length = len(txt) txt_length = len(txt)
txt_records = [] txt_records = []

View File

@ -38,7 +38,7 @@ PML_HTML_RULES = [
(re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), (re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')), (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))), (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')), (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
(re.compile(r'\\-'), lambda match: ''), (re.compile(r'\\-'), lambda match: ''),
(re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))), (re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
(re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))), (re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),

View File

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
class TxtNewlines(object):
    '''
    Resolve a newline type name to the line-ending string it stands for.

    The resolved string is stored on the instance as ``newline``. Names are
    matched case-insensitively; a name that is not a key of NEWLINE_TYPES
    resolves to the line separator of the running platform.
    '''

    NEWLINE_TYPES = {
        'system' : os.linesep,
        'unix' : '\n',
        'old_mac' : '\r',
        'windows' : '\r\n'
    }

    def __init__(self, newline_type):
        key = newline_type.lower()
        if key in self.NEWLINE_TYPES:
            self.newline = self.NEWLINE_TYPES[key]
        else:
            # Unknown name: fall back to the platform default.
            self.newline = os.linesep
def specified_newlines(newline, text):
    '''
    Return text with every line ending converted to newline.

    @newline: The line-ending string to use in the result.
    @text: The text to convert. It may contain any mix of '\\r\\n', '\\r'
    and '\\n' line endings.
    '''
    # Normalise to '\n' first. Replacing only one line-ending form would
    # leave other forms untouched and produce mixed endings in the output.
    # '\r\n' has to be handled before the lone '\r'.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    if newline == '\n':
        return text
    return text.replace('\n', newline)

View File

@ -8,7 +8,8 @@ import os
from calibre.customize.conversion import OutputFormatPlugin, \ from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation OptionRecommendation
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
class TXTOutput(OutputFormatPlugin): class TXTOutput(OutputFormatPlugin):
@ -32,8 +33,11 @@ class TXTOutput(OutputFormatPlugin):
]) ])
def convert(self, oeb_book, output_path, input_plugin, opts, log): def convert(self, oeb_book, output_path, input_plugin, opts, log):
writer = TxtWriter(TxtNewlines(opts.newline).newline, log) writer = TXTMLizer(log)
txt = writer.dump(oeb_book.spine) txt = writer.extract_content(oeb_book, opts)
log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
close = False close = False
if not hasattr(output_path, 'write'): if not hasattr(output_path, 'write'):

View File

@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into plain text
'''
import os, re
from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
# Tag names (without namespace) that TXTMLizer.dump_text treats as block
# elements: their content is separated from surrounding text by a blank line.
BLOCK_TAGS = [
'div',
'p',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'li',
]
# Values of the 'display' style that make TXTMLizer.dump_text treat an
# element as a block even when its tag is not listed in BLOCK_TAGS.
BLOCK_STYLES = [
'block',
]
class TXTMLizer(object):
    '''
    Transform the XHTML documents in the spine of an OEB book into plain
    text. Block elements are separated by blank lines built from os.linesep.
    '''

    def __init__(self, log):
        '''
        @log: Logger used for progress messages; info() and debug() are
        called on it.
        '''
        self.log = log

    def extract_content(self, oeb_book, opts):
        '''
        Return the text of every item in oeb_book.spine as a single string.

        @oeb_book: The book to convert.
        @opts: Conversion options; opts.output_profile is handed to the
        Stylizer.
        '''
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Convert each spine item in turn and return the cleaned-up,
        concatenated text.
        '''
        # Collect the pieces and join once instead of growing a string.
        parts = []
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to TXT...' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
            content = self.remove_newlines(content)
            # NOTE(review): the stylizer is built from item.data but the
            # elements passed to dump_text come from a re-parsed copy of the
            # body -- confirm Stylizer.style() resolves styles for them.
            parts.append(self.dump_text(etree.fromstring(content), stylizer))
        return self.cleanup_text(u''.join(parts))

    def remove_newlines(self, text):
        '''
        Replace every line ending in text with a single space so that the
        only line breaks in the result are the ones dump_text adds.
        '''
        self.log.debug('\tRemove newlines for processing...')
        # '\r\n' must go first, otherwise it would become two spaces.
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        return text

    def cleanup_text(self, text):
        '''
        Normalise whitespace in the converted text and return the result.
        '''
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        text = text.replace(u'\xc2', '')
        text = text.replace(u'\xa0', ' ')

        # Replace tabs, vertical tabs and form feeds with a single space.
        # This has to be a regular expression: str.replace() would look for
        # a literal '+' character after the tab.
        text = re.sub(u'[\t\v\f]+', ' ', text)

        # Single line paragraph: a lone line break between two characters
        # becomes a space. The neighbours are matched with [^\r\n] rather
        # than '.', because '.' matches '\r' and would make a blank line
        # look like a lone break when os.linesep is '\r\n'.
        text = re.sub('(?<=[^\r\n])%s(?=[^\r\n])' % re.escape(os.linesep), ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]+', ' ', text)

        # Remove excessive newlines.
        #text = re.sub('\n[ ]+\n', '\n\n', text)
        #text = re.sub('\n{3,}', '\n\n', text)

        # Replace spaces at the beginning and end of lines
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        return text

    def _has_text(self, value):
        '''
        Return True if value holds something other than whitespace.
        '''
        return value is not None and value.strip() != ''

    def dump_text(self, elem, stylizer, end=''):
        '''
        Return the text of elem and its descendants, including elem's tail.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @end: The trailing characters of the text from the previous element.
        This is used to determine if a blank line is needed when starting
        a new block element.
        '''
        # Comments and processing instructions do not have a string tag.
        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            return u''

        style = stylizer.style(elem)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            return u''

        blank_line = os.linesep + os.linesep
        text = u''
        tag = barename(elem.tag)

        # Are we in a paragraph block?
        in_block = tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES
        if in_block and not end.endswith(blank_line) and self._has_text(elem.text):
            text += blank_line

        # Process tags that contain text.
        if self._has_text(elem.text):
            text += elem.text

        for item in elem:
            # Pass as many trailing characters as a blank line is long; a
            # fixed two characters can never match when os.linesep is '\r\n'.
            text += self.dump_text(item, stylizer, text[-len(blank_line):])

        if in_block:
            text += blank_line

        if self._has_text(elem.tail):
            text += elem.tail

        return text

View File

@ -1,146 +0,0 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Write content to TXT.
'''
import os
import re
from lxml import etree
from calibre import entity_to_unicode
from calibre.ebooks.oeb.base import XHTML
class TxtWriter(object):
    '''
    Turn the XHTML documents of a spine into plain text by stripping markup
    with regular expressions.
    '''

    def __init__(self, newline, log):
        '''
        @newline: The line-ending string used in the generated text.
        @log: Logger used for progress messages; debug() is called on it.
        '''
        self.newline = newline
        self.log = log

    def dump(self, spine):
        '''
        Return the text of every item in spine as one string that ends with
        at least three newlines (two blank lines).
        '''
        parts = []
        for item in spine:
            self.log.debug('Processing %s...' % item.href)
            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
            content = self.remove_newlines(content)
            content = self.strip_html(content)
            content = self.replace_html_symbols(content)
            content = self.cleanup_text(content)
            content = self.specified_newlines(content)
            parts.append(content)
        out = u''.join(parts)

        # Put two blank lines at end of file
        end = out[-3 * len(self.newline):]
        out += self.newline * max(0, 3 - end.count(self.newline))

        return out

    def strip_html(self, text):
        '''
        Remove markup from text. Heading tags are replaced by markers and
        block-level tags by newlines; cleanup_text() finishes the job.
        '''
        self.log.debug('\tStripping html...')

        # Remove unnecessary tags. The content match is non-greedy so that
        # text between two separate script/style elements is kept.
        for tag in ['script', 'style']:
            text = re.sub(r'(?imu)<[ ]*%s[ ]*.*?>(.*?)</[ ]*%s[ ]*>' % (tag, tag), '', text)
        # Non-greedy for the same reason: keep text between two comments.
        text = re.sub(r'<!--.*?-->', '', text)
        text = re.sub(r'<\?.*?\?>', '', text)
        text = re.sub(r'<@.*?@>', '', text)
        text = re.sub(r'<%.*?%>', '', text)

        # Headings usually indicate Chapters.
        # We are going to use a marker to insert the proper number of
        # newline characters at the end of cleanup_text because cleanup_text
        # removes excessive (more than 2) newlines.
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            text = re.sub(r'(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
            text = re.sub(r'(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)

        # Separate content with space. The tag name must be interpolated
        # into the pattern, otherwise it looks for a literal '%s'.
        for tag in ['td']:
            text = re.sub(r'(?imu)</[ ]*%s[ ]*>' % tag, ' ', text)

        # Separate content with empty line.
        for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
            text = re.sub(r'(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)

        for tag in ['hr', 'br']:
            text = re.sub(r'(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)

        # Remove any tags that do not need special processing.
        text = re.sub(r'<.*?>', '', text)

        return u'' + text

    def replace_html_symbols(self, content):
        '''
        Replace each '&...;' sequence in content with the string returned by
        entity_to_unicode for it.
        '''
        self.log.debug('\tReplacing entities with unicode...')
        for entity in set(re.findall('&.+?;', content)):
            # entity_to_unicode is given a match whose first group is the
            # entity name. The name is escaped because the text between '&'
            # and ';' is arbitrary and may contain regex metacharacters.
            mo = re.search('(%s)' % re.escape(entity[1:-1]), content)
            content = content.replace(entity, entity_to_unicode(mo))

        return content

    def cleanup_text(self, text):
        '''
        Normalise whitespace, limit runs of newlines to two and expand the
        heading markers inserted by strip_html().
        '''
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        text = text.replace(u'\xc2', '')
        text = text.replace(u'\xa0', ' ')

        # Replace tabs, vertical tabs and form feeds with a single space.
        # This has to be a regular expression: str.replace() would look for
        # a literal '+' character after the tab.
        text = re.sub(u'[\t\v\f]+', ' ', text)

        # Single line paragraph.
        text = re.sub('(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]+', ' ', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        text = re.sub('\n{3,}', '\n\n', text)

        # Replace markers with the proper characters.
        text = text.replace('-vzxedxy-', '\n\n\n\n\n')
        text = text.replace('-vlgzxey-', '\n\n\n')

        # Replace spaces at the beginning and end of lines
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        return text

    def remove_newlines(self, text):
        '''
        Replace every line ending in text with a single space.
        '''
        self.log.debug('\tRemove newlines for processing...')
        # '\r\n' must go first, otherwise it would become two spaces.
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')

        return text

    def specified_newlines(self, text):
        '''
        Replace the '\\n' line endings in text with self.newline.
        '''
        self.log.debug('\tReplacing newlines with selected type...')
        if self.newline == '\n':
            return text

        return text.replace('\n', self.newline)
class TxtNewlines(object):
    '''
    Map a newline type name onto the line-ending string it denotes.

    The result is available as the ``newline`` attribute. Lookup ignores
    case, and any name missing from NEWLINE_TYPES yields the line separator
    of the running platform.
    '''

    NEWLINE_TYPES = {
        'system' : os.linesep,
        'unix' : '\n',
        'old_mac' : '\r',
        'windows' : '\r\n'
    }

    def __init__(self, newline_type):
        try:
            self.newline = self.NEWLINE_TYPES[newline_type.lower()]
        except KeyError:
            # Unrecognised name: use the platform default.
            self.newline = os.linesep