mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Pull from driver-dev
This commit is contained in:
commit
d73f000a10
@ -3,6 +3,7 @@
|
||||
'''
|
||||
Write content to palmdoc pdb file.
|
||||
'''
|
||||
import os
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
@ -13,8 +14,8 @@ import struct
|
||||
from calibre.ebooks.compression.palmdoc import compress_doc
|
||||
from calibre.ebooks.pdb.formatwriter import FormatWriter
|
||||
from calibre.ebooks.pdb.header import PdbHeaderBuilder
|
||||
from calibre.ebooks.txt.writer import TxtNewlines
|
||||
from calibre.ebooks.txt.writer import TxtWriter
|
||||
from calibre.ebooks.txt.txtml import TXTMLizer
|
||||
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
|
||||
|
||||
MAX_RECORD_SIZE = 4096
|
||||
|
||||
@ -27,7 +28,7 @@ class Writer(FormatWriter):
|
||||
def write_content(self, oeb_book, out_stream, metadata=None):
|
||||
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
|
||||
|
||||
txt_records, txt_length = self._generate_text(oeb_book.spine)
|
||||
txt_records, txt_length = self._generate_text(oeb_book)
|
||||
header_record = self._header_record(txt_length, len(txt_records))
|
||||
|
||||
section_lengths = [len(header_record)]
|
||||
@ -44,9 +45,12 @@ class Writer(FormatWriter):
|
||||
for record in [header_record] + txt_records:
|
||||
out_stream.write(record)
|
||||
|
||||
def _generate_text(self, spine):
|
||||
txt_writer = TxtWriter(TxtNewlines('system').newline, self.log)
|
||||
txt = txt_writer.dump(spine).encode(self.opts.output_encoding, 'replace')
|
||||
def _generate_text(self, oeb_book):
|
||||
writer = TXTMLizer(self.log)
|
||||
txt = writer.extract_content(oeb_book, self.opts)
|
||||
|
||||
self.log.debug('\tReplacing newlines with selected type...')
|
||||
txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace')
|
||||
|
||||
txt_length = len(txt)
|
||||
|
||||
|
@ -11,8 +11,9 @@ __docformat__ = 'restructuredtext en'
|
||||
import struct, zlib
|
||||
|
||||
from calibre.ebooks.pdb.formatwriter import FormatWriter
|
||||
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
|
||||
from calibre.ebooks.pdb.header import PdbHeaderBuilder
|
||||
from calibre.ebooks.txt.txtml import TXTMLizer
|
||||
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
|
||||
|
||||
MAX_RECORD_SIZE = 8192
|
||||
|
||||
@ -25,7 +26,7 @@ class Writer(FormatWriter):
|
||||
def write_content(self, oeb_book, out_stream, metadata=None):
|
||||
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
|
||||
|
||||
txt_records, txt_length = self._generate_text(oeb_book.spine)
|
||||
txt_records, txt_length = self._generate_text(oeb_book)
|
||||
|
||||
crc32 = 0
|
||||
section_lengths = []
|
||||
@ -33,7 +34,7 @@ class Writer(FormatWriter):
|
||||
self.log.info('Compressing data...')
|
||||
for i in range(0, len(txt_records)):
|
||||
self.log.debug('\tCompressing record %i' % i)
|
||||
txt_records[i] = compressor.compress(txt_records[i].encode('cp1252', 'replace'))
|
||||
txt_records[i] = compressor.compress(txt_records[i])
|
||||
txt_records[i] = txt_records[i] + compressor.flush(zlib.Z_FULL_FLUSH)
|
||||
section_lengths.append(len(txt_records[i]))
|
||||
crc32 = zlib.crc32(txt_records[i], crc32) & 0xffffffff
|
||||
@ -48,10 +49,13 @@ class Writer(FormatWriter):
|
||||
for record in [header_record]+txt_records:
|
||||
out_stream.write(record)
|
||||
|
||||
def _generate_text(self, spine):
|
||||
txt_writer = TxtWriter(TxtNewlines('system').newline, self.log)
|
||||
txt = txt_writer.dump(spine).encode(self.opts.output_encoding, 'replace')
|
||||
|
||||
def _generate_text(self, oeb_book):
|
||||
writer = TXTMLizer(self.log)
|
||||
txt = writer.extract_content(oeb_book, self.opts)
|
||||
|
||||
self.log.debug('\tReplacing newlines with selected type...')
|
||||
txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace')
|
||||
|
||||
txt_length = len(txt)
|
||||
|
||||
txt_records = []
|
||||
|
@ -38,7 +38,7 @@ PML_HTML_RULES = [
|
||||
(re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
|
||||
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
|
||||
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
|
||||
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
|
||||
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
|
||||
(re.compile(r'\\-'), lambda match: ''),
|
||||
(re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
|
||||
(re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),
|
||||
|
25
src/calibre/ebooks/txt/newlines.py
Normal file
25
src/calibre/ebooks/txt/newlines.py
Normal file
@ -0,0 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
class TxtNewlines(object):
    '''
    Resolve a newline type name to the line terminator it stands for.

    The resolved terminator is stored in the ``newline`` attribute. The
    name is matched case-insensitively against NEWLINE_TYPES; a name that
    is not listed there resolves to os.linesep.
    '''

    NEWLINE_TYPES = dict(
        system=os.linesep,
        unix='\n',
        old_mac='\r',
        windows='\r\n',
    )

    def __init__(self, newline_type):
        wanted = newline_type.lower()
        if wanted in self.NEWLINE_TYPES:
            self.newline = self.NEWLINE_TYPES[wanted]
        else:
            self.newline = os.linesep
|
||||
|
||||
def specified_newlines(newline, text):
    '''
    Return text with every os.linesep replaced by newline.

    When newline already equals os.linesep the text is returned unchanged.
    '''
    needs_conversion = newline != os.linesep
    return text.replace(os.linesep, newline) if needs_conversion else text
|
@ -8,7 +8,8 @@ import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
|
||||
from calibre.ebooks.txt.txtml import TXTMLizer
|
||||
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
|
||||
|
||||
class TXTOutput(OutputFormatPlugin):
|
||||
|
||||
@ -32,8 +33,11 @@ class TXTOutput(OutputFormatPlugin):
|
||||
])
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
|
||||
txt = writer.dump(oeb_book.spine)
|
||||
writer = TXTMLizer(log)
|
||||
txt = writer.extract_content(oeb_book, opts)
|
||||
|
||||
log.debug('\tReplacing newlines with selected type...')
|
||||
txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
|
||||
|
||||
close = False
|
||||
if not hasattr(output_path, 'write'):
|
||||
|
134
src/calibre/ebooks/txt/txtml.py
Normal file
134
src/calibre/ebooks/txt/txtml.py
Normal file
@ -0,0 +1,134 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Transform OEB content into plain text
|
||||
'''
|
||||
|
||||
import os, re
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
|
||||
# Tag names that TXTMLizer.dump_text treats as paragraph blocks.
BLOCK_TAGS = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']

# CSS 'display' values that TXTMLizer.dump_text treats as paragraph blocks.
BLOCK_STYLES = ['block']
|
||||
|
||||
class TXTMLizer(object):
    '''
    Transform the XHTML content of an OEB book into plain text.

    Paragraph blocks are separated by two os.linesep sequences; converting
    them to another newline style is left to the caller.
    '''

    def __init__(self, log):
        '''
        @log: Logger used for info and debug messages.
        '''
        self.log = log

    def extract_content(self, oeb_book, opts):
        '''
        Return the text of every spine item of oeb_book as a single string.

        @oeb_book: The book to convert; its spine is iterated.
        @opts: Conversion options; output_profile is handed to the Stylizer.
        '''
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Convert each spine item in turn and return the cleaned-up result.
        '''
        parts = []
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to TXT...' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
            # Line breaks in the markup carry no meaning; flatten them so the
            # only newlines in the output are the ones dump_text inserts.
            content = self.remove_newlines(content)
            parts.append(self.dump_text(etree.fromstring(content), stylizer))
        output = self.cleanup_text(u''.join(parts))

        return output

    def remove_newlines(self, text):
        '''
        Return text with every '\\r\\n', '\\n' and '\\r' replaced by a space.
        '''
        self.log.debug('\tRemove newlines for processing...')
        # '\r\n' must go first so it becomes one space rather than two.
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')

        return text

    def cleanup_text(self, text):
        '''
        Normalise the whitespace of the converted text and return it.
        '''
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        text = text.replace(u'\xc2', '')
        text = text.replace(u'\xa0', ' ')

        # Replace tabs, vertical tabs and form feeds with single space.
        # This has to be a regular expression: str.replace('\t+', ' ') only
        # matches a literal tab followed by a plus sign.
        text = re.sub('[\t\v\f]+', ' ', text)

        # Single line paragraph.
        text = re.sub('(?<=.)%s(?=.)' % re.escape(os.linesep), ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]+', ' ', text)

        # Remove excessive newlines (currently disabled).
        #text = re.sub('\n[ ]+\n', '\n\n', text)
        #text = re.sub('\n{3,}', '\n\n', text)

        # Replace spaces at the beginning and end of lines
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        return text

    def dump_text(self, elem, stylizer, end=''):
        '''
        Return the text of elem and its descendants, followed by elem's tail.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @end: The tail of the text produced so far by the caller. This is
        used to determine if a blank line is needed when starting a new
        block element.

        Elements whose tag is not a string or is outside the XHTML namespace,
        and elements that are hidden or not displayed, yield an empty string.
        '''

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return u''

        text = u''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return u''

        tag = barename(elem.tag)
        in_block = False
        paragraph_break = os.linesep + os.linesep
        has_text = hasattr(elem, 'text') and elem.text is not None and elem.text.strip() != ''

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            in_block = True
            if not end.endswith(paragraph_break) and has_text:
                text += paragraph_break

        # Process tags that contain text.
        if has_text:
            text += elem.text

        for item in elem:
            # Pass enough trailing characters to hold a full paragraph break;
            # a fixed two characters is too short when os.linesep is '\r\n'.
            text += self.dump_text(item, stylizer, text[-len(paragraph_break):])

        if in_block:
            text += paragraph_break

        if hasattr(elem, 'tail') and elem.tail is not None and elem.tail.strip() != '':
            text += elem.tail

        return text
|
@ -1,146 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Write content to TXT.
|
||||
'''
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre import entity_to_unicode
|
||||
from calibre.ebooks.oeb.base import XHTML
|
||||
|
||||
class TxtWriter(object):
    '''
    Write the XHTML content of an OEB spine as plain text.

    Markup is removed with regular expressions and the result is emitted
    with the newline sequence given to the constructor.
    '''

    def __init__(self, newline, log):
        '''
        @newline: The line terminator to use in the output.
        @log: Logger used for debug messages.
        '''
        self.newline = newline
        self.log = log

    def dump(self, spine):
        '''
        Return the text of every item in spine as a single string.
        '''
        out = u''
        for item in spine:
            self.log.debug('Processing %s...' % item.href)
            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
            content = self.remove_newlines(content)
            content = self.strip_html(content)
            content = self.replace_html_symbols(content)
            content = self.cleanup_text(content)
            content = self.specified_newlines(content)
            out += content

        # Put two blank lines at end of file
        end = out[-3 * len(self.newline):]
        out += self.newline * (3 - end.count(self.newline))

        return out

    def strip_html(self, text):
        '''
        Return text with all markup removed.

        Headings are replaced by marker strings that cleanup_text later turns
        into newlines; closing block tags and hr/br become blank lines.
        '''
        self.log.debug('\tStripping html...')

        # Remove unnecessary tags
        # The input has already been flattened to a single line, so every
        # pattern here must be non-greedy or it would swallow everything
        # between the first and the last occurrence.
        for tag in ['script', 'style']:
            text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*?)</[ ]*%s[ ]*>' % (tag, tag), '', text)
        text = re.sub(r'<!--.*?-->', '', text)
        text = re.sub(r'<\?.*?\?>', '', text)
        text = re.sub(r'<@.*?@>', '', text)
        text = re.sub(r'<%.*?%>', '', text)

        # Headings usually indicate Chapters.
        # We are going to use a marker to insert the proper number of
        # newline characters at the end of cleanup_text because cleanup_text
        # remove excessive (more than 2 newlines).
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)

        # Separate content with space.
        for tag in ['td']:
            # The tag name must be interpolated; without it the pattern
            # contains a literal '%s' and table cells are never separated.
            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, ' ', text)

        # Separate content with empty line.
        for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)

        for tag in ['hr', 'br']:
            text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)

        # Remove any tags that do not need special processing.
        text = re.sub('<.*?>', '', text)

        return u'' + text

    def replace_html_symbols(self, content):
        '''
        Return content with each '&...;' entity replaced by the value
        entity_to_unicode gives for it.
        '''
        self.log.debug('\tReplacing entities with unicode...')
        for entity in set(re.findall('&.+?;', content)):
            # entity_to_unicode is handed a match object whose first group is
            # the entity name without the surrounding '&' and ';'.
            mo = re.search('(%s)' % entity[1:-1], content)
            content = content.replace(entity, entity_to_unicode(mo))

        return content

    def cleanup_text(self, text):
        '''
        Normalise whitespace, collapse blank lines and expand the heading
        markers inserted by strip_html.
        '''
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        text = text.replace(u'\xc2', '')
        text = text.replace(u'\xa0', ' ')

        # Replace tabs, vertical tabs and form feeds with single space.
        # This has to be a regular expression: str.replace('\t+', ' ') only
        # matches a literal tab followed by a plus sign.
        text = re.sub('[\t\v\f]+', ' ', text)

        # Single line paragraph.
        text = re.sub('(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]+', ' ', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        text = re.sub('\n{3,}', '\n\n', text)

        # Replace markers with the proper characters.
        text = text.replace('-vzxedxy-', '\n\n\n\n\n')
        text = text.replace('-vlgzxey-', '\n\n\n')

        # Replace spaces at the beginning and end of lines
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        return text

    def remove_newlines(self, text):
        '''
        Return text with every '\\r\\n', '\\n' and '\\r' replaced by a space.
        '''
        self.log.debug('\tRemove newlines for processing...')
        # '\r\n' must go first so it becomes one space rather than two.
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')

        return text

    def specified_newlines(self, text):
        '''
        Return text with each '\\n' replaced by the configured newline.
        '''
        self.log.debug('\tReplacing newlines with selected type...')
        if self.newline == '\n':
            return text

        return text.replace('\n', self.newline)
|
||||
|
||||
|
||||
class TxtNewlines(object):
    '''
    Map a newline type name onto the line terminator it represents.

    The terminator is available as the ``newline`` attribute. Names are
    compared case-insensitively; unknown names give os.linesep.
    '''

    NEWLINE_TYPES = dict(
        system=os.linesep,
        unix='\n',
        old_mac='\r',
        windows='\r\n',
    )

    def __init__(self, newline_type):
        name = newline_type.lower()
        try:
            terminator = self.NEWLINE_TYPES[name]
        except KeyError:
            terminator = os.linesep
        self.newline = terminator
|
||||
|
Loading…
x
Reference in New Issue
Block a user