mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
[SNBOutput] Reuse the original html->txt algorithm in txtml.py to get better output. Removed some unnecessary debug prints.
This commit is contained in:
parent
b4c69ba634
commit
e72c3ce0f8
@ -203,7 +203,6 @@ class SNBOutput(OutputFormatPlugin):
|
|||||||
from calibre.utils.magick import Image
|
from calibre.utils.magick import Image
|
||||||
img = Image()
|
img = Image()
|
||||||
img.load(imageData)
|
img.load(imageData)
|
||||||
print img.size
|
|
||||||
(x,y) = img.size
|
(x,y) = img.size
|
||||||
# TODO use the data from device profile
|
# TODO use the data from device profile
|
||||||
SCREEN_X = 540
|
SCREEN_X = 540
|
||||||
@ -219,7 +218,6 @@ class SNBOutput(OutputFormatPlugin):
|
|||||||
# img = img.rotate(90)
|
# img = img.rotate(90)
|
||||||
# x,y = y,x
|
# x,y = y,x
|
||||||
img.size = (x / scale, y / scale)
|
img.size = (x / scale, y / scale)
|
||||||
print img.size
|
|
||||||
img.save(imagePath)
|
img.save(imagePath)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -135,7 +135,6 @@ class SNBFile:
|
|||||||
def FromDir(self, tdir):
|
def FromDir(self, tdir):
|
||||||
for root, dirs, files in os.walk(tdir):
|
for root, dirs, files in os.walk(tdir):
|
||||||
for name in files:
|
for name in files:
|
||||||
print name
|
|
||||||
p, ext = os.path.splitext(name)
|
p, ext = os.path.splitext(name)
|
||||||
if ext in [ ".snbf", ".snbc" ]:
|
if ext in [ ".snbf", ".snbc" ]:
|
||||||
self.AppendPlain(os.path.relpath(os.path.join(root, name), tdir), tdir)
|
self.AppendPlain(os.path.relpath(os.path.join(root, name), tdir), tdir)
|
||||||
@ -148,7 +147,6 @@ class SNBFile:
|
|||||||
f.fileSize = os.path.getsize(os.path.join(tdir,fileName))
|
f.fileSize = os.path.getsize(os.path.join(tdir,fileName))
|
||||||
f.fileBody = open(os.path.join(tdir,fileName), 'rb').read()
|
f.fileBody = open(os.path.join(tdir,fileName), 'rb').read()
|
||||||
f.fileName = fileName.replace(os.sep, '/')
|
f.fileName = fileName.replace(os.sep, '/')
|
||||||
print f.fileSize
|
|
||||||
self.files.append(f)
|
self.files.append(f)
|
||||||
|
|
||||||
def AppendBinary(self, fileName, tdir):
|
def AppendBinary(self, fileName, tdir):
|
||||||
@ -157,7 +155,6 @@ class SNBFile:
|
|||||||
f.fileSize = os.path.getsize(os.path.join(tdir,fileName))
|
f.fileSize = os.path.getsize(os.path.join(tdir,fileName))
|
||||||
f.fileBody = open(os.path.join(tdir,fileName), 'rb').read()
|
f.fileBody = open(os.path.join(tdir,fileName), 'rb').read()
|
||||||
f.fileName = fileName.replace(os.sep, '/')
|
f.fileName = fileName.replace(os.sep, '/')
|
||||||
print f.fileSize
|
|
||||||
self.files.append(f)
|
self.files.append(f)
|
||||||
|
|
||||||
def Output(self, outputFile):
|
def Output(self, outputFile):
|
||||||
|
@ -51,12 +51,13 @@ SPACE_TAGS = [
|
|||||||
'td',
|
'td',
|
||||||
]
|
]
|
||||||
|
|
||||||
CLIABRE_SNB_IMG_TAG = "<calibre_snb_temp_img>"
|
CALIBRE_SNB_IMG_TAG = "<$$calibre_snb_temp_img$$>"
|
||||||
|
CALIBRE_SNB_BM_TAG = "<$$calibre_snb_bm_tag$$>"
|
||||||
|
|
||||||
class SNBMLizer(object):
|
class SNBMLizer(object):
|
||||||
|
|
||||||
curSubItem = ""
|
curSubItem = ""
|
||||||
curText = [ ]
|
# curText = [ ]
|
||||||
|
|
||||||
def __init__(self, log):
|
def __init__(self, log):
|
||||||
self.log = log
|
self.log = log
|
||||||
@ -71,6 +72,7 @@ class SNBMLizer(object):
|
|||||||
|
|
||||||
|
|
||||||
def mlize(self):
|
def mlize(self):
|
||||||
|
output = [ u'' ]
|
||||||
stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||||
content = unicode(etree.tostring(self.item.data.find(XHTML('body')), encoding=unicode))
|
content = unicode(etree.tostring(self.item.data.find(XHTML('body')), encoding=unicode))
|
||||||
content = self.remove_newlines(content)
|
content = self.remove_newlines(content)
|
||||||
@ -80,9 +82,20 @@ class SNBMLizer(object):
|
|||||||
etree.SubElement(etree.SubElement(snbcTree, "head"), "title").text = subtitle
|
etree.SubElement(etree.SubElement(snbcTree, "head"), "title").text = subtitle
|
||||||
etree.SubElement(snbcTree, "body")
|
etree.SubElement(snbcTree, "body")
|
||||||
trees[subitem] = snbcTree
|
trees[subitem] = snbcTree
|
||||||
|
output.append(u'%s%s\n\n' % (CALIBRE_SNB_BM_TAG, ""))
|
||||||
|
output += self.dump_text(self.subitems, etree.fromstring(content), stylizer)
|
||||||
|
output = self.cleanup_text(u''.join(output))
|
||||||
|
|
||||||
self.dump_text(trees, self.subitems, etree.fromstring(content), stylizer)
|
subitem = ''
|
||||||
self.Output(trees)
|
for line in output.splitlines():
|
||||||
|
line = line.strip(' \t\n\r')
|
||||||
|
if len(line) != 0:
|
||||||
|
if line.find(CALIBRE_SNB_IMG_TAG) == 0:
|
||||||
|
etree.SubElement(trees[subitem], "img").text = line[len(CALIBRE_SNB_IMG_TAG):]
|
||||||
|
elif line.find(CALIBRE_SNB_BM_TAG) == 0:
|
||||||
|
subitem = line[len(CALIBRE_SNB_BM_TAG):]
|
||||||
|
else:
|
||||||
|
etree.SubElement(trees[subitem], "text").text = etree.CDATA(unicode(u'\u3000\u3000' + line))
|
||||||
return trees
|
return trees
|
||||||
|
|
||||||
def remove_newlines(self, text):
|
def remove_newlines(self, text):
|
||||||
@ -93,25 +106,86 @@ class SNBMLizer(object):
|
|||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def dump_text(self, trees, subitems, elem, stylizer, end=''):
|
def cleanup_text(self, text):
|
||||||
'''
|
self.log.debug('\tClean up text...')
|
||||||
@elem: The element in the etree that we are working on.
|
# Replace bad characters.
|
||||||
@stylizer: The style information attached to the element.
|
text = text.replace(u'\xc2', '')
|
||||||
@end: The last two characters of the text from the previous element.
|
text = text.replace(u'\xa0', ' ')
|
||||||
This is used to determine if a blank line is needed when starting
|
text = text.replace(u'\xa9', '(C)')
|
||||||
a new block element.
|
|
||||||
'''
|
# Replace tabs, vertical tabs and form feeds with a single space.
|
||||||
|
text = text.replace('\t+', ' ')
|
||||||
|
text = text.replace('\v+', ' ')
|
||||||
|
text = text.replace('\f+', ' ')
|
||||||
|
|
||||||
|
# Single line paragraph.
|
||||||
|
text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)
|
||||||
|
|
||||||
|
# Remove multiple spaces.
|
||||||
|
text = re.sub('[ ]{2,}', ' ', text)
|
||||||
|
|
||||||
|
# Remove excessive newlines.
|
||||||
|
text = re.sub('\n[ ]+\n', '\n\n', text)
|
||||||
|
if self.opts.remove_paragraph_spacing:
|
||||||
|
text = re.sub('\n{2,}', '\n', text)
|
||||||
|
text = re.sub('(?imu)^(?=.)', '\t', text)
|
||||||
|
else:
|
||||||
|
text = re.sub('\n{3,}', '\n\n', text)
|
||||||
|
|
||||||
|
# Replace spaces at the beginning and end of lines
|
||||||
|
text = re.sub('(?imu)^[ ]+', '', text)
|
||||||
|
text = re.sub('(?imu)[ ]+$', '', text)
|
||||||
|
|
||||||
|
if self.opts.max_line_length:
|
||||||
|
max_length = self.opts.max_line_length
|
||||||
|
if self.opts.max_line_length < 25 and not self.opts.force_max_line_length:
|
||||||
|
max_length = 25
|
||||||
|
short_lines = []
|
||||||
|
lines = text.splitlines()
|
||||||
|
for line in lines:
|
||||||
|
while len(line) > max_length:
|
||||||
|
space = line.rfind(' ', 0, max_length)
|
||||||
|
if space != -1:
|
||||||
|
# Space was found.
|
||||||
|
short_lines.append(line[:space])
|
||||||
|
line = line[space + 1:]
|
||||||
|
else:
|
||||||
|
# Space was not found.
|
||||||
|
if self.opts.force_max_line_length:
|
||||||
|
# Force breaking at max_length.
|
||||||
|
short_lines.append(line[:max_length])
|
||||||
|
line = line[max_length:]
|
||||||
|
else:
|
||||||
|
# Look for the first space after max_length.
|
||||||
|
space = line.find(' ', max_length, len(line))
|
||||||
|
if space != -1:
|
||||||
|
# Space was found.
|
||||||
|
short_lines.append(line[:space])
|
||||||
|
line = line[space + 1:]
|
||||||
|
else:
|
||||||
|
# No space was found cannot break line.
|
||||||
|
short_lines.append(line)
|
||||||
|
line = ''
|
||||||
|
# Add the text that was less than max_length to the list.
|
||||||
|
short_lines.append(line)
|
||||||
|
text = '\n'.join(short_lines)
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
def dump_text(self, subitems, elem, stylizer, end=''):
|
||||||
|
|
||||||
if not isinstance(elem.tag, basestring) \
|
if not isinstance(elem.tag, basestring) \
|
||||||
or namespace(elem.tag) != XHTML_NS:
|
or namespace(elem.tag) != XHTML_NS:
|
||||||
return ['']
|
return ['']
|
||||||
|
|
||||||
|
|
||||||
|
text = ['']
|
||||||
|
style = stylizer.style(elem)
|
||||||
|
|
||||||
if elem.attrib.get('id') != None and elem.attrib['id'] in [ href for href, title in subitems ]:
|
if elem.attrib.get('id') != None and elem.attrib['id'] in [ href for href, title in subitems ]:
|
||||||
if self.curSubItem != None and self.curSubItem != elem.attrib['id']:
|
if self.curSubItem != None and self.curSubItem != elem.attrib['id']:
|
||||||
self.Output(trees)
|
|
||||||
self.curSubItem = elem.attrib['id']
|
self.curSubItem = elem.attrib['id']
|
||||||
self.curText = [ ]
|
text.append(u'%s%s\n\n' % (CALIBRE_SNB_BM_TAG, self.curSubItem))
|
||||||
|
|
||||||
style = stylizer.style(elem)
|
|
||||||
|
|
||||||
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
|
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
|
||||||
or style['visibility'] == 'hidden':
|
or style['visibility'] == 'hidden':
|
||||||
@ -124,37 +198,29 @@ class SNBMLizer(object):
|
|||||||
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
|
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
|
||||||
in_block = True
|
in_block = True
|
||||||
if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
|
if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
|
||||||
self.curText.append(u'\n\n')
|
text.append(u'\n\n')
|
||||||
|
|
||||||
if tag in SPACE_TAGS:
|
if tag in SPACE_TAGS:
|
||||||
if not end.endswith('u ') and hasattr(elem, 'text') and elem.text:
|
if not end.endswith('u ') and hasattr(elem, 'text') and elem.text:
|
||||||
self.curText.append(u' ')
|
text.append(u' ')
|
||||||
|
|
||||||
if tag == 'img':
|
if tag == 'img':
|
||||||
self.curText.append(u'%s%s' % (CLIABRE_SNB_IMG_TAG, ProcessFileName(elem.attrib['src'])))
|
text.append(u'%s%s' % (CALIBRE_SNB_IMG_TAG, ProcessFileName(elem.attrib['src'])))
|
||||||
|
|
||||||
# Process tags that contain text.
|
# Process tags that contain text.
|
||||||
if hasattr(elem, 'text') and elem.text:
|
if hasattr(elem, 'text') and elem.text:
|
||||||
self.curText.append(elem.text)
|
text.append(elem.text)
|
||||||
|
|
||||||
for item in elem:
|
for item in elem:
|
||||||
en = u''
|
en = u''
|
||||||
if len(self.curText) >= 2:
|
if len(text) >= 2:
|
||||||
en = self.curText[-1][-2:]
|
en = text[-1][-2:]
|
||||||
self.dump_text(trees, subitems, item, stylizer, en)
|
text += self.dump_text(subitems, item, stylizer, en)
|
||||||
|
|
||||||
if in_block:
|
if in_block:
|
||||||
self.curText.append(u'\n\n')
|
text.append(u'\n\n')
|
||||||
|
|
||||||
if hasattr(elem, 'tail') and elem.tail:
|
if hasattr(elem, 'tail') and elem.tail:
|
||||||
self.curText.append(elem.tail)
|
text.append(elem.tail)
|
||||||
|
|
||||||
def Output(self, trees):
|
return text
|
||||||
if self.curSubItem == None or not self.curSubItem in trees:
|
|
||||||
return
|
|
||||||
for t in self.curText:
|
|
||||||
if len(t.strip(' \t\n\r')) != 0:
|
|
||||||
if t.find(CLIABRE_SNB_IMG_TAG) == 0:
|
|
||||||
etree.SubElement(trees[self.curSubItem], "img").text = t[len(CLIABRE_SNB_IMG_TAG):]
|
|
||||||
else:
|
|
||||||
etree.SubElement(trees[self.curSubItem], "text").text = etree.CDATA(unicode(u'\u3000\u3000' + t))
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user