mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Start of new HTML to text parser.
This commit is contained in:
parent
1fbf2cee84
commit
c354272030
@ -38,7 +38,7 @@ PML_HTML_RULES = [
|
|||||||
(re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
|
(re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
|
||||||
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
|
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
|
||||||
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
|
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
|
||||||
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
|
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
|
||||||
(re.compile(r'\\-'), lambda match: ''),
|
(re.compile(r'\\-'), lambda match: ''),
|
||||||
(re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
|
(re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
|
||||||
(re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),
|
(re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),
|
||||||
|
@ -32,8 +32,12 @@ class TXTOutput(OutputFormatPlugin):
|
|||||||
])
|
])
|
||||||
|
|
||||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||||
writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
|
# writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
|
||||||
txt = writer.dump(oeb_book.spine)
|
# txt = writer.dump(oeb_book.spine)
|
||||||
|
|
||||||
|
from calibre.ebooks.txt.txtml import TXTMLizer
|
||||||
|
writer = TXTMLizer(log)
|
||||||
|
txt = writer.extract_content(oeb_book, opts)
|
||||||
|
|
||||||
close = False
|
close = False
|
||||||
if not hasattr(output_path, 'write'):
|
if not hasattr(output_path, 'write'):
|
||||||
|
98
src/calibre/ebooks/txt/txtml.py
Normal file
98
src/calibre/ebooks/txt/txtml.py
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
'''
|
||||||
|
Transform OEB content into plain text
|
||||||
|
'''
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||||
|
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||||
|
|
||||||
|
# Tag names (without namespace) that are always treated as block-level:
# TXTMLizer.dump_text() follows the content of such an element with a
# blank line.
BLOCK_TAGS = [
    'div',
    'p',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'li',
]

# Values of the computed CSS ``display`` property that make an element
# block-level even when its tag is not listed in BLOCK_TAGS.
BLOCK_STYLES = [
    'block',
]
|
||||||
|
|
||||||
|
class TXTMLizer(object):
    '''
    Turn the XHTML documents in an OEB book's spine into plain text.

    Create an instance with a log object and call extract_content().
    '''

    def __init__(self, log):
        '''
        :param log: Object providing ``info()`` and ``debug()`` methods.
        '''
        self.log = log

    def extract_content(self, oeb_book, opts):
        '''
        Convert every spine item of ``oeb_book`` and return the combined text.

        :param oeb_book: Book whose ``spine`` is iterated.
        :param opts: Conversion options; ``opts.output_profile`` is passed
                     to the Stylizer of each spine item.
        :return: The text of all spine items concatenated in spine order.
        '''
        # This is the TXT converter; the message previously claimed PML.
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Convert each spine item's ``<body>`` to text and join the results.

        Requires ``self.oeb_book`` and ``self.opts`` to have been set by
        extract_content().
        '''
        pieces = []
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to TXT...' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            # Serialize the body, flatten line breaks and re-parse, so that
            # newlines in the markup never reach the text output.
            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
            content = self.remove_newlines(content)
            pieces.append(self.dump_text(etree.fromstring(content), stylizer))

        return u''.join(pieces)

    def remove_newlines(self, text):
        '''
        Return ``text`` with every line break replaced by a single space.

        ``\\r\\n`` is handled first so a Windows line ending becomes one
        space rather than two.
        '''
        self.log.debug('\tRemove newlines for processing...')
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')

        return text

    def _tail_text(self, elem):
        '''
        Return the text following ``elem`` inside its parent, or ``u''``
        when there is none or it is only whitespace.
        '''
        tail = getattr(elem, 'tail', None)
        if tail is not None and tail.strip() != '':
            return tail
        return u''

    def dump_text(self, elem, stylizer):
        '''
        Recursively convert ``elem`` and its children to text.

        :param elem: lxml node to convert.
        :param stylizer: Stylizer used to look up the computed style of
                         each element.
        :return: Text of the element, followed by a blank line when the
                 element is block-level, followed by the element's tail.
        '''
        # Comments, processing instructions and non-XHTML elements have no
        # content of their own, but the tail that follows them is part of
        # the parent's visible text and must not be lost.
        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            return self._tail_text(elem)

        style = stylizer.style(elem)

        # Hidden elements are skipped; their tail is still visible text.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            return self._tail_text(elem)

        tag = barename(elem.tag)

        # Are we in a paragraph block?
        in_block = tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES

        parts = []

        # Process tags that contain text.
        # NOTE(review): whitespace-only text and tails are discarded (as
        # before), so a lone space between two inline elements is lost --
        # confirm whether that is intended.
        if elem.text is not None and elem.text.strip() != '':
            parts.append(elem.text)

        for item in elem:
            parts.append(self.dump_text(item, stylizer))

        if in_block:
            parts.append(os.linesep + os.linesep)

        parts.append(self._tail_text(elem))

        return u''.join(parts)
|
Loading…
x
Reference in New Issue
Block a user