mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Pull from driver-dev
This commit is contained in:
commit
51bbd45551
@ -12,19 +12,19 @@ import os
|
||||
import re
|
||||
from base64 import b64encode
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre import entity_to_unicode
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||
from calibre.constants import __appname__, __version__
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
TAG_MAP = {
|
||||
'b' : 'strong',
|
||||
'i' : 'emphasis',
|
||||
'p' : 'p',
|
||||
'div' : 'p',
|
||||
'li' : 'p'
|
||||
}
|
||||
|
||||
STYLES = [
|
||||
@ -57,11 +57,10 @@ class FB2MLizer(object):
|
||||
output += self.fb2mlize_images()
|
||||
output += self.fb2_footer()
|
||||
output = self.clean_text(output)
|
||||
return BeautifulSoup(output.encode('utf-8')).prettify()
|
||||
return etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
|
||||
|
||||
def fb2_header(self):
|
||||
return u'<?xml version="1.0" encoding="utf-8"?> ' \
|
||||
'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
|
||||
return u'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
|
||||
'xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"> ' \
|
||||
'<description><title-info><book-title>%s</book-title> ' \
|
||||
'</title-info><document-info> ' \
|
||||
@ -110,12 +109,13 @@ class FB2MLizer(object):
|
||||
fb2_text += '<image xlink:herf="#%s" />' % os.path.basename(elem.attrib['src'])
|
||||
|
||||
|
||||
fb2_tag = TAG_MAP.get(tag, 'p')
|
||||
fb2_tag = TAG_MAP.get(tag, None)
|
||||
if fb2_tag and fb2_tag not in tag_stack:
|
||||
tag_count += 1
|
||||
fb2_text += '<%s>' % fb2_tag
|
||||
tag_stack.append(fb2_tag)
|
||||
|
||||
|
||||
# Processes style information
|
||||
for s in STYLES:
|
||||
style_tag = s[1].get(style[s[0]], None)
|
||||
@ -133,7 +133,6 @@ class FB2MLizer(object):
|
||||
close_tag_list = []
|
||||
for i in range(0, tag_count):
|
||||
close_tag_list.insert(0, tag_stack.pop())
|
||||
|
||||
fb2_text += self.close_tags(close_tag_list)
|
||||
|
||||
if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '':
|
||||
@ -151,4 +150,3 @@ class FB2MLizer(object):
|
||||
fb2_text += '</%s>' % fb2_tag
|
||||
|
||||
return fb2_text
|
||||
|
||||
|
@ -30,7 +30,7 @@ class FB2Output(OutputFormatPlugin):
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
out_stream.write(fb2_content)
|
||||
out_stream.write(fb2_content.encode('utf-8'))
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
|
@ -11,9 +11,10 @@ Write content to TXT.
|
||||
import os
|
||||
import re
|
||||
|
||||
from calibre import entity_to_unicode
|
||||
from lxml import etree
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
from calibre import entity_to_unicode
|
||||
from calibre.ebooks.oeb.base import XHTML
|
||||
|
||||
class TxtWriter(object):
|
||||
def __init__(self, newline, log):
|
||||
@ -23,10 +24,8 @@ class TxtWriter(object):
|
||||
def dump(self, spine):
|
||||
out = u''
|
||||
for item in spine:
|
||||
content = unicode(item)
|
||||
# Convert newlines to unix style \n for processing. These
|
||||
# will be changed to the specified type later in the process.
|
||||
content = self.unix_newlines(content)
|
||||
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
|
||||
content = self.remove_newlines(content)
|
||||
content = self.strip_html(content)
|
||||
content = self.replace_html_symbols(content)
|
||||
content = self.cleanup_text(content)
|
||||
@ -40,12 +39,9 @@ class TxtWriter(object):
|
||||
|
||||
return out
|
||||
|
||||
def strip_html(self, html):
|
||||
def strip_html(self, text):
|
||||
stripped = u''
|
||||
|
||||
for dom_tree in BeautifulSoup(html).findAll('body'):
|
||||
text = unicode(dom_tree)
|
||||
|
||||
# Remove unnecessary tags
|
||||
for tag in ['script', 'style']:
|
||||
text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
|
||||
@ -117,7 +113,7 @@ class TxtWriter(object):
|
||||
|
||||
return text
|
||||
|
||||
def unix_newlines(self, text):
|
||||
def remove_newlines(self, text):
|
||||
text = text.replace('\r\n', ' ')
|
||||
text = text.replace('\n', ' ')
|
||||
text = text.replace('\r', ' ')
|
||||
|
Loading…
x
Reference in New Issue
Block a user