mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Move from Beautiful Soup to lxml in txt and fb2 output.
This commit is contained in:
parent
7b591f3d72
commit
7b121a42e3
@ -12,14 +12,14 @@ import os
|
|||||||
import re
|
import re
|
||||||
from base64 import b64encode
|
from base64 import b64encode
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
from calibre import entity_to_unicode
|
from calibre import entity_to_unicode
|
||||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||||
from calibre.constants import __appname__, __version__
|
from calibre.constants import __appname__, __version__
|
||||||
|
|
||||||
from BeautifulSoup import BeautifulSoup
|
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
'b' : 'strong',
|
'b' : 'strong',
|
||||||
'i' : 'emphasis',
|
'i' : 'emphasis',
|
||||||
@ -57,11 +57,10 @@ class FB2MLizer(object):
|
|||||||
output += self.fb2mlize_images()
|
output += self.fb2mlize_images()
|
||||||
output += self.fb2_footer()
|
output += self.fb2_footer()
|
||||||
output = self.clean_text(output)
|
output = self.clean_text(output)
|
||||||
return BeautifulSoup(output.encode('utf-8')).prettify()
|
return etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
|
||||||
|
|
||||||
def fb2_header(self):
|
def fb2_header(self):
|
||||||
return u'<?xml version="1.0" encoding="utf-8"?> ' \
|
return u'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
|
||||||
'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
|
|
||||||
'xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"> ' \
|
'xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"> ' \
|
||||||
'<description><title-info><book-title>%s</book-title> ' \
|
'<description><title-info><book-title>%s</book-title> ' \
|
||||||
'</title-info><document-info> ' \
|
'</title-info><document-info> ' \
|
||||||
|
@ -30,7 +30,7 @@ class FB2Output(OutputFormatPlugin):
|
|||||||
|
|
||||||
out_stream.seek(0)
|
out_stream.seek(0)
|
||||||
out_stream.truncate()
|
out_stream.truncate()
|
||||||
out_stream.write(fb2_content)
|
out_stream.write(fb2_content.encode('utf-8'))
|
||||||
|
|
||||||
if close:
|
if close:
|
||||||
out_stream.close()
|
out_stream.close()
|
||||||
|
@ -11,9 +11,10 @@ Write content to TXT.
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from calibre import entity_to_unicode
|
from lxml import etree
|
||||||
|
|
||||||
from BeautifulSoup import BeautifulSoup
|
from calibre import entity_to_unicode
|
||||||
|
from calibre.ebooks.oeb.base import XHTML
|
||||||
|
|
||||||
class TxtWriter(object):
|
class TxtWriter(object):
|
||||||
def __init__(self, newline, log):
|
def __init__(self, newline, log):
|
||||||
@ -23,10 +24,8 @@ class TxtWriter(object):
|
|||||||
def dump(self, spine):
|
def dump(self, spine):
|
||||||
out = u''
|
out = u''
|
||||||
for item in spine:
|
for item in spine:
|
||||||
content = unicode(item)
|
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
|
||||||
# Convert newlines to unix style \n for processing. These
|
content = self.remove_newlines(content)
|
||||||
# will be changed to the specified type later in the process.
|
|
||||||
content = self.unix_newlines(content)
|
|
||||||
content = self.strip_html(content)
|
content = self.strip_html(content)
|
||||||
content = self.replace_html_symbols(content)
|
content = self.replace_html_symbols(content)
|
||||||
content = self.cleanup_text(content)
|
content = self.cleanup_text(content)
|
||||||
@ -40,95 +39,92 @@ class TxtWriter(object):
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def strip_html(self, html):
|
def strip_html(self, text):
|
||||||
stripped = u''
|
stripped = u''
|
||||||
|
|
||||||
for dom_tree in BeautifulSoup(html).findAll('body'):
|
|
||||||
text = unicode(dom_tree)
|
|
||||||
|
|
||||||
# Remove unnecessary tags
|
|
||||||
for tag in ['script', 'style']:
|
|
||||||
text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
|
|
||||||
text = re.sub('<!--.*-->', '', text)
|
|
||||||
text = re.sub('<\?.*?\?>', '', text)
|
|
||||||
text = re.sub('<@.*?@>', '', text)
|
|
||||||
text = re.sub('<%.*?%>', '', text)
|
|
||||||
|
|
||||||
# Headings usually indicate Chapters.
|
# Remove unnecessary tags
|
||||||
# We are going to use a marker to insert the proper number of
|
for tag in ['script', 'style']:
|
||||||
# newline characters at the end of cleanup_text because cleanup_text
|
text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
|
||||||
# removes excessive newlines (more than 2).
|
text = re.sub('<!--.*-->', '', text)
|
||||||
for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
text = re.sub('<\?.*?\?>', '', text)
|
||||||
text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
|
text = re.sub('<@.*?@>', '', text)
|
||||||
text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)
|
text = re.sub('<%.*?%>', '', text)
|
||||||
|
|
||||||
|
# Headings usually indicate Chapters.
|
||||||
|
# We are going to use a marker to insert the proper number of
|
||||||
|
# newline characters at the end of cleanup_text because cleanup_text
|
||||||
|
# removes excessive newlines (more than 2).
|
||||||
|
for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||||
|
text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
|
||||||
|
text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)
|
||||||
|
|
||||||
|
# Separate content with space.
|
||||||
|
for tag in ['td']:
|
||||||
|
text = re.sub('(?imu)</[ ]*%s[ ]*>', ' ', text)
|
||||||
|
|
||||||
|
# Separate content with empty line.
|
||||||
|
for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
|
||||||
|
text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
|
||||||
|
|
||||||
|
for tag in ['hr', 'br']:
|
||||||
|
text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
|
||||||
|
|
||||||
|
# Remove any tags that do not need special processing.
|
||||||
|
text = re.sub('<.*?>', '', text)
|
||||||
|
|
||||||
|
stripped = stripped + text
|
||||||
|
|
||||||
# Separate content with space.
|
|
||||||
for tag in ['td']:
|
|
||||||
text = re.sub('(?imu)</[ ]*%s[ ]*>', ' ', text)
|
|
||||||
|
|
||||||
# Separate content with empty line.
|
|
||||||
for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
|
|
||||||
text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
|
|
||||||
|
|
||||||
for tag in ['hr', 'br']:
|
|
||||||
text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
|
|
||||||
|
|
||||||
# Remove any tags that do not need special processing.
|
|
||||||
text = re.sub('<.*?>', '', text)
|
|
||||||
|
|
||||||
stripped = stripped + text
|
|
||||||
|
|
||||||
return stripped
|
return stripped
|
||||||
|
|
||||||
def replace_html_symbols(self, content):
|
def replace_html_symbols(self, content):
|
||||||
for entity in set(re.findall('&.+?;', content)):
|
for entity in set(re.findall('&.+?;', content)):
|
||||||
mo = re.search('(%s)' % entity[1:-1], content)
|
mo = re.search('(%s)' % entity[1:-1], content)
|
||||||
content = content.replace(entity, entity_to_unicode(mo))
|
content = content.replace(entity, entity_to_unicode(mo))
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
def cleanup_text(self, text):
|
def cleanup_text(self, text):
|
||||||
# Replace bad characters.
|
# Replace bad characters.
|
||||||
text = text.replace(u'\xc2', '')
|
text = text.replace(u'\xc2', '')
|
||||||
text = text.replace(u'\xa0', ' ')
|
text = text.replace(u'\xa0', ' ')
|
||||||
|
|
||||||
# Replace tabs, vertical tabs and form feeds with single space.
|
# Replace tabs, vertical tabs and form feeds with single space.
|
||||||
text = text.replace('\t+', ' ')
|
text = text.replace('\t+', ' ')
|
||||||
text = text.replace('\v+', ' ')
|
text = text.replace('\v+', ' ')
|
||||||
text = text.replace('\f+', ' ')
|
text = text.replace('\f+', ' ')
|
||||||
|
|
||||||
# Single line paragraph.
|
# Single line paragraph.
|
||||||
text = re.sub('(?<=.)\n(?=.)', ' ', text)
|
text = re.sub('(?<=.)\n(?=.)', ' ', text)
|
||||||
|
|
||||||
# Remove multiple spaces.
|
# Remove multiple spaces.
|
||||||
text = re.sub('[ ]+', ' ', text)
|
text = re.sub('[ ]+', ' ', text)
|
||||||
|
|
||||||
# Remove excessive newlines.
|
# Remove excessive newlines.
|
||||||
text = re.sub('\n[ ]+\n', '\n\n', text)
|
text = re.sub('\n[ ]+\n', '\n\n', text)
|
||||||
text = re.sub('\n{3,}', '\n\n', text)
|
text = re.sub('\n{3,}', '\n\n', text)
|
||||||
|
|
||||||
# Replace markers with the proper characters.
|
# Replace markers with the proper characters.
|
||||||
text = text.replace('-vzxedxy-', '\n\n\n\n\n')
|
text = text.replace('-vzxedxy-', '\n\n\n\n\n')
|
||||||
text = text.replace('-vlgzxey-', '\n\n\n')
|
text = text.replace('-vlgzxey-', '\n\n\n')
|
||||||
|
|
||||||
# Replace spaces at the beginning and end of lines
|
# Replace spaces at the beginning and end of lines
|
||||||
text = re.sub('(?imu)^[ ]+', '', text)
|
text = re.sub('(?imu)^[ ]+', '', text)
|
||||||
text = re.sub('(?imu)[ ]+$', '', text)
|
text = re.sub('(?imu)[ ]+$', '', text)
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def unix_newlines(self, text):
|
def remove_newlines(self, text):
|
||||||
text = text.replace('\r\n', ' ')
|
text = text.replace('\r\n', ' ')
|
||||||
text = text.replace('\n', ' ')
|
text = text.replace('\n', ' ')
|
||||||
text = text.replace('\r', ' ')
|
text = text.replace('\r', ' ')
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def specified_newlines(self, text):
|
def specified_newlines(self, text):
|
||||||
if self.newline == '\n':
|
if self.newline == '\n':
|
||||||
return text
|
return text
|
||||||
|
|
||||||
return text.replace('\n', self.newline)
|
return text.replace('\n', self.newline)
|
||||||
|
|
||||||
|
|
||||||
class TxtNewlines(object):
|
class TxtNewlines(object):
|
||||||
@ -138,7 +134,7 @@ class TxtNewlines(object):
|
|||||||
'old_mac' : '\r',
|
'old_mac' : '\r',
|
||||||
'windows' : '\r\n'
|
'windows' : '\r\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, newline_type):
|
def __init__(self, newline_type):
|
||||||
self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)
|
self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user