Move from Beautiful Soup to lxml in txt and fb2 output.

This commit is contained in:
John Schember 2009-06-06 18:20:06 -04:00
parent 7b591f3d72
commit 7b121a42e3
3 changed files with 57 additions and 62 deletions

View File

@ -12,14 +12,14 @@ import os
import re import re
from base64 import b64encode from base64 import b64encode
from lxml import etree
from calibre import entity_to_unicode from calibre import entity_to_unicode
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.constants import __appname__, __version__ from calibre.constants import __appname__, __version__
from BeautifulSoup import BeautifulSoup
TAG_MAP = { TAG_MAP = {
'b' : 'strong', 'b' : 'strong',
'i' : 'emphasis', 'i' : 'emphasis',
@ -57,11 +57,10 @@ class FB2MLizer(object):
output += self.fb2mlize_images() output += self.fb2mlize_images()
output += self.fb2_footer() output += self.fb2_footer()
output = self.clean_text(output) output = self.clean_text(output)
return BeautifulSoup(output.encode('utf-8')).prettify() return etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
def fb2_header(self): def fb2_header(self):
return u'<?xml version="1.0" encoding="utf-8"?> ' \ return u'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
'xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"> ' \ 'xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"> ' \
'<description><title-info><book-title>%s</book-title> ' \ '<description><title-info><book-title>%s</book-title> ' \
'</title-info><document-info> ' \ '</title-info><document-info> ' \

View File

@ -30,7 +30,7 @@ class FB2Output(OutputFormatPlugin):
out_stream.seek(0) out_stream.seek(0)
out_stream.truncate() out_stream.truncate()
out_stream.write(fb2_content) out_stream.write(fb2_content.encode('utf-8'))
if close: if close:
out_stream.close() out_stream.close()

View File

@ -11,9 +11,10 @@ Write content to TXT.
import os import os
import re import re
from calibre import entity_to_unicode from lxml import etree
from BeautifulSoup import BeautifulSoup from calibre import entity_to_unicode
from calibre.ebooks.oeb.base import XHTML
class TxtWriter(object): class TxtWriter(object):
def __init__(self, newline, log): def __init__(self, newline, log):
@ -23,10 +24,8 @@ class TxtWriter(object):
def dump(self, spine): def dump(self, spine):
out = u'' out = u''
for item in spine: for item in spine:
content = unicode(item) content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
# Convert newlines to unix style \n for processing. These content = self.remove_newlines(content)
# will be changed to the specified type later in the process.
content = self.unix_newlines(content)
content = self.strip_html(content) content = self.strip_html(content)
content = self.replace_html_symbols(content) content = self.replace_html_symbols(content)
content = self.cleanup_text(content) content = self.cleanup_text(content)
@ -40,95 +39,92 @@ class TxtWriter(object):
return out return out
def strip_html(self, html): def strip_html(self, text):
stripped = u'' stripped = u''
for dom_tree in BeautifulSoup(html).findAll('body'):
text = unicode(dom_tree)
# Remove unnecessary tags
for tag in ['script', 'style']:
text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
text = re.sub('<!--.*-->', '', text)
text = re.sub('<\?.*?\?>', '', text)
text = re.sub('<@.*?@>', '', text)
text = re.sub('<%.*?%>', '', text)
# Headings usually indicate Chapters. # Remove unnecessary tags
# We are going to use a marker to insert the proper number of for tag in ['script', 'style']:
# newline characters at the end of cleanup_text because cleanup_text text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
# remove excessive (more than 2 newlines). text = re.sub('<!--.*-->', '', text)
for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: text = re.sub('<\?.*?\?>', '', text)
text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text) text = re.sub('<@.*?@>', '', text)
text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text) text = re.sub('<%.*?%>', '', text)
# Headings usually indicate Chapters.
# We are going to use a marker to insert the proper number of
# newline characters at the end of cleanup_text because cleanup_text
# remove excessive (more than 2 newlines).
for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)
# Separate content with space.
for tag in ['td']:
text = re.sub('(?imu)</[ ]*%s[ ]*>', ' ', text)
# Separate content with empty line.
for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
for tag in ['hr', 'br']:
text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
# Remove any tags that do not need special processing.
text = re.sub('<.*?>', '', text)
stripped = stripped + text
# Separate content with space.
for tag in ['td']:
text = re.sub('(?imu)</[ ]*%s[ ]*>', ' ', text)
# Separate content with empty line.
for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
for tag in ['hr', 'br']:
text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
# Remove any tags that do not need special processing.
text = re.sub('<.*?>', '', text)
stripped = stripped + text
return stripped return stripped
def replace_html_symbols(self, content): def replace_html_symbols(self, content):
for entity in set(re.findall('&.+?;', content)): for entity in set(re.findall('&.+?;', content)):
mo = re.search('(%s)' % entity[1:-1], content) mo = re.search('(%s)' % entity[1:-1], content)
content = content.replace(entity, entity_to_unicode(mo)) content = content.replace(entity, entity_to_unicode(mo))
return content return content
def cleanup_text(self, text): def cleanup_text(self, text):
# Replace bad characters. # Replace bad characters.
text = text.replace(u'\xc2', '') text = text.replace(u'\xc2', '')
text = text.replace(u'\xa0', ' ') text = text.replace(u'\xa0', ' ')
# Replace tabs, vertical tags and form feeds with single space. # Replace tabs, vertical tags and form feeds with single space.
text = text.replace('\t+', ' ') text = text.replace('\t+', ' ')
text = text.replace('\v+', ' ') text = text.replace('\v+', ' ')
text = text.replace('\f+', ' ') text = text.replace('\f+', ' ')
# Single line paragraph. # Single line paragraph.
text = re.sub('(?<=.)\n(?=.)', ' ', text) text = re.sub('(?<=.)\n(?=.)', ' ', text)
# Remove multiple spaces. # Remove multiple spaces.
text = re.sub('[ ]+', ' ', text) text = re.sub('[ ]+', ' ', text)
# Remove excessive newlines. # Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text) text = re.sub('\n[ ]+\n', '\n\n', text)
text = re.sub('\n{3,}', '\n\n', text) text = re.sub('\n{3,}', '\n\n', text)
# Replace markers with the proper characters. # Replace markers with the proper characters.
text = text.replace('-vzxedxy-', '\n\n\n\n\n') text = text.replace('-vzxedxy-', '\n\n\n\n\n')
text = text.replace('-vlgzxey-', '\n\n\n') text = text.replace('-vlgzxey-', '\n\n\n')
# Replace spaces at the beginning and end of lines # Replace spaces at the beginning and end of lines
text = re.sub('(?imu)^[ ]+', '', text) text = re.sub('(?imu)^[ ]+', '', text)
text = re.sub('(?imu)[ ]+$', '', text) text = re.sub('(?imu)[ ]+$', '', text)
return text return text
def unix_newlines(self, text): def remove_newlines(self, text):
text = text.replace('\r\n', ' ') text = text.replace('\r\n', ' ')
text = text.replace('\n', ' ') text = text.replace('\n', ' ')
text = text.replace('\r', ' ') text = text.replace('\r', ' ')
return text return text
def specified_newlines(self, text): def specified_newlines(self, text):
if self.newline == '\n': if self.newline == '\n':
return text return text
return text.replace('\n', self.newline) return text.replace('\n', self.newline)
class TxtNewlines(object): class TxtNewlines(object):
@ -138,7 +134,7 @@ class TxtNewlines(object):
'old_mac' : '\r', 'old_mac' : '\r',
'windows' : '\r\n' 'windows' : '\r\n'
} }
def __init__(self, newline_type): def __init__(self, newline_type):
self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)