From 7b121a42e33ed870b6848022cdb2bd68ae5773cf Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 6 Jun 2009 18:20:06 -0400 Subject: [PATCH] Move from Beautiful Soup to lxml in txt and fb2 output. --- src/calibre/ebooks/fb2/fb2ml.py | 9 ++- src/calibre/ebooks/fb2/output.py | 2 +- src/calibre/ebooks/txt/writer.py | 108 +++++++++++++++---------------- 3 files changed, 57 insertions(+), 62 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 3a5806b143..cf668c1879 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -12,14 +12,14 @@ import os import re from base64 import b64encode +from lxml import etree + from calibre import entity_to_unicode from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.constants import __appname__, __version__ -from BeautifulSoup import BeautifulSoup - TAG_MAP = { 'b' : 'strong', 'i' : 'emphasis', @@ -57,11 +57,10 @@ class FB2MLizer(object): output += self.fb2mlize_images() output += self.fb2_footer() output = self.clean_text(output) - return BeautifulSoup(output.encode('utf-8')).prettify() + return etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True) def fb2_header(self): - return u' ' \ - ' ' \ '%s ' \ ' ' \ diff --git a/src/calibre/ebooks/fb2/output.py b/src/calibre/ebooks/fb2/output.py index 67ee9f468e..3b9a5a245f 100644 --- a/src/calibre/ebooks/fb2/output.py +++ b/src/calibre/ebooks/fb2/output.py @@ -30,7 +30,7 @@ class FB2Output(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - out_stream.write(fb2_content) + out_stream.write(fb2_content.encode('utf-8')) if close: out_stream.close() diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index b31f325c3b..46f095d256 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -11,9 +11,10 @@ Write content to TXT. import os import re -from calibre import entity_to_unicode +from lxml import etree -from BeautifulSoup import BeautifulSoup +from calibre import entity_to_unicode +from calibre.ebooks.oeb.base import XHTML class TxtWriter(object): def __init__(self, newline, log): @@ -23,10 +24,8 @@ class TxtWriter(object): def dump(self, spine): out = u'' for item in spine: - content = unicode(item) - # Convert newlines to unix style \n for processing. These - # will be changed to the specified type later in the process. - content = self.unix_newlines(content) + content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + content = self.remove_newlines(content) content = self.strip_html(content) content = self.replace_html_symbols(content) content = self.cleanup_text(content) @@ -40,95 +39,92 @@ class TxtWriter(object): return out - def strip_html(self, html): + def strip_html(self, text): stripped = u'' - - for dom_tree in BeautifulSoup(html).findAll('body'): - text = unicode(dom_tree) - - # Remove unnecessary tags - for tag in ['script', 'style']: - text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)' % (tag, tag), '', text) - text = re.sub('', '', text) - text = re.sub('<\?.*?\?>', '', text) - text = re.sub('<@.*?@>', '', text) - text = re.sub('<%.*?%>', '', text) - # Headings usually indicate Chapters. - # We are going to use a marker to insert the proper number of - # newline characters at the end of cleanup_text because cleanup_text - # remove excessive (more than 2 newlines). - for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: - text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text) - text = re.sub('(?imu)' % tag, '-vlgzxey-', text) + # Remove unnecessary tags + for tag in ['script', 'style']: + text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)' % (tag, tag), '', text) + text = re.sub('', '', text) + text = re.sub('<\?.*?\?>', '', text) + text = re.sub('<@.*?@>', '', text) + text = re.sub('<%.*?%>', '', text) + + # Headings usually indicate Chapters. + # We are going to use a marker to insert the proper number of + # newline characters at the end of cleanup_text because cleanup_text + # remove excessive (more than 2 newlines). + for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text) + text = re.sub('(?imu)' % tag, '-vlgzxey-', text) + + # Separate content with space. + for tag in ['td']: + text = re.sub('(?imu)', ' ', text) + + # Separate content with empty line. + for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']: + text = re.sub('(?imu)' % tag, '\n\n', text) + + for tag in ['hr', 'br']: + text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text) + + # Remove any tags that do not need special processing. + text = re.sub('<.*?>', '', text) + + stripped = stripped + text - # Separate content with space. - for tag in ['td']: - text = re.sub('(?imu)', ' ', text) - - # Separate content with empty line. - for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']: - text = re.sub('(?imu)' % tag, '\n\n', text) - - for tag in ['hr', 'br']: - text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text) - - # Remove any tags that do not need special processing. - text = re.sub('<.*?>', '', text) - - stripped = stripped + text - return stripped - + def replace_html_symbols(self, content): for entity in set(re.findall('&.+?;', content)): mo = re.search('(%s)' % entity[1:-1], content) content = content.replace(entity, entity_to_unicode(mo)) return content - + def cleanup_text(self, text): # Replace bad characters. text = text.replace(u'\xc2', '') text = text.replace(u'\xa0', ' ') - + # Replace tabs, vertical tags and form feeds with single space. text = text.replace('\t+', ' ') text = text.replace('\v+', ' ') text = text.replace('\f+', ' ') - + # Single line paragraph. text = re.sub('(?<=.)\n(?=.)', ' ', text) - + # Remove multiple spaces. text = re.sub('[ ]+', ' ', text) - + # Remove excessive newlines. text = re.sub('\n[ ]+\n', '\n\n', text) text = re.sub('\n{3,}', '\n\n', text) - + # Replace markers with the proper characters. text = text.replace('-vzxedxy-', '\n\n\n\n\n') text = text.replace('-vlgzxey-', '\n\n\n') - + # Replace spaces at the beginning and end of lines text = re.sub('(?imu)^[ ]+', '', text) text = re.sub('(?imu)[ ]+$', '', text) - + return text - def unix_newlines(self, text): + def remove_newlines(self, text): text = text.replace('\r\n', ' ') text = text.replace('\n', ' ') text = text.replace('\r', ' ') - + return text - + def specified_newlines(self, text): if self.newline == '\n': return text - - return text.replace('\n', self.newline) + + return text.replace('\n', self.newline) class TxtNewlines(object): @@ -138,7 +134,7 @@ class TxtNewlines(object): 'old_mac' : '\r', 'windows' : '\r\n' } - + def __init__(self, newline_type): self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)