From 7b121a42e33ed870b6848022cdb2bd68ae5773cf Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 6 Jun 2009 18:20:06 -0400
Subject: [PATCH] Move from Beautiful Soup to lxml in txt and fb2 output.

---
 src/calibre/ebooks/fb2/fb2ml.py  |   9 ++-
 src/calibre/ebooks/fb2/output.py |   2 +-
 src/calibre/ebooks/txt/writer.py | 108 +++++++++++++++----------------
 3 files changed, 57 insertions(+), 62 deletions(-)
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index 3a5806b143..cf668c1879 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -12,14 +12,14 @@ import os
 import re
 from base64 import b64encode
 
+from lxml import etree
+
 from calibre import entity_to_unicode
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks.oeb.base import OEB_IMAGES
 from calibre.constants import __appname__, __version__
 
-from BeautifulSoup import BeautifulSoup
-
 TAG_MAP = {
     'b' : 'strong',
     'i' : 'emphasis',
@@ -57,11 +57,10 @@ class FB2MLizer(object):
         output += self.fb2mlize_images()
         output += self.fb2_footer()
         output = self.clean_text(output)
-        return BeautifulSoup(output.encode('utf-8')).prettify()
+        return etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
 
     def fb2_header(self):
-        return u'<?xml version="1.0" encoding="utf-8"?> ' \
-        '<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
+        return u'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
         'xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"> ' \
         '<description><title-info><book-title>%s</book-title> ' \
         '</title-info><document-info> ' \
diff --git a/src/calibre/ebooks/fb2/output.py b/src/calibre/ebooks/fb2/output.py
index 67ee9f468e..3b9a5a245f 100644
--- a/src/calibre/ebooks/fb2/output.py
+++ b/src/calibre/ebooks/fb2/output.py
@@ -30,7 +30,7 @@ class FB2Output(OutputFormatPlugin):
         
         out_stream.seek(0)
         out_stream.truncate()
-        out_stream.write(fb2_content)
+        out_stream.write(fb2_content.encode('utf-8'))
         
         if close:
             out_stream.close()
diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py
index b31f325c3b..46f095d256 100644
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -11,9 +11,10 @@ Write content to TXT.
 import os
 import re
 
-from calibre import entity_to_unicode
+from lxml import etree
 
-from BeautifulSoup import BeautifulSoup
+from calibre import entity_to_unicode
+from calibre.ebooks.oeb.base import XHTML
 
 class TxtWriter(object):
     def __init__(self, newline, log):
@@ -23,10 +24,8 @@ class TxtWriter(object):
     def dump(self, spine):
         out = u''
         for item in spine:
-            content = unicode(item)
-            # Convert newlines to unix style \n for processing. These
-            # will be changed to the specified type later in the process.
-            content = self.unix_newlines(content)
+            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
+            content = self.remove_newlines(content)
             content = self.strip_html(content)
             content = self.replace_html_symbols(content)
             content = self.cleanup_text(content)
@@ -40,95 +39,92 @@ class TxtWriter(object):
 
         return out
 
-    def strip_html(self, html):
+    def strip_html(self, text):
         stripped = u''
-        
-        for dom_tree in BeautifulSoup(html).findAll('body'):
-            text = unicode(dom_tree)
-            
-            # Remove unnecessary tags
-            for tag in ['script', 'style']:
-                text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
-            text = re.sub('<!--.*-->', '', text)
-            text = re.sub('<\?.*?\?>', '', text)
-            text = re.sub('<@.*?@>', '', text)
-            text = re.sub('<%.*?%>', '', text)
 
-            # Headings usually indicate Chapters.
-            # We are going to use a marker to insert the proper number of
-            # newline characters at the end of cleanup_text because cleanup_text
-            # remove excessive (more than 2 newlines).
-            for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-                text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
-                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)
+        # Remove unnecessary tags; non-greedy because the input is a single line.
+        for tag in ['script', 'style']:
+            text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*?)</[ ]*%s[ ]*>' % (tag, tag), '', text)
+        text = re.sub('<!--.*?-->', '', text)
+        text = re.sub('<\?.*?\?>', '', text)
+        text = re.sub('<@.*?@>', '', text)
+        text = re.sub('<%.*?%>', '', text)
+
+        # Headings usually indicate chapters.
+        # Wrap them in marker strings here; cleanup_text swaps the markers
+        # for the intended run of newlines only after it has collapsed
+        # excessive newlines (3 or more) down to two.
+        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
+            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)
+
+        # Separate the content of adjacent table cells with a space.
+        for tag in ['td']:
+            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, ' ', text)
+
+        # Separate content with empty line.
+        for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
+            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
+
+        for tag in ['hr', 'br']:
+            text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
+
+        # Remove any tags that do not need special processing.
+        text = re.sub('<.*?>', '', text)
+
+        stripped = stripped + text
 
-            # Separate content with space.
-            for tag in ['td']:
-                text = re.sub('(?imu)</[ ]*%s[ ]*>', ' ', text)
-            
-            # Separate content with empty line.
-            for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
-                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
-            
-            for tag in ['hr', 'br']:
-                text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
-            
-            # Remove any tags that do not need special processing.
-            text = re.sub('<.*?>', '', text)
-            
-            stripped = stripped + text
-            
         return stripped
-        
+
     def replace_html_symbols(self, content):
         for entity in set(re.findall('&.+?;', content)):
             mo = re.search('(%s)' % entity[1:-1], content)
             content = content.replace(entity, entity_to_unicode(mo))
 
         return content
-        
+
     def cleanup_text(self, text):
         # Replace bad characters.
         text = text.replace(u'\xc2', '')
         text = text.replace(u'\xa0', ' ')
-    
+
         # Replace tabs, vertical tags and form feeds with single space.
         text = text.replace('\t+', ' ')
         text = text.replace('\v+', ' ')
         text = text.replace('\f+', ' ')
-    
+
         # Single line paragraph.
         text = re.sub('(?<=.)\n(?=.)', ' ', text)
-        
+
         # Remove multiple spaces.
         text = re.sub('[  ]+', ' ', text)
-        
+
         # Remove excessive newlines.
         text = re.sub('\n[ ]+\n', '\n\n', text)
         text = re.sub('\n{3,}', '\n\n', text)
-        
+
         # Replace markers with the proper characters.
         text = text.replace('-vzxedxy-', '\n\n\n\n\n')
         text = text.replace('-vlgzxey-', '\n\n\n')
-        
+
         # Replace spaces at the beginning and end of lines
         text = re.sub('(?imu)^[ ]+', '', text)
         text = re.sub('(?imu)[ ]+$', '', text)
-        
+
         return text
 
-    def unix_newlines(self, text):
+    def remove_newlines(self, text):
         text = text.replace('\r\n', ' ')
         text = text.replace('\n', ' ')
         text = text.replace('\r', ' ')
-        
+
         return text
-        
+
     def specified_newlines(self, text):
         if self.newline == '\n':
             return text
-        
-        return text.replace('\n', self.newline)        
+
+        return text.replace('\n', self.newline)
 
 
 class TxtNewlines(object):
@@ -138,7 +134,7 @@ class TxtNewlines(object):
                         'old_mac' : '\r',
                         'windows' : '\r\n'
                      }
-                     
+
     def __init__(self, newline_type):
         self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)