Move from Beautiful Soup to lxml in txt and fb2 output.

2025-07-09 03:04:10 -04:00 · 2009-06-06 18:20:06 -04:00 · 2009-06-06 18:20:06 -04:00 · 7b121a42e3
commit 7b121a42e3
parent 7b591f3d72
3 changed files with 57 additions and 62 deletions
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -12,14 +12,14 @@ import os
 import re
 from base64 import b64encode
 from lxml import etree
 from calibre import entity_to_unicode
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks.oeb.base import OEB_IMAGES
 from calibre.constants import __appname__, __version__
 from BeautifulSoup import BeautifulSoup
 TAG_MAP = {
    'b' : 'strong',
    'i' : 'emphasis',
@@ -57,11 +57,10 @@ class FB2MLizer(object):
        output += self.fb2mlize_images()
        output += self.fb2_footer()
        output = self.clean_text(output)
-        return BeautifulSoup(output.encode('utf-8')).prettify()
+        return etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
    def fb2_header(self):
-        return u'<?xml version="1.0" encoding="utf-8"?> ' \
+        return u'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
        '<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
        'xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"> ' \
        '<description><title-info><book-title>%s</book-title> ' \
        '</title-info><document-info> ' \
--- a/src/calibre/ebooks/fb2/output.py
+++ b/src/calibre/ebooks/fb2/output.py
@@ -30,7 +30,7 @@ class FB2Output(OutputFormatPlugin):
        out_stream.seek(0)
        out_stream.truncate()
-        out_stream.write(fb2_content)
+        out_stream.write(fb2_content.encode('utf-8'))
        if close:
            out_stream.close()
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -11,9 +11,10 @@ Write content to TXT.
 import os
 import re
-from calibre import entity_to_unicode
+from lxml import etree
-from BeautifulSoup import BeautifulSoup
+from calibre import entity_to_unicode
 from calibre.ebooks.oeb.base import XHTML
 class TxtWriter(object):
    def __init__(self, newline, log):
@@ -23,10 +24,8 @@ class TxtWriter(object):
    def dump(self, spine):
        out = u''
        for item in spine:
-            content = unicode(item)
+            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
-            # Convert newlines to unix style \n for processing. These
+            content = self.remove_newlines(content)
            # will be changed to the specified type later in the process.
            content = self.unix_newlines(content)
            content = self.strip_html(content)
            content = self.replace_html_symbols(content)
            content = self.cleanup_text(content)
@@ -40,95 +39,92 @@ class TxtWriter(object):
        return out
-    def strip_html(self, html):
+    def strip_html(self, text):
        stripped = u''
        for dom_tree in BeautifulSoup(html).findAll('body'):
            text = unicode(dom_tree)
            # Remove unnecessary tags
            for tag in ['script', 'style']:
                text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
            text = re.sub('<!--.*-->', '', text)
            text = re.sub('<\?.*?\?>', '', text)
            text = re.sub('<@.*?@>', '', text)
            text = re.sub('<%.*?%>', '', text)
-            # Headings usually indicate Chapters.
+        # Remove unnecessary tags
-            # We are going to use a marker to insert the proper number of
+        for tag in ['script', 'style']:
-            # newline characters at the end of cleanup_text because cleanup_text
+            text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
-            # remove excessive (more than 2 newlines).
+        text = re.sub('<!--.*-->', '', text)
-            for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+        text = re.sub('<\?.*?\?>', '', text)
-                text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
+        text = re.sub('<@.*?@>', '', text)
-                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)
+        text = re.sub('<%.*?%>', '', text)
        # Headings usually indicate Chapters.
        # We are going to use a marker to insert the proper number of
        # newline characters at the end of cleanup_text because cleanup_text
        # remove excessive (more than 2 newlines).
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)
        # Separate content with space.
        for tag in ['td']:
            text = re.sub('(?imu)</[ ]*%s[ ]*>', ' ', text)
        # Separate content with empty line.
        for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
        for tag in ['hr', 'br']:
            text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
        # Remove any tags that do not need special processing.
        text = re.sub('<.*?>', '', text)
        stripped = stripped + text
            # Separate content with space.
            for tag in ['td']:
                text = re.sub('(?imu)</[ ]*%s[ ]*>', ' ', text)
            # Separate content with empty line.
            for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
            for tag in ['hr', 'br']:
                text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
            # Remove any tags that do not need special processing.
            text = re.sub('<.*?>', '', text)
            stripped = stripped + text
        return stripped
-        
+
    def replace_html_symbols(self, content):
        for entity in set(re.findall('&.+?;', content)):
            mo = re.search('(%s)' % entity[1:-1], content)
            content = content.replace(entity, entity_to_unicode(mo))
        return content
-        
+
    def cleanup_text(self, text):
        # Replace bad characters.
        text = text.replace(u'\xc2', '')
        text = text.replace(u'\xa0', ' ')
-    
+
        # Replace tabs, vertical tags and form feeds with single space.
        text = text.replace('\t+', ' ')
        text = text.replace('\v+', ' ')
        text = text.replace('\f+', ' ')
-    
+
        # Single line paragraph.
        text = re.sub('(?<=.)\n(?=.)', ' ', text)
-        
+
        # Remove multiple spaces.
        text = re.sub('[  ]+', ' ', text)
-        
+
        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        text = re.sub('\n{3,}', '\n\n', text)
-        
+
        # Replace markers with the proper characters.
        text = text.replace('-vzxedxy-', '\n\n\n\n\n')
        text = text.replace('-vlgzxey-', '\n\n\n')
-        
+
        # Replace spaces at the beginning and end of lines
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)
-        
+
        return text
-    def unix_newlines(self, text):
+    def remove_newlines(self, text):
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
-        
+
        return text
-        
+
    def specified_newlines(self, text):
        if self.newline == '\n':
            return text
-        
+
-        return text.replace('\n', self.newline)        
+        return text.replace('\n', self.newline)
 class TxtNewlines(object):
@@ -138,7 +134,7 @@ class TxtNewlines(object):
                        'old_mac' : '\r',
                        'windows' : '\r\n'
                     }
-                     
+
    def __init__(self, newline_type):
        self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)