Pull from driver-dev

2025-08-30 23:00:21 -04:00 · 2009-06-06 16:41:45 -07:00 · 2009-06-06 16:41:45 -07:00 · 51bbd45551
commit 51bbd45551
parent 8e5cb5e1cd ee29a571f4
3 changed files with 60 additions and 66 deletions
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -12,19 +12,19 @@ import os
 import re
 from base64 import b64encode

+from lxml import etree
+
 from calibre import entity_to_unicode
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks.oeb.base import OEB_IMAGES
 from calibre.constants import __appname__, __version__

-from BeautifulSoup import BeautifulSoup
-
 TAG_MAP = {
    'b' : 'strong',
    'i' : 'emphasis',
    'p' : 'p',
-    'div' : 'p',
+    'li' : 'p'
 }

 STYLES = [
@@ -57,11 +57,10 @@ class FB2MLizer(object):
        output += self.fb2mlize_images()
        output += self.fb2_footer()
        output = self.clean_text(output)
-        return BeautifulSoup(output.encode('utf-8')).prettify()
+        return etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)

    def fb2_header(self):
-        return u'<?xml version="1.0" encoding="utf-8"?> ' \
-        '<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
+        return u'<FictionBook xmlns:xlink="http://www.w3.org/1999/xlink" ' \
        'xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"> ' \
        '<description><title-info><book-title>%s</book-title> ' \
        '</title-info><document-info> ' \
@@ -110,12 +109,13 @@ class FB2MLizer(object):
            fb2_text += '<image xlink:herf="#%s" />' % os.path.basename(elem.attrib['src'])
        

-        fb2_tag = TAG_MAP.get(tag, 'p')
+        fb2_tag = TAG_MAP.get(tag, None)
        if fb2_tag and fb2_tag not in tag_stack:
            tag_count += 1
            fb2_text += '<%s>' % fb2_tag
            tag_stack.append(fb2_tag)

+
        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
@@ -133,7 +133,6 @@ class FB2MLizer(object):
        close_tag_list = []
        for i in range(0, tag_count):
            close_tag_list.insert(0, tag_stack.pop())
-            
        fb2_text += self.close_tags(close_tag_list)

        if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '':
@@ -151,4 +150,3 @@ class FB2MLizer(object):
            fb2_text += '</%s>' % fb2_tag

        return fb2_text
-
--- a/src/calibre/ebooks/fb2/output.py
+++ b/src/calibre/ebooks/fb2/output.py
@@ -30,7 +30,7 @@ class FB2Output(OutputFormatPlugin):
        
        out_stream.seek(0)
        out_stream.truncate()
-        out_stream.write(fb2_content)
+        out_stream.write(fb2_content.encode('utf-8'))
        
        if close:
            out_stream.close()
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -11,9 +11,10 @@ Write content to TXT.
 import os
 import re

-from calibre import entity_to_unicode
+from lxml import etree

-from BeautifulSoup import BeautifulSoup
+from calibre import entity_to_unicode
+from calibre.ebooks.oeb.base import XHTML

 class TxtWriter(object):
    def __init__(self, newline, log):
@@ -23,10 +24,8 @@ class TxtWriter(object):
    def dump(self, spine):
        out = u''
        for item in spine:
-            content = unicode(item)
-            # Convert newlines to unix style \n for processing. These
-            # will be changed to the specified type later in the process.
-            content = self.unix_newlines(content)
+            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
+            content = self.remove_newlines(content)
            content = self.strip_html(content)
            content = self.replace_html_symbols(content)
            content = self.cleanup_text(content)
@@ -40,12 +39,9 @@ class TxtWriter(object):

        return out

-    def strip_html(self, html):
+    def strip_html(self, text):
        stripped = u''

-        for dom_tree in BeautifulSoup(html).findAll('body'):
-            text = unicode(dom_tree)
-            
        # Remove unnecessary tags
        for tag in ['script', 'style']:
            text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
@@ -117,7 +113,7 @@ class TxtWriter(object):

        return text

-    def unix_newlines(self, text):
+    def remove_newlines(self, text):
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')