FB2 Output: Speed up conversion of images and handle external links

Merge branch 'fb2-fixes' of https://github.com/PalmtopTiger/calibre
2025-07-09 03:04:10 -04:00 · 2019-10-17 16:17:57 +05:30 · 2019-10-17 16:17:57 +05:30 · 15d9fc23f2
commit 15d9fc23f2
parent bf3236db9b 3e680ac307
1 changed files with 106 additions and 90 deletions
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@ -19,14 +19,15 @@ from calibre.constants import __appname__, __version__
 from calibre.utils.localization import lang_as_iso639_1
 from calibre.utils.img import save_cover_data_to
 from calibre.ebooks.oeb.base import urlnormalize
-from polyglot.builtins import unicode_type, string_or_bytes
+from polyglot.builtins import unicode_type, string_or_bytes, range, filter
 from polyglot.binary import as_base64_unicode
 from polyglot.urllib import urlparse
 class FB2MLizer(object):
    '''
    Todo: * Include more FB2 specific tags in the conversion.
-          * Handle a tags.
+          * Handle notes and anchor links.
    '''
    def __init__(self, log):
@ -59,43 +60,53 @@ class FB2MLizer(object):
        return self.fb2mlize_spine()
    def fb2mlize_spine(self):
-        output = [self.fb2_header()]
+        output = (
-        output.append(self.get_text())
+            self.fb2_header(),
-        output.append(self.fb2mlize_images())
+            self.get_text(),
-        output.append(self.fb2_footer())
+            self.fb2mlize_images(),
-        output = self.clean_text(''.join(output))
+            self.fb2_footer(),
        )
        output = self.clean_text('\n'.join(output))
        if self.opts.pretty_print:
-            return '<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True)
+            output = etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True)
-        else:
+
-            return '<?xml version="1.0" encoding="UTF-8"?>' + output
+        return '<?xml version="1.0" encoding="UTF-8"?>\n' + output
    def clean_text(self, text):
        # Remove pointless tags, but keep their contents.
        text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)</\1>', r'\2', text)
        # Clean up paragraphs endings.
        text = re.sub(r'(?mu)\s+</p>', '</p>', text)
        # Condense empty paragraphs into a line break.
-        text = re.sub(r'(?miu)(<p>\s*</p>\s*){3,}', '<empty-line />', text)
+        text = re.sub(r'(?mu)(?:<p></p>\s*){3,}', '<empty-line/>', text)
        # Remove empty paragraphs.
-        text = re.sub(r'(?miu)<p>\s*</p>', '', text)
+        text = re.sub(r'(?mu)<p></p>\s*', '', text)
-        # Clean up pargraph endings.
+        # Put the paragraph following a paragraph on a separate line.
-        text = re.sub(r'(?miu)\s*</p>', '</p>', text)
+        text = re.sub(r'(?mu)</p>\s*<p>', '</p>\n<p>', text)
        # Put paragraphs following a paragraph on a separate line.
        text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)
        # Remove empty title elements.
        text = re.sub(r'(?miu)<title>\s*</title>', '', text)
        text = re.sub(r'(?miu)\s+</title>', '</title>', text)
        # Remove empty sections.
        text = re.sub(r'(?miu)<section>\s*</section>', '', text)
        # Clean up sections start and ends.
        text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
        text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
        text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
        text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
        # Put sectnions followed by sections on a separate line.
        text = re.sub(r'(?miu)</section>\s*<section>', '</section>\n\n<section>', text)
        if self.opts.insert_blank_line:
-            text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)
+            text = re.sub(r'(?mu)</p>', '</p><empty-line/>', text)
        # Clean up title endings.
        text = re.sub(r'(?mu)\s+</title>', '</title>', text)
        # Remove empty title elements.
        text = re.sub(r'(?mu)<title></title>\s*', '', text)
        # Put the paragraph following a title on a separate line.
        text = re.sub(r'(?mu)</title>\s*<p>', '</title>\n<p>', text)
        # Put line breaks between paragraphs on a separate line.
        text = re.sub(r'(?mu)</(p|title)>\s*<empty-line/>', r'</\1>\n<empty-line/>', text)
        text = re.sub(r'(?mu)<empty-line/>\s*<p>', '<empty-line/>\n<p>', text)
        # Remove empty sections.
        text = re.sub(r'(?mu)<section>\s*</section>', '', text)
        # Clean up sections starts and ends.
        text = re.sub(r'(?mu)\s*<section>', '\n<section>', text)
        text = re.sub(r'(?mu)<section>\s*', '<section>\n', text)
        text = re.sub(r'(?mu)\s*</section>', '\n</section>', text)
        text = re.sub(r'(?mu)</section>\s*', '</section>\n', text)
        return text
@ -152,7 +163,7 @@ class FB2MLizer(object):
            index = '1'
            if self.oeb_book.metadata.series_index:
                index = self.oeb_book.metadata.series_index[0]
-            metadata['sequence'] = '<sequence name="%s" number="%s" />' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index)
+            metadata['sequence'] = '<sequence name="%s" number="%s"/>' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index)
        year = publisher = isbn = ''
        identifiers = self.oeb_book.metadata['identifier']
@ -193,37 +204,41 @@ class FB2MLizer(object):
            metadata['comments'] = ''
        else:
            from calibre.utils.html2text import html2text
-            metadata['comments'] = '<annotation>{}</annotation>'.format(prepare_string_for_xml(html2text(comments.value.strip())))
+            metadata['comments'] = '<annotation><p>{}</p></annotation>'.format(prepare_string_for_xml(html2text(comments.value).strip()))
-        return textwrap.dedent('''
+        # Keep the indentation level of the description the same as the body.
-            <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
+        header = textwrap.dedent('''\
-                <description>
+            <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
-                    <title-info>
+            <description>
-                        <genre>%(genre)s</genre>
+                <title-info>
-                            %(author)s
+                    <genre>%(genre)s</genre>
-                        <book-title>%(title)s</book-title>
+                    %(author)s
-                        %(cover)s
+                    <book-title>%(title)s</book-title>
-                        <lang>%(lang)s</lang>
+                    %(cover)s
-                        %(keywords)s
+                    <lang>%(lang)s</lang>
-                        %(sequence)s
+                    %(keywords)s
-                        %(comments)s
+                    %(sequence)s
-                    </title-info>
+                    %(comments)s
-                    <document-info>
+                </title-info>
-                        %(author)s
+                <document-info>
-                        <program-used>%(appname)s %(version)s</program-used>
+                    %(author)s
-                        <date>%(date)s</date>
+                    <program-used>%(appname)s %(version)s</program-used>
-                        <id>%(id)s</id>
+                    <date>%(date)s</date>
-                        <version>1.0</version>
+                    <id>%(id)s</id>
-                    </document-info>
+                    <version>1.0</version>
-                    <publish-info>
+                </document-info>
-                        %(publisher)s
+                <publish-info>
-                        %(year)s
+                    %(publisher)s
-                        %(isbn)s
+                    %(year)s
-                    </publish-info>
+                    %(isbn)s
-                </description>\n''') % metadata
+                </publish-info>
            </description>''') % metadata
        # Remove empty lines.
        return '\n'.join(filter(unicode_type.strip, header.splitlines()))
    def fb2_footer(self):
-        return '\n</FictionBook>'
+        return '</FictionBook>'
    def get_cover(self):
        from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
@ -253,10 +268,9 @@ class FB2MLizer(object):
        if cover_href:
            # Only write the image tag if it is in the manifest.
-            if cover_href in self.oeb_book.manifest.hrefs.keys():
+            if cover_href in self.oeb_book.manifest.hrefs and cover_href not in self.image_hrefs:
-                if cover_href not in self.image_hrefs.keys():
+                self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs)
-                    self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys())
+            return '<coverpage><image l:href="#%s"/></coverpage>' % self.image_hrefs[cover_href]
            return '<coverpage><image xlink:href="#%s" /></coverpage>' % self.image_hrefs[cover_href]
        return ''
@ -292,7 +306,8 @@ class FB2MLizer(object):
            text.append('</section>')
            self.section_level -= 1
-        return ''.join(text) + '</body>'
+        text.append('</body>')
        return ''.join(text)
    def fb2mlize_images(self):
        '''
@ -315,19 +330,13 @@ class FB2MLizer(object):
                        raw_data = as_base64_unicode(item.data)
                        content_type = item.media_type
                    # Don't put the encoded image on a single line.
-                    data = ''
+                    step = 72
-                    col = 1
+                    data = '\n'.join(raw_data[i:i+step] for i in range(0, len(raw_data), step))
-                    for char in raw_data:
+                    images.append('<binary id="%s" content-type="%s">%s</binary>' % (self.image_hrefs[item.href], content_type, data))
                        if col == 72:
                            data += '\n'
                            col = 1
                        col += 1
                        data += char
                    images.append('<binary id="%s" content-type="%s">%s\n</binary>' % (self.image_hrefs[item.href], content_type, data))
                except Exception as e:
                    self.log.error('Error: Could not include file %s because '
                        '%s.' % (item.href, e))
-        return ''.join(images)
+        return '\n'.join(images)
    def create_flat_toc(self, nodes, level):
        for item in nodes:
@ -462,19 +471,18 @@ class FB2MLizer(object):
        # Process the XHTML tag and styles. Converted to an FB2 tag.
        # Use individual if statement not if else. There can be
        # only one XHTML tag but it can have multiple styles.
-        if tag == 'img':
+        if tag == 'img' and elem_tree.attrib.get('src', None):
-            if elem_tree.attrib.get('src', None):
+            # Only write the image tag if it is in the manifest.
-                # Only write the image tag if it is in the manifest.
+            ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
-                ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
+            if ihref in self.oeb_book.manifest.hrefs:
-                if ihref in self.oeb_book.manifest.hrefs:
+                if ihref not in self.image_hrefs:
-                    if ihref not in self.image_hrefs:
+                    self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs)
-                        self.image_hrefs[ihref] = '_%s.jpg' % len(self.image_hrefs)
+                p_txt, p_tag = self.ensure_p()
-                    p_txt, p_tag = self.ensure_p()
+                fb2_out += p_txt
-                    fb2_out += p_txt
+                tags += p_tag
-                    tags += p_tag
+                fb2_out.append('<image l:href="#%s"/>' % self.image_hrefs[ihref])
-                    fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[ihref])
+            else:
-                else:
+                self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
                    self.log.warn(u'Ignoring image not in manifest: %s'%ihref)
        if tag in ('br', 'hr') or ems >= 1:
            if ems < 1:
                multiplier = 1
@ -489,17 +497,25 @@ class FB2MLizer(object):
                    closed_tags.append(t)
                    if t == 'p':
                        break
-                fb2_out.append('<empty-line />' * multiplier)
+                fb2_out.append('<empty-line/>' * multiplier)
                closed_tags.reverse()
                for t in closed_tags:
                    fb2_out.append('<%s>' % t)
            else:
-                fb2_out.append('<empty-line />' * multiplier)
+                fb2_out.append('<empty-line/>' * multiplier)
        if tag in ('div', 'li', 'p'):
            p_text, added_p = self.close_open_p(tag_stack+tags)
            fb2_out += p_text
            if added_p:
                tags.append('p')
        if tag == 'a' and elem_tree.attrib.get('href', None):
            # Handle only external links for now
            if urlparse(elem_tree.attrib['href']).netloc:
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<a l:href="%s">' % urlnormalize(elem_tree.attrib['href']))
                tags.append('a')
        if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
            s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
            fb2_out += s_out