mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
FB2 Output: Speed up conversion of images and handle external links
Merge branch 'fb2-fixes' of https://github.com/PalmtopTiger/calibre
This commit is contained in:
commit
15d9fc23f2
@ -19,14 +19,15 @@ from calibre.constants import __appname__, __version__
|
|||||||
from calibre.utils.localization import lang_as_iso639_1
|
from calibre.utils.localization import lang_as_iso639_1
|
||||||
from calibre.utils.img import save_cover_data_to
|
from calibre.utils.img import save_cover_data_to
|
||||||
from calibre.ebooks.oeb.base import urlnormalize
|
from calibre.ebooks.oeb.base import urlnormalize
|
||||||
from polyglot.builtins import unicode_type, string_or_bytes
|
from polyglot.builtins import unicode_type, string_or_bytes, range, filter
|
||||||
from polyglot.binary import as_base64_unicode
|
from polyglot.binary import as_base64_unicode
|
||||||
|
from polyglot.urllib import urlparse
|
||||||
|
|
||||||
|
|
||||||
class FB2MLizer(object):
|
class FB2MLizer(object):
|
||||||
'''
|
'''
|
||||||
Todo: * Include more FB2 specific tags in the conversion.
|
Todo: * Include more FB2 specific tags in the conversion.
|
||||||
* Handle a tags.
|
* Handle notes and anchor links.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def __init__(self, log):
|
def __init__(self, log):
|
||||||
@ -59,43 +60,53 @@ class FB2MLizer(object):
|
|||||||
return self.fb2mlize_spine()
|
return self.fb2mlize_spine()
|
||||||
|
|
||||||
def fb2mlize_spine(self):
|
def fb2mlize_spine(self):
|
||||||
output = [self.fb2_header()]
|
output = (
|
||||||
output.append(self.get_text())
|
self.fb2_header(),
|
||||||
output.append(self.fb2mlize_images())
|
self.get_text(),
|
||||||
output.append(self.fb2_footer())
|
self.fb2mlize_images(),
|
||||||
output = self.clean_text(''.join(output))
|
self.fb2_footer(),
|
||||||
|
)
|
||||||
|
output = self.clean_text('\n'.join(output))
|
||||||
|
|
||||||
if self.opts.pretty_print:
|
if self.opts.pretty_print:
|
||||||
return '<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True)
|
output = etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True)
|
||||||
else:
|
|
||||||
return '<?xml version="1.0" encoding="UTF-8"?>' + output
|
return '<?xml version="1.0" encoding="UTF-8"?>\n' + output
|
||||||
|
|
||||||
def clean_text(self, text):
|
def clean_text(self, text):
|
||||||
|
# Remove pointless tags, but keep their contents.
|
||||||
|
text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)</\1>', r'\2', text)
|
||||||
|
|
||||||
|
# Clean up paragraphs endings.
|
||||||
|
text = re.sub(r'(?mu)\s+</p>', '</p>', text)
|
||||||
# Condense empty paragraphs into a line break.
|
# Condense empty paragraphs into a line break.
|
||||||
text = re.sub(r'(?miu)(<p>\s*</p>\s*){3,}', '<empty-line />', text)
|
text = re.sub(r'(?mu)(?:<p></p>\s*){3,}', '<empty-line/>', text)
|
||||||
# Remove empty paragraphs.
|
# Remove empty paragraphs.
|
||||||
text = re.sub(r'(?miu)<p>\s*</p>', '', text)
|
text = re.sub(r'(?mu)<p></p>\s*', '', text)
|
||||||
# Clean up pargraph endings.
|
# Put the paragraph following a paragraph on a separate line.
|
||||||
text = re.sub(r'(?miu)\s*</p>', '</p>', text)
|
text = re.sub(r'(?mu)</p>\s*<p>', '</p>\n<p>', text)
|
||||||
# Put paragraphs following a paragraph on a separate line.
|
|
||||||
text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)
|
|
||||||
|
|
||||||
# Remove empty title elements.
|
|
||||||
text = re.sub(r'(?miu)<title>\s*</title>', '', text)
|
|
||||||
text = re.sub(r'(?miu)\s+</title>', '</title>', text)
|
|
||||||
|
|
||||||
# Remove empty sections.
|
|
||||||
text = re.sub(r'(?miu)<section>\s*</section>', '', text)
|
|
||||||
# Clean up sections start and ends.
|
|
||||||
text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
|
|
||||||
text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
|
|
||||||
text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
|
|
||||||
text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
|
|
||||||
# Put sectnions followed by sections on a separate line.
|
|
||||||
text = re.sub(r'(?miu)</section>\s*<section>', '</section>\n\n<section>', text)
|
|
||||||
|
|
||||||
if self.opts.insert_blank_line:
|
if self.opts.insert_blank_line:
|
||||||
text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)
|
text = re.sub(r'(?mu)</p>', '</p><empty-line/>', text)
|
||||||
|
|
||||||
|
# Clean up title endings.
|
||||||
|
text = re.sub(r'(?mu)\s+</title>', '</title>', text)
|
||||||
|
# Remove empty title elements.
|
||||||
|
text = re.sub(r'(?mu)<title></title>\s*', '', text)
|
||||||
|
# Put the paragraph following a title on a separate line.
|
||||||
|
text = re.sub(r'(?mu)</title>\s*<p>', '</title>\n<p>', text)
|
||||||
|
|
||||||
|
# Put line breaks between paragraphs on a separate line.
|
||||||
|
text = re.sub(r'(?mu)</(p|title)>\s*<empty-line/>', r'</\1>\n<empty-line/>', text)
|
||||||
|
text = re.sub(r'(?mu)<empty-line/>\s*<p>', '<empty-line/>\n<p>', text)
|
||||||
|
|
||||||
|
# Remove empty sections.
|
||||||
|
text = re.sub(r'(?mu)<section>\s*</section>', '', text)
|
||||||
|
# Clean up sections starts and ends.
|
||||||
|
text = re.sub(r'(?mu)\s*<section>', '\n<section>', text)
|
||||||
|
text = re.sub(r'(?mu)<section>\s*', '<section>\n', text)
|
||||||
|
text = re.sub(r'(?mu)\s*</section>', '\n</section>', text)
|
||||||
|
text = re.sub(r'(?mu)</section>\s*', '</section>\n', text)
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@ -152,7 +163,7 @@ class FB2MLizer(object):
|
|||||||
index = '1'
|
index = '1'
|
||||||
if self.oeb_book.metadata.series_index:
|
if self.oeb_book.metadata.series_index:
|
||||||
index = self.oeb_book.metadata.series_index[0]
|
index = self.oeb_book.metadata.series_index[0]
|
||||||
metadata['sequence'] = '<sequence name="%s" number="%s" />' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index)
|
metadata['sequence'] = '<sequence name="%s" number="%s"/>' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index)
|
||||||
|
|
||||||
year = publisher = isbn = ''
|
year = publisher = isbn = ''
|
||||||
identifiers = self.oeb_book.metadata['identifier']
|
identifiers = self.oeb_book.metadata['identifier']
|
||||||
@ -193,37 +204,41 @@ class FB2MLizer(object):
|
|||||||
metadata['comments'] = ''
|
metadata['comments'] = ''
|
||||||
else:
|
else:
|
||||||
from calibre.utils.html2text import html2text
|
from calibre.utils.html2text import html2text
|
||||||
metadata['comments'] = '<annotation>{}</annotation>'.format(prepare_string_for_xml(html2text(comments.value.strip())))
|
metadata['comments'] = '<annotation><p>{}</p></annotation>'.format(prepare_string_for_xml(html2text(comments.value).strip()))
|
||||||
|
|
||||||
return textwrap.dedent('''
|
# Keep the indentation level of the description the same as the body.
|
||||||
<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
|
header = textwrap.dedent('''\
|
||||||
<description>
|
<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
|
||||||
<title-info>
|
<description>
|
||||||
<genre>%(genre)s</genre>
|
<title-info>
|
||||||
%(author)s
|
<genre>%(genre)s</genre>
|
||||||
<book-title>%(title)s</book-title>
|
%(author)s
|
||||||
%(cover)s
|
<book-title>%(title)s</book-title>
|
||||||
<lang>%(lang)s</lang>
|
%(cover)s
|
||||||
%(keywords)s
|
<lang>%(lang)s</lang>
|
||||||
%(sequence)s
|
%(keywords)s
|
||||||
%(comments)s
|
%(sequence)s
|
||||||
</title-info>
|
%(comments)s
|
||||||
<document-info>
|
</title-info>
|
||||||
%(author)s
|
<document-info>
|
||||||
<program-used>%(appname)s %(version)s</program-used>
|
%(author)s
|
||||||
<date>%(date)s</date>
|
<program-used>%(appname)s %(version)s</program-used>
|
||||||
<id>%(id)s</id>
|
<date>%(date)s</date>
|
||||||
<version>1.0</version>
|
<id>%(id)s</id>
|
||||||
</document-info>
|
<version>1.0</version>
|
||||||
<publish-info>
|
</document-info>
|
||||||
%(publisher)s
|
<publish-info>
|
||||||
%(year)s
|
%(publisher)s
|
||||||
%(isbn)s
|
%(year)s
|
||||||
</publish-info>
|
%(isbn)s
|
||||||
</description>\n''') % metadata
|
</publish-info>
|
||||||
|
</description>''') % metadata
|
||||||
|
|
||||||
|
# Remove empty lines.
|
||||||
|
return '\n'.join(filter(unicode_type.strip, header.splitlines()))
|
||||||
|
|
||||||
def fb2_footer(self):
|
def fb2_footer(self):
|
||||||
return '\n</FictionBook>'
|
return '</FictionBook>'
|
||||||
|
|
||||||
def get_cover(self):
|
def get_cover(self):
|
||||||
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
|
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
|
||||||
@ -253,10 +268,9 @@ class FB2MLizer(object):
|
|||||||
|
|
||||||
if cover_href:
|
if cover_href:
|
||||||
# Only write the image tag if it is in the manifest.
|
# Only write the image tag if it is in the manifest.
|
||||||
if cover_href in self.oeb_book.manifest.hrefs.keys():
|
if cover_href in self.oeb_book.manifest.hrefs and cover_href not in self.image_hrefs:
|
||||||
if cover_href not in self.image_hrefs.keys():
|
self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs)
|
||||||
self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys())
|
return '<coverpage><image l:href="#%s"/></coverpage>' % self.image_hrefs[cover_href]
|
||||||
return '<coverpage><image xlink:href="#%s" /></coverpage>' % self.image_hrefs[cover_href]
|
|
||||||
|
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@ -292,7 +306,8 @@ class FB2MLizer(object):
|
|||||||
text.append('</section>')
|
text.append('</section>')
|
||||||
self.section_level -= 1
|
self.section_level -= 1
|
||||||
|
|
||||||
return ''.join(text) + '</body>'
|
text.append('</body>')
|
||||||
|
return ''.join(text)
|
||||||
|
|
||||||
def fb2mlize_images(self):
|
def fb2mlize_images(self):
|
||||||
'''
|
'''
|
||||||
@ -315,19 +330,13 @@ class FB2MLizer(object):
|
|||||||
raw_data = as_base64_unicode(item.data)
|
raw_data = as_base64_unicode(item.data)
|
||||||
content_type = item.media_type
|
content_type = item.media_type
|
||||||
# Don't put the encoded image on a single line.
|
# Don't put the encoded image on a single line.
|
||||||
data = ''
|
step = 72
|
||||||
col = 1
|
data = '\n'.join(raw_data[i:i+step] for i in range(0, len(raw_data), step))
|
||||||
for char in raw_data:
|
images.append('<binary id="%s" content-type="%s">%s</binary>' % (self.image_hrefs[item.href], content_type, data))
|
||||||
if col == 72:
|
|
||||||
data += '\n'
|
|
||||||
col = 1
|
|
||||||
col += 1
|
|
||||||
data += char
|
|
||||||
images.append('<binary id="%s" content-type="%s">%s\n</binary>' % (self.image_hrefs[item.href], content_type, data))
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log.error('Error: Could not include file %s because '
|
self.log.error('Error: Could not include file %s because '
|
||||||
'%s.' % (item.href, e))
|
'%s.' % (item.href, e))
|
||||||
return ''.join(images)
|
return '\n'.join(images)
|
||||||
|
|
||||||
def create_flat_toc(self, nodes, level):
|
def create_flat_toc(self, nodes, level):
|
||||||
for item in nodes:
|
for item in nodes:
|
||||||
@ -462,19 +471,18 @@ class FB2MLizer(object):
|
|||||||
# Process the XHTML tag and styles. Converted to an FB2 tag.
|
# Process the XHTML tag and styles. Converted to an FB2 tag.
|
||||||
# Use individual if statement not if else. There can be
|
# Use individual if statement not if else. There can be
|
||||||
# only one XHTML tag but it can have multiple styles.
|
# only one XHTML tag but it can have multiple styles.
|
||||||
if tag == 'img':
|
if tag == 'img' and elem_tree.attrib.get('src', None):
|
||||||
if elem_tree.attrib.get('src', None):
|
# Only write the image tag if it is in the manifest.
|
||||||
# Only write the image tag if it is in the manifest.
|
ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
|
||||||
ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
|
if ihref in self.oeb_book.manifest.hrefs:
|
||||||
if ihref in self.oeb_book.manifest.hrefs:
|
if ihref not in self.image_hrefs:
|
||||||
if ihref not in self.image_hrefs:
|
self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs)
|
||||||
self.image_hrefs[ihref] = '_%s.jpg' % len(self.image_hrefs)
|
p_txt, p_tag = self.ensure_p()
|
||||||
p_txt, p_tag = self.ensure_p()
|
fb2_out += p_txt
|
||||||
fb2_out += p_txt
|
tags += p_tag
|
||||||
tags += p_tag
|
fb2_out.append('<image l:href="#%s"/>' % self.image_hrefs[ihref])
|
||||||
fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[ihref])
|
else:
|
||||||
else:
|
self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
|
||||||
self.log.warn(u'Ignoring image not in manifest: %s'%ihref)
|
|
||||||
if tag in ('br', 'hr') or ems >= 1:
|
if tag in ('br', 'hr') or ems >= 1:
|
||||||
if ems < 1:
|
if ems < 1:
|
||||||
multiplier = 1
|
multiplier = 1
|
||||||
@ -489,17 +497,25 @@ class FB2MLizer(object):
|
|||||||
closed_tags.append(t)
|
closed_tags.append(t)
|
||||||
if t == 'p':
|
if t == 'p':
|
||||||
break
|
break
|
||||||
fb2_out.append('<empty-line />' * multiplier)
|
fb2_out.append('<empty-line/>' * multiplier)
|
||||||
closed_tags.reverse()
|
closed_tags.reverse()
|
||||||
for t in closed_tags:
|
for t in closed_tags:
|
||||||
fb2_out.append('<%s>' % t)
|
fb2_out.append('<%s>' % t)
|
||||||
else:
|
else:
|
||||||
fb2_out.append('<empty-line />' * multiplier)
|
fb2_out.append('<empty-line/>' * multiplier)
|
||||||
if tag in ('div', 'li', 'p'):
|
if tag in ('div', 'li', 'p'):
|
||||||
p_text, added_p = self.close_open_p(tag_stack+tags)
|
p_text, added_p = self.close_open_p(tag_stack+tags)
|
||||||
fb2_out += p_text
|
fb2_out += p_text
|
||||||
if added_p:
|
if added_p:
|
||||||
tags.append('p')
|
tags.append('p')
|
||||||
|
if tag == 'a' and elem_tree.attrib.get('href', None):
|
||||||
|
# Handle only external links for now
|
||||||
|
if urlparse(elem_tree.attrib['href']).netloc:
|
||||||
|
p_txt, p_tag = self.ensure_p()
|
||||||
|
fb2_out += p_txt
|
||||||
|
tags += p_tag
|
||||||
|
fb2_out.append('<a l:href="%s">' % urlnormalize(elem_tree.attrib['href']))
|
||||||
|
tags.append('a')
|
||||||
if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
|
if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
|
||||||
s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
|
s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
|
||||||
fb2_out += s_out
|
fb2_out += s_out
|
||||||
|
Loading…
x
Reference in New Issue
Block a user