diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index ba1f775645..71573bac8b 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -19,14 +19,15 @@ from calibre.constants import __appname__, __version__ from calibre.utils.localization import lang_as_iso639_1 from calibre.utils.img import save_cover_data_to from calibre.ebooks.oeb.base import urlnormalize -from polyglot.builtins import unicode_type, string_or_bytes +from polyglot.builtins import unicode_type, string_or_bytes, range, filter from polyglot.binary import as_base64_unicode +from polyglot.urllib import urlparse class FB2MLizer(object): ''' Todo: * Include more FB2 specific tags in the conversion. - * Handle a tags. + * Handle notes and anchor links. ''' def __init__(self, log): @@ -59,43 +60,53 @@ class FB2MLizer(object): return self.fb2mlize_spine() def fb2mlize_spine(self): - output = [self.fb2_header()] - output.append(self.get_text()) - output.append(self.fb2mlize_images()) - output.append(self.fb2_footer()) - output = self.clean_text(''.join(output)) + output = ( + self.fb2_header(), + self.get_text(), + self.fb2mlize_images(), + self.fb2_footer(), + ) + output = self.clean_text('\n'.join(output)) if self.opts.pretty_print: - return '\n%s' % etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True) - else: - return '' + output + output = etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True) + + return '\n' + output def clean_text(self, text): + # Remove pointless tags, but keep their contents. + text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)\1>', r'\2', text) + + # Clean up paragraphs endings. + text = re.sub(r'(?mu)\s+
', '', text) # Condense empty paragraphs into a line break. - text = re.sub(r'(?miu)(\s*
\s*){3,}', '\s*
', '', text) - # Clean up pargraph endings. - text = re.sub(r'(?miu)\s*', '', text) - # Put paragraphs following a paragraph on a separate line. - text = re.sub(r'(?miu)\s*', '
\n\n', text) - - # Remove empty title elements. - text = re.sub(r'(?miu)
', '
\n', text) if self.opts.insert_blank_line: - text = re.sub(r'(?miu)
', '', '\n
', text)
+
+ # Put line breaks between paragraphs on a separate line.
+ text = re.sub(r'(?mu)(p|title)>\s*
', '
', text)
+
+ # Remove empty sections.
+ text = re.sub(r'(?mu) {}