From 2ce345e5dd9dba1f54f9c74b63b553877d4d003c Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Sun, 6 Oct 2019 13:04:07 +0700 Subject: [PATCH 01/18] FB2 Output: Speed up image data splitting --- src/calibre/ebooks/fb2/fb2ml.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index ba1f775645..10ea49dc64 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -19,7 +19,7 @@ from calibre.constants import __appname__, __version__ from calibre.utils.localization import lang_as_iso639_1 from calibre.utils.img import save_cover_data_to from calibre.ebooks.oeb.base import urlnormalize -from polyglot.builtins import unicode_type, string_or_bytes +from polyglot.builtins import unicode_type, string_or_bytes, range from polyglot.binary import as_base64_unicode @@ -315,14 +315,8 @@ class FB2MLizer(object): raw_data = as_base64_unicode(item.data) content_type = item.media_type # Don't put the encoded image on a single line. - data = '' - col = 1 - for char in raw_data: - if col == 72: - data += '\n' - col = 1 - col += 1 - data += char + step = 72 + data = '\n'.join(raw_data[i:i+step] for i in range(0, len(raw_data), step)) images.append('%s\n' % (self.image_hrefs[item.href], content_type, data)) except Exception as e: self.log.error('Error: Could not include file %s because ' From 6071352d678fa1ad3dcd73f2c1c932a198a7a4c4 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Sun, 6 Oct 2019 13:28:45 +0700 Subject: [PATCH 02/18] FB2 Output: Fix id for PNG images --- src/calibre/ebooks/fb2/fb2ml.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 10ea49dc64..3a191cd54e 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -253,9 +253,8 @@ class FB2MLizer(object): if cover_href: # Only write the image tag if it is in the manifest. - if cover_href in self.oeb_book.manifest.hrefs.keys(): - if cover_href not in self.image_hrefs.keys(): - self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys()) + if cover_href in self.oeb_book.manifest.hrefs and cover_href not in self.image_hrefs: + self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs) return '' % self.image_hrefs[cover_href] return '' @@ -462,7 +461,7 @@ class FB2MLizer(object): ihref = urlnormalize(page.abshref(elem_tree.attrib['src'])) if ihref in self.oeb_book.manifest.hrefs: if ihref not in self.image_hrefs: - self.image_hrefs[ihref] = '_%s.jpg' % len(self.image_hrefs) + self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs) p_txt, p_tag = self.ensure_p() fb2_out += p_txt tags += p_tag From 88159288a87d5e408932edae232378c077739753 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Sun, 6 Oct 2019 13:07:17 +0700 Subject: [PATCH 03/18] FB2 Output: Remove extra line breaks and prettify XML markup --- src/calibre/ebooks/fb2/fb2ml.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 3a191cd54e..91d05f0140 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -78,7 +78,7 @@ class FB2MLizer(object): # Clean up pargraph endings. text = re.sub(r'(?miu)\s*

', '

', text) # Put paragraphs following a paragraph on a separate line. - text = re.sub(r'(?miu)

\s*

', '

\n\n

', text) + text = re.sub(r'(?miu)

\s*

', '

\n

', text) # Remove empty title elements. text = re.sub(r'(?miu)\s*', '', text) @@ -88,11 +88,11 @@ class FB2MLizer(object): text = re.sub(r'(?miu)

\s*
', '', text) # Clean up sections start and ends. text = re.sub(r'(?miu)\s*', '\n', text) - text = re.sub(r'(?miu)\s*', '\n\n', text) + text = re.sub(r'(?miu)\s*', '\n', text) text = re.sub(r'(?miu)\s*
', '\n
', text) text = re.sub(r'(?miu)
\s*', '
\n', text) # Put sectnions followed by sections on a separate line. - text = re.sub(r'(?miu)
\s*
', '
\n\n
', text) + text = re.sub(r'(?miu)
\s*
', '
\n
', text) if self.opts.insert_blank_line: text = re.sub(r'(?miu)

', '

', text) @@ -223,7 +223,7 @@ class FB2MLizer(object): \n''') % metadata def fb2_footer(self): - return '\n' + return '' def get_cover(self): from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES @@ -291,7 +291,8 @@ class FB2MLizer(object): text.append('
') self.section_level -= 1 - return ''.join(text) + '' + text.append('') + return ''.join(text) + '\n' def fb2mlize_images(self): ''' @@ -316,11 +317,11 @@ class FB2MLizer(object): # Don't put the encoded image on a single line. step = 72 data = '\n'.join(raw_data[i:i+step] for i in range(0, len(raw_data), step)) - images.append('%s\n' % (self.image_hrefs[item.href], content_type, data)) + images.append('%s' % (self.image_hrefs[item.href], content_type, data)) except Exception as e: self.log.error('Error: Could not include file %s because ' '%s.' % (item.href, e)) - return ''.join(images) + return '\n'.join(images) + '\n' def create_flat_toc(self, nodes, level): for item in nodes: From 0f5b72b4372c799d127221efb579787b3ada8d5d Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Sun, 6 Oct 2019 13:35:27 +0700 Subject: [PATCH 04/18] FB2 Output: Use short xlink namespace --- src/calibre/ebooks/fb2/fb2ml.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 91d05f0140..9d88d5554f 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -196,7 +196,7 @@ class FB2MLizer(object): metadata['comments'] = '{}'.format(prepare_string_for_xml(html2text(comments.value.strip()))) return textwrap.dedent(''' - + %(genre)s @@ -255,7 +255,7 @@ class FB2MLizer(object): # Only write the image tag if it is in the manifest. if cover_href in self.oeb_book.manifest.hrefs and cover_href not in self.image_hrefs: self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs) - return '' % self.image_hrefs[cover_href] + return '' % self.image_hrefs[cover_href] return '' @@ -466,7 +466,7 @@ class FB2MLizer(object): p_txt, p_tag = self.ensure_p() fb2_out += p_txt tags += p_tag - fb2_out.append('' % self.image_hrefs[ihref]) + fb2_out.append('' % self.image_hrefs[ihref]) else: self.log.warn(u'Ignoring image not in manifest: %s'%ihref) if tag in ('br', 'hr') or ems >= 1: From 906b8773c5005ffb1699f086a1d4fc346df3ffa4 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Sun, 6 Oct 2019 13:36:04 +0700 Subject: [PATCH 05/18] FB2 Output: Remove extra space in short tags --- src/calibre/ebooks/fb2/fb2ml.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 9d88d5554f..55349cd426 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -72,7 +72,7 @@ class FB2MLizer(object): def clean_text(self, text): # Condense empty paragraphs into a line break. - text = re.sub(r'(?miu)(

\s*

\s*){3,}', '', text) + text = re.sub(r'(?miu)(

\s*

\s*){3,}', '', text) # Remove empty paragraphs. text = re.sub(r'(?miu)

\s*

', '', text) # Clean up pargraph endings. @@ -95,7 +95,7 @@ class FB2MLizer(object): text = re.sub(r'(?miu)
\s*
', '
\n
', text) if self.opts.insert_blank_line: - text = re.sub(r'(?miu)

', '

', text) + text = re.sub(r'(?miu)

', '

', text) return text @@ -152,7 +152,7 @@ class FB2MLizer(object): index = '1' if self.oeb_book.metadata.series_index: index = self.oeb_book.metadata.series_index[0] - metadata['sequence'] = '' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index) + metadata['sequence'] = '' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index) year = publisher = isbn = '' identifiers = self.oeb_book.metadata['identifier'] @@ -255,7 +255,7 @@ class FB2MLizer(object): # Only write the image tag if it is in the manifest. if cover_href in self.oeb_book.manifest.hrefs and cover_href not in self.image_hrefs: self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs) - return '' % self.image_hrefs[cover_href] + return '' % self.image_hrefs[cover_href] return '' @@ -466,7 +466,7 @@ class FB2MLizer(object): p_txt, p_tag = self.ensure_p() fb2_out += p_txt tags += p_tag - fb2_out.append('' % self.image_hrefs[ihref]) + fb2_out.append('' % self.image_hrefs[ihref]) else: self.log.warn(u'Ignoring image not in manifest: %s'%ihref) if tag in ('br', 'hr') or ems >= 1: @@ -483,12 +483,12 @@ class FB2MLizer(object): closed_tags.append(t) if t == 'p': break - fb2_out.append('' * multiplier) + fb2_out.append('' * multiplier) closed_tags.reverse() for t in closed_tags: fb2_out.append('<%s>' % t) else: - fb2_out.append('' * multiplier) + fb2_out.append('' * multiplier) if tag in ('div', 'li', 'p'): p_text, added_p = self.close_open_p(tag_stack+tags) fb2_out += p_text From 62757eadffa79722aa1ddf78be95cfe1f0cec923 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Mon, 7 Oct 2019 12:22:07 +0700 Subject: [PATCH 06/18] FB2 Output: Remove empty lines from header --- src/calibre/ebooks/fb2/fb2ml.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 55349cd426..13fbba976d 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -19,7 +19,7 @@ from calibre.constants import __appname__, __version__ from calibre.utils.localization import lang_as_iso639_1 from calibre.utils.img import save_cover_data_to from calibre.ebooks.oeb.base import urlnormalize -from polyglot.builtins import unicode_type, string_or_bytes, range +from polyglot.builtins import unicode_type, string_or_bytes, range, filter from polyglot.binary import as_base64_unicode @@ -195,12 +195,12 @@ class FB2MLizer(object): from calibre.utils.html2text import html2text metadata['comments'] = '{}'.format(prepare_string_for_xml(html2text(comments.value.strip()))) - return textwrap.dedent(''' + header = textwrap.dedent('''\ %(genre)s - %(author)s + %(author)s %(title)s %(cover)s %(lang)s @@ -220,7 +220,10 @@ class FB2MLizer(object): %(year)s %(isbn)s - \n''') % metadata + ''') % metadata + + # Remove empty lines. + return '\n'.join(filter(unicode_type.strip, header.splitlines())) + '\n' def fb2_footer(self): return '' From 58352097ad14daaecef33be46a8240f5261e325a Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Mon, 7 Oct 2019 12:27:00 +0700 Subject: [PATCH 07/18] FB2 Output: Proper join of XML parts --- src/calibre/ebooks/fb2/fb2ml.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 13fbba976d..04d8ade536 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -59,16 +59,18 @@ class FB2MLizer(object): return self.fb2mlize_spine() def fb2mlize_spine(self): - output = [self.fb2_header()] - output.append(self.get_text()) - output.append(self.fb2mlize_images()) - output.append(self.fb2_footer()) - output = self.clean_text(''.join(output)) + output = ( + self.fb2_header(), + self.get_text(), + self.fb2mlize_images(), + self.fb2_footer(), + ) + output = self.clean_text('\n'.join(output)) if self.opts.pretty_print: - return '\n%s' % etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True) - else: - return '' + output + output = etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True) + + return '\n' + output def clean_text(self, text): # Condense empty paragraphs into a line break. @@ -223,7 +225,7 @@ class FB2MLizer(object): ''') % metadata # Remove empty lines. - return '\n'.join(filter(unicode_type.strip, header.splitlines())) + '\n' + return '\n'.join(filter(unicode_type.strip, header.splitlines())) def fb2_footer(self): return '' @@ -295,7 +297,7 @@ class FB2MLizer(object): self.section_level -= 1 text.append('') - return ''.join(text) + '\n' + return ''.join(text) def fb2mlize_images(self): ''' @@ -324,7 +326,7 @@ class FB2MLizer(object): except Exception as e: self.log.error('Error: Could not include file %s because ' '%s.' % (item.href, e)) - return '\n'.join(images) + '\n' + return '\n'.join(images) def create_flat_toc(self, nodes, level): for item in nodes: From d02966423c2201558494a3a9bd9698cf24741d11 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Mon, 7 Oct 2019 12:28:18 +0700 Subject: [PATCH 08/18] FB2 Output: Remove empty tags --- src/calibre/ebooks/fb2/fb2ml.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 04d8ade536..4421623869 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -73,6 +73,8 @@ class FB2MLizer(object): return '\n' + output def clean_text(self, text): + # Remove empty tags. + text = re.sub(r'(?miu)<(strong|emphasis|strikethrough|sub|sup)>\s*', '', text) # Condense empty paragraphs into a line break. text = re.sub(r'(?miu)(

\s*

\s*){3,}', '', text) # Remove empty paragraphs. From 0c1ca17192fa408d7d731d07b9b44158268aaccc Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Mon, 7 Oct 2019 12:28:49 +0700 Subject: [PATCH 09/18] FB2 Output: Annotation should be wrapped in p --- src/calibre/ebooks/fb2/fb2ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 4421623869..30791c8ed3 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -197,7 +197,7 @@ class FB2MLizer(object): metadata['comments'] = '' else: from calibre.utils.html2text import html2text - metadata['comments'] = '{}'.format(prepare_string_for_xml(html2text(comments.value.strip()))) + metadata['comments'] = '

{}

'.format(prepare_string_for_xml(html2text(comments.value).strip())) header = textwrap.dedent('''\ From 8b1194730923c308f5fe073fbd4a25fd773d8291 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Mon, 7 Oct 2019 17:48:17 +0700 Subject: [PATCH 10/18] FB2 Output: Keep contents of empty tags if they placed between words --- src/calibre/ebooks/fb2/fb2ml.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 30791c8ed3..2a408c3f94 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -73,8 +73,9 @@ class FB2MLizer(object): return '\n' + output def clean_text(self, text): - # Remove empty tags. - text = re.sub(r'(?miu)<(strong|emphasis|strikethrough|sub|sup)>\s*', '', text) + # Remove pointless tags, but keep their contents. + text = re.sub(r'(?miu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)', r'\2', text) + # Condense empty paragraphs into a line break. text = re.sub(r'(?miu)(

\s*

\s*){3,}', '', text) # Remove empty paragraphs. From abe408bc3943f46fa083cf1f5a86fb71edb15388 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Tue, 8 Oct 2019 11:11:13 +0700 Subject: [PATCH 11/18] FB2 Output: Dedent the header to position it on the same level as the body --- src/calibre/ebooks/fb2/fb2ml.py | 49 +++++++++++++++++---------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 2a408c3f94..a9d604da65 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -200,32 +200,33 @@ class FB2MLizer(object): from calibre.utils.html2text import html2text metadata['comments'] = '

{}

'.format(prepare_string_for_xml(html2text(comments.value).strip())) + # Keep the indentation level of the description the same as the body. header = textwrap.dedent('''\ - - - %(genre)s - %(author)s - %(title)s - %(cover)s - %(lang)s - %(keywords)s - %(sequence)s - %(comments)s - - - %(author)s - %(appname)s %(version)s - %(date)s - %(id)s - 1.0 - - - %(publisher)s - %(year)s - %(isbn)s - - ''') % metadata + + + %(genre)s + %(author)s + %(title)s + %(cover)s + %(lang)s + %(keywords)s + %(sequence)s + %(comments)s + + + %(author)s + %(appname)s %(version)s + %(date)s + %(id)s + 1.0 + + + %(publisher)s + %(year)s + %(isbn)s + + ''') % metadata # Remove empty lines. return '\n'.join(filter(unicode_type.strip, header.splitlines())) From 3b8d769128922a939ca7b6c72578d1eaad779b95 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Tue, 8 Oct 2019 12:16:06 +0700 Subject: [PATCH 12/18] FB2 Output: Simplify regular expressions --- src/calibre/ebooks/fb2/fb2ml.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index a9d604da65..a5745e1bc2 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -76,27 +76,28 @@ class FB2MLizer(object): # Remove pointless tags, but keep their contents. text = re.sub(r'(?miu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)', r'\2', text) + # Clean up paragraphs endings. + text = re.sub(r'(?miu)\s+

', '

', text) # Condense empty paragraphs into a line break. - text = re.sub(r'(?miu)(

\s*

\s*){3,}', '', text) + text = re.sub(r'(?miu)(?:

\s*){3,}', '', text) # Remove empty paragraphs. - text = re.sub(r'(?miu)

\s*

', '', text) - # Clean up pargraph endings. - text = re.sub(r'(?miu)\s*

', '

', text) - # Put paragraphs following a paragraph on a separate line. + text = re.sub(r'(?miu)

', '', text) + # Put the paragraph following a paragraph on a separate line. text = re.sub(r'(?miu)

\s*

', '

\n

', text) - # Remove empty title elements. - text = re.sub(r'(?miu)\s*', '', text) + # Clean up title endings. text = re.sub(r'(?miu)\s+', '', text) + # Remove empty title elements. + text = re.sub(r'(?miu)', '', text) + # Put the paragraph following a title on a separate line. + text = re.sub(r'(?miu)\s*

', '\n

', text) # Remove empty sections. text = re.sub(r'(?miu)

\s*
', '', text) - # Clean up sections start and ends. - text = re.sub(r'(?miu)\s*
', '\n
', text) - text = re.sub(r'(?miu)
\s*', '\n', text) - text = re.sub(r'(?miu)\s*
', '\n
', text) - text = re.sub(r'(?miu)
\s*', '
\n', text) - # Put sectnions followed by sections on a separate line. + # Clean up sections starts and ends. + text = re.sub(r'(?miu)\s*
\s*', '\n
\n', text) + text = re.sub(r'(?miu)\s*
\s*', '\n
\n', text) + # Put the section following a section on a separate line. text = re.sub(r'(?miu)
\s*
', '
\n
', text) if self.opts.insert_blank_line: From cb45766216929a28fc21e128f963006f37a1ffb9 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Tue, 8 Oct 2019 12:16:35 +0700 Subject: [PATCH 13/18] FB2 Output: Put line breaks between paragraphs on a separate line --- src/calibre/ebooks/fb2/fb2ml.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index a5745e1bc2..433db22fa1 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -103,6 +103,10 @@ class FB2MLizer(object): if self.opts.insert_blank_line: text = re.sub(r'(?miu)

', '

', text) + # Put line breaks between paragraphs on a separate line. + text = re.sub(r'(?miu)\s*', r'\n', text) + text = re.sub(r'(?miu)\s*

', '\n

', text) + return text def fb2_header(self): From 162d86429deb95df251e2cf186e8199b1ed2013c Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Tue, 8 Oct 2019 12:19:02 +0700 Subject: [PATCH 14/18] FB2 Output: Ignore case flag is not required because the writer outputs only lowercase tag names --- src/calibre/ebooks/fb2/fb2ml.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 433db22fa1..5697f2262a 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -74,38 +74,38 @@ class FB2MLizer(object): def clean_text(self, text): # Remove pointless tags, but keep their contents. - text = re.sub(r'(?miu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)', r'\2', text) + text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)', r'\2', text) # Clean up paragraphs endings. - text = re.sub(r'(?miu)\s+

', '

', text) + text = re.sub(r'(?mu)\s+

', '

', text) # Condense empty paragraphs into a line break. - text = re.sub(r'(?miu)(?:

\s*){3,}', '', text) + text = re.sub(r'(?mu)(?:

\s*){3,}', '', text) # Remove empty paragraphs. - text = re.sub(r'(?miu)

', '', text) + text = re.sub(r'(?mu)

', '', text) # Put the paragraph following a paragraph on a separate line. - text = re.sub(r'(?miu)

\s*

', '

\n

', text) + text = re.sub(r'(?mu)

\s*

', '

\n

', text) # Clean up title endings. - text = re.sub(r'(?miu)\s+', '', text) + text = re.sub(r'(?mu)\s+', '', text) # Remove empty title elements. - text = re.sub(r'(?miu)', '', text) + text = re.sub(r'(?mu)', '', text) # Put the paragraph following a title on a separate line. - text = re.sub(r'(?miu)\s*

', '\n

', text) + text = re.sub(r'(?mu)\s*

', '\n

', text) # Remove empty sections. - text = re.sub(r'(?miu)

\s*
', '', text) + text = re.sub(r'(?mu)
\s*
', '', text) # Clean up sections starts and ends. - text = re.sub(r'(?miu)\s*
\s*', '\n
\n', text) - text = re.sub(r'(?miu)\s*
\s*', '\n
\n', text) + text = re.sub(r'(?mu)\s*
\s*', '\n
\n', text) + text = re.sub(r'(?mu)\s*
\s*', '\n
\n', text) # Put the section following a section on a separate line. - text = re.sub(r'(?miu)
\s*
', '
\n
', text) + text = re.sub(r'(?mu)
\s*
', '
\n
', text) if self.opts.insert_blank_line: - text = re.sub(r'(?miu)

', '

', text) + text = re.sub(r'(?mu)

', '

', text) # Put line breaks between paragraphs on a separate line. - text = re.sub(r'(?miu)\s*', r'\n', text) - text = re.sub(r'(?miu)\s*

', '\n

', text) + text = re.sub(r'(?mu)\s*', r'\n', text) + text = re.sub(r'(?mu)\s*

', '\n

', text) return text From d7a4e10c6022d13a17f87ae304d4b6ff84e02c17 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Tue, 8 Oct 2019 15:57:13 +0700 Subject: [PATCH 15/18] FB2 Output: Revert some changes in regular expressions due to different behavior --- src/calibre/ebooks/fb2/fb2ml.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 5697f2262a..12dd539e53 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -81,22 +81,24 @@ class FB2MLizer(object): # Condense empty paragraphs into a line break. text = re.sub(r'(?mu)(?:

\s*){3,}', '', text) # Remove empty paragraphs. - text = re.sub(r'(?mu)

', '', text) + text = re.sub(r'(?mu)

\s*', '', text) # Put the paragraph following a paragraph on a separate line. text = re.sub(r'(?mu)

\s*

', '

\n

', text) # Clean up title endings. text = re.sub(r'(?mu)\s+', '', text) # Remove empty title elements. - text = re.sub(r'(?mu)', '', text) + text = re.sub(r'(?mu)\s*', '', text) # Put the paragraph following a title on a separate line. text = re.sub(r'(?mu)\s*

', '\n

', text) # Remove empty sections. text = re.sub(r'(?mu)

\s*
', '', text) # Clean up sections starts and ends. - text = re.sub(r'(?mu)\s*
\s*', '\n
\n', text) - text = re.sub(r'(?mu)\s*
\s*', '\n
\n', text) + text = re.sub(r'(?mu)\s*
', '\n
', text) + text = re.sub(r'(?mu)
\s*', '
\n', text) + text = re.sub(r'(?mu)\s*
', '\n
', text) + text = re.sub(r'(?mu)
\s*', '
\n', text) # Put the section following a section on a separate line. text = re.sub(r'(?mu)
\s*
', '
\n
', text) From e7175e802b4a48e280bb495fd98acedaa3e5fc8d Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Tue, 8 Oct 2019 16:10:33 +0700 Subject: [PATCH 16/18] FB2 Output: Remove section regexp what do nothing and rearrange regexps --- src/calibre/ebooks/fb2/fb2ml.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 12dd539e53..197f063baa 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -85,6 +85,9 @@ class FB2MLizer(object): # Put the paragraph following a paragraph on a separate line. text = re.sub(r'(?mu)

\s*

', '

\n

', text) + if self.opts.insert_blank_line: + text = re.sub(r'(?mu)

', '

', text) + # Clean up title endings. text = re.sub(r'(?mu)\s+', '', text) # Remove empty title elements. @@ -92,6 +95,10 @@ class FB2MLizer(object): # Put the paragraph following a title on a separate line. text = re.sub(r'(?mu)\s*

', '\n

', text) + # Put line breaks between paragraphs on a separate line. + text = re.sub(r'(?mu)\s*', r'\n', text) + text = re.sub(r'(?mu)\s*

', '\n

', text) + # Remove empty sections. text = re.sub(r'(?mu)

\s*
', '', text) # Clean up sections starts and ends. @@ -99,15 +106,6 @@ class FB2MLizer(object): text = re.sub(r'(?mu)
\s*', '
\n', text) text = re.sub(r'(?mu)\s*
', '\n
', text) text = re.sub(r'(?mu)
\s*', '
\n', text) - # Put the section following a section on a separate line. - text = re.sub(r'(?mu)
\s*
', '
\n
', text) - - if self.opts.insert_blank_line: - text = re.sub(r'(?mu)

', '

', text) - - # Put line breaks between paragraphs on a separate line. - text = re.sub(r'(?mu)\s*', r'\n', text) - text = re.sub(r'(?mu)\s*

', '\n

', text) return text From c42d2db421e7532dd0b3c5711b8c3ba805ef52b3 Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Wed, 9 Oct 2019 12:13:25 +0700 Subject: [PATCH 17/18] FB2 Output: Handle external links --- src/calibre/ebooks/fb2/fb2ml.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 197f063baa..7198d0f321 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -21,12 +21,13 @@ from calibre.utils.img import save_cover_data_to from calibre.ebooks.oeb.base import urlnormalize from polyglot.builtins import unicode_type, string_or_bytes, range, filter from polyglot.binary import as_base64_unicode +from polyglot.urllib import urlparse class FB2MLizer(object): ''' Todo: * Include more FB2 specific tags in the conversion. - * Handle a tags. + * Handle notes and anchor links. ''' def __init__(self, log): @@ -508,6 +509,14 @@ class FB2MLizer(object): fb2_out += p_text if added_p: tags.append('p') + if tag == 'a' and elem_tree.attrib.get('href', None): + # Handle only external links for now + if urlparse(elem_tree.attrib['href']).netloc: + p_txt, p_tag = self.ensure_p() + fb2_out += p_txt + tags += p_tag + fb2_out.append('' % urlnormalize(elem_tree.attrib['href'])) + tags.append('a') if tag == 'b' or style['font-weight'] in ('bold', 'bolder'): s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags) fb2_out += s_out From 3e680ac307023d962a41b84f62fc7c2f72a3068a Mon Sep 17 00:00:00 2001 From: Andrey Efremov Date: Wed, 9 Oct 2019 12:18:31 +0700 Subject: [PATCH 18/18] FB2 Output: Merge ifs --- src/calibre/ebooks/fb2/fb2ml.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 7198d0f321..71573bac8b 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -471,19 +471,18 @@ class FB2MLizer(object): # Process the XHTML tag and styles. Converted to an FB2 tag. # Use individual if statement not if else. There can be # only one XHTML tag but it can have multiple styles. - if tag == 'img': - if elem_tree.attrib.get('src', None): - # Only write the image tag if it is in the manifest. - ihref = urlnormalize(page.abshref(elem_tree.attrib['src'])) - if ihref in self.oeb_book.manifest.hrefs: - if ihref not in self.image_hrefs: - self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs) - p_txt, p_tag = self.ensure_p() - fb2_out += p_txt - tags += p_tag - fb2_out.append('' % self.image_hrefs[ihref]) - else: - self.log.warn(u'Ignoring image not in manifest: %s'%ihref) + if tag == 'img' and elem_tree.attrib.get('src', None): + # Only write the image tag if it is in the manifest. + ihref = urlnormalize(page.abshref(elem_tree.attrib['src'])) + if ihref in self.oeb_book.manifest.hrefs: + if ihref not in self.image_hrefs: + self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs) + p_txt, p_tag = self.ensure_p() + fb2_out += p_txt + tags += p_tag + fb2_out.append('' % self.image_hrefs[ihref]) + else: + self.log.warn(u'Ignoring image not in manifest: %s' % ihref) if tag in ('br', 'hr') or ems >= 1: if ems < 1: multiplier = 1