diff --git a/src/calibre/library/catalogs/epub_mobi_builder.py b/src/calibre/library/catalogs/epub_mobi_builder.py index 1a795810fc..83736fb65a 100644 --- a/src/calibre/library/catalogs/epub_mobi_builder.py +++ b/src/calibre/library/catalogs/epub_mobi_builder.py @@ -1,34 +1,49 @@ -# -*- coding: utf-8 -*- +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2010, Greg Riker from __future__ import print_function -__license__ = 'GPL v3' -__copyright__ = '2010, Greg Riker' -import datetime, os, platform, re, shutil, time, unicodedata, zlib +import datetime +import os +import platform +import re +import shutil +import time +import unicodedata +import zlib from copy import deepcopy from xml.sax.saxutils import escape from calibre import ( - prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode, xml_replace_entities) -from calibre.constants import isosx, cache_dir + as_unicode, force_unicode, isbytestring, replace_entities, strftime, + xml_replace_entities +) +from calibre.constants import cache_dir, isosx from calibre.customize.conversion import DummyReporter from calibre.customize.ui import output_profiles -from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, prettify +from calibre.ebooks.BeautifulSoup import ( + BeautifulSoup, BeautifulStoneSoup, NavigableString, prettify +) from calibre.ebooks.chardet import substitute_entites from calibre.ebooks.metadata import author_to_author_sort -from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException, \ - InvalidGenresSourceFieldException +from calibre.library.catalogs import ( + AuthorSortMismatchException, EmptyCatalogException, + InvalidGenresSourceFieldException +) +from calibre.library.comments import comments_to_html from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.utils.date import format_date, is_date_undefined, now as nowf, as_local_time +from calibre.utils.date import ( + as_local_time, format_date, is_date_undefined, now as nowf +) from calibre.utils.filenames import ascii_text, shorten_components_to from calibre.utils.formatter import TemplateFormatter from calibre.utils.icu import capitalize, collation_order, sort_key from calibre.utils.img import scale_image -from calibre.utils.zipfile import ZipFile from calibre.utils.localization import get_lang, lang_as_iso639_1 +from calibre.utils.zipfile import ZipFile from polyglot.builtins import unicode_type - NBSP = u'\u00a0' @@ -953,7 +968,7 @@ class CatalogBuilder(object): if ad_offset >= 0: record['comments'] = record['comments'][:ad_offset] - this_title['description'] = self.massage_comments(record['comments']) + this_title['description'] = comments_to_html(record['comments']) # Create short description paras = BeautifulSoup(this_title['description']).findAll('p') @@ -4016,17 +4031,17 @@ class CatalogBuilder(object): if lang_as_iso639_1(lang): lang = lang_as_iso639_1(lang) - header = ''' - - - - LANG - - - - - + header = '''\ + + + + LANG + + + + + '''.replace('LANG', lang) # Add the supplied metadata tags soup = BeautifulStoneSoup(header) @@ -4171,7 +4186,7 @@ class CatalogBuilder(object): if isinstance(output, unicode_type): output = output.encode('utf-8') with lopen("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'wb') as outfile: - outfile.write(output) + outfile.write(output.strip()) def generate_rating_string(self, book): """ Generate rating string for Descriptions. @@ -4606,127 +4621,6 @@ class CatalogBuilder(object): if name.startswith('by_') and name.endswith('_template'): setattr(self, name, force_unicode(template, 'utf-8')) - def massage_comments(self, comments): - """ Massage comments to somewhat consistent format. - - Convert random comment text to normalized, xml-legal block of

s - 'plain text' returns as -

plain text

- - 'plain text with minimal markup' returns as -

plain text with minimal markup

- - '

pre-formatted text

returns untouched - - 'A line of text\n\nFollowed by a line of text' returns as -

A line of text

-

Followed by a line of text

- - 'A line of text.\nA second line of text.\rA third line of text' returns as -

A line of text.
A second line of text.
A third line of text.

- - '...end of a paragraph.Somehow the break was lost...' returns as -

...end of a paragraph.

-

Somehow the break was lost...

- - Deprecated HTML returns as HTML via BeautifulSoup() - - Args: - comments (str): comments from metadata, possibly HTML - - Return: - result (BeautifulSoup): massaged comments in HTML form - """ - - # Hackish - ignoring sentences ending or beginning in numbers to avoid - # confusion with decimal points. - - # Explode lost CRs to \n\n - for lost_cr in re.finditer('([a-z])([\\.\\?!])([A-Z])', comments): - comments = comments.replace(lost_cr.group(), - '%s%s\n\n%s' % (lost_cr.group(1), - lost_cr.group(2), - lost_cr.group(3))) - # Extract pre-built elements - annotations, etc. - if not isinstance(comments, unicode_type): - comments = comments.decode('utf-8', 'replace') - soup = BeautifulSoup(comments) - elems = soup.findAll('div') - for elem in elems: - elem.extract() - - # Reconstruct comments w/o
s - comments = soup.decode_contents() - - # Convert \n\n to

s - if re.search('\n\n', comments): - soup = BeautifulSoup() - split_ps = comments.split(u'\n\n') - tsc = 0 - for p in split_ps: - pTag = soup.new_tag('p') - pTag.insert(0, p) - soup.insert(tsc, pTag) - tsc += 1 - comments = soup.decode_contents() - - # Convert solo returns to
- comments = re.sub('[\r\n]', '
', comments) - - # Convert two hypens to emdash - comments = re.sub('--', '—', comments) - soup = BeautifulSoup(comments) - result = BeautifulSoup() - rtc = 0 - open_pTag = False - - all_tokens = list(soup.contents) - for token in all_tokens: - if type(token) is NavigableString: - if not open_pTag: - pTag = result.new_tag('p') - open_pTag = True - ptc = 0 - pTag.insert(ptc, prepare_string_for_xml(token)) - ptc += 1 - - elif token.name in ['br', 'b', 'i', 'em']: - if not open_pTag: - pTag = result.new_tag('p') - open_pTag = True - ptc = 0 - pTag.insert(ptc, token) - ptc += 1 - - else: - if open_pTag: - result.insert(rtc, pTag) - rtc += 1 - open_pTag = False - ptc = 0 - # Clean up NavigableStrings for xml - sub_tokens = list(token.contents) - for sub_token in sub_tokens: - if type(sub_token) is NavigableString: - sub_token.replaceWith(prepare_string_for_xml(sub_token)) - result.insert(rtc, token) - rtc += 1 - - if open_pTag: - result.insert(rtc, pTag) - rtc += 1 - - paras = result.findAll('p') - for p in paras: - p['class'] = 'description' - - # Add back

elems initially removed - for elem in elems: - result.insert(rtc, elem) - rtc += 1 - - return result.decode_contents() - def merge_comments(self, record): """ Merge comments with custom column content. @@ -4958,4 +4852,4 @@ class CatalogBuilder(object): ncx = ncx.encode('utf-8') with lopen("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'wb') as outfile: - outfile.write(ncx) + outfile.write(ncx.strip())