diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 268dad4328..9cd3271fad 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -501,27 +501,27 @@ class TXTZMetadataWriter(MetadataWriterPlugin): # }}} -from calibre.ebooks.comic.input import ComicInput -from calibre.ebooks.djvu.input import DJVUInput -from calibre.ebooks.epub.input import EPUBInput -from calibre.ebooks.fb2.input import FB2Input -from calibre.ebooks.html.input import HTMLInput -from calibre.ebooks.htmlz.input import HTMLZInput -from calibre.ebooks.lit.input import LITInput -from calibre.ebooks.mobi.input import MOBIInput -from calibre.ebooks.odt.input import ODTInput -from calibre.ebooks.pdb.input import PDBInput -from calibre.ebooks.azw4.input import AZW4Input -from calibre.ebooks.pdf.input import PDFInput -from calibre.ebooks.pml.input import PMLInput -from calibre.ebooks.rb.input import RBInput -from calibre.web.feeds.input import RecipeInput -from calibre.ebooks.rtf.input import RTFInput -from calibre.ebooks.tcr.input import TCRInput -from calibre.ebooks.txt.input import TXTInput -from calibre.ebooks.lrf.input import LRFInput -from calibre.ebooks.chm.input import CHMInput -from calibre.ebooks.snb.input import SNBInput +from calibre.ebooks.conversion.plugins.comic_input import ComicInput +from calibre.ebooks.conversion.plugins.djvu_input import DJVUInput +from calibre.ebooks.conversion.plugins.epub_input import EPUBInput +from calibre.ebooks.conversion.plugins.fb2_input import FB2Input +from calibre.ebooks.conversion.plugins.html_input import HTMLInput +from calibre.ebooks.conversion.plugins.htmlz_input import HTMLZInput +from calibre.ebooks.conversion.plugins.lit_input import LITInput +from calibre.ebooks.conversion.plugins.mobi_input import MOBIInput +from calibre.ebooks.conversion.plugins.odt_input import ODTInput +from calibre.ebooks.conversion.plugins.pdb_input import PDBInput +from calibre.ebooks.conversion.plugins.azw4_input import AZW4Input +from calibre.ebooks.conversion.plugins.pdf_input import PDFInput +from calibre.ebooks.conversion.plugins.pml_input import PMLInput +from calibre.ebooks.conversion.plugins.rb_input import RBInput +from calibre.ebooks.conversion.plugins.recipe_input import RecipeInput +from calibre.ebooks.conversion.plugins.rtf_input import RTFInput +from calibre.ebooks.conversion.plugins.tcr_input import TCRInput +from calibre.ebooks.conversion.plugins.txt_input import TXTInput +from calibre.ebooks.conversion.plugins.lrf_input import LRFInput +from calibre.ebooks.conversion.plugins.chm_input import CHMInput +from calibre.ebooks.conversion.plugins.snb_input import SNBInput from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.fb2.output import FB2Output diff --git a/src/calibre/ebooks/comic/input.py b/src/calibre/ebooks/comic/input.py index 9fcfc559aa..221bece092 100755 --- a/src/calibre/ebooks/comic/input.py +++ b/src/calibre/ebooks/comic/input.py @@ -7,11 +7,10 @@ __docformat__ = 'restructuredtext en' Based on ideas from comiclrf created by FangornUK. ''' -import os, shutil, traceback, textwrap, time, codecs +import os, traceback, time from Queue import Empty -from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre import extract, CurrentDir, prints, walk +from calibre import extract, prints, walk from calibre.constants import filesystem_encoding from calibre.ptempfile import PersistentTemporaryDirectory from calibre.utils.ipc.server import Server @@ -273,245 +272,4 @@ def process_pages(pages, opts, update, tdir): return ans, failures -class ComicInput(InputFormatPlugin): - - name = 'Comic Input' - author = 'Kovid Goyal' - description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices' - file_types = set(['cbz', 'cbr', 'cbc']) - is_image_collection = True - core_usage = -1 - - options = set([ - OptionRecommendation(name='colors', recommended_value=256, - help=_('Number of colors for grayscale image conversion. Default: ' - '%default. Values of less than 256 may result in blurred text ' - 'on your device if you are creating your comics in EPUB format.')), - OptionRecommendation(name='dont_normalize', recommended_value=False, - help=_('Disable normalize (improve contrast) color range ' - 'for pictures. Default: False')), - OptionRecommendation(name='keep_aspect_ratio', recommended_value=False, - help=_('Maintain picture aspect ratio. Default is to fill the screen.')), - OptionRecommendation(name='dont_sharpen', recommended_value=False, - help=_('Disable sharpening.')), - OptionRecommendation(name='disable_trim', recommended_value=False, - help=_('Disable trimming of comic pages. For some comics, ' - 'trimming might remove content as well as borders.')), - OptionRecommendation(name='landscape', recommended_value=False, - help=_("Don't split landscape images into two portrait images")), - OptionRecommendation(name='wide', recommended_value=False, - help=_("Keep aspect ratio and scale image using screen height as " - "image width for viewing in landscape mode.")), - OptionRecommendation(name='right2left', recommended_value=False, - help=_('Used for right-to-left publications like manga. ' - 'Causes landscape pages to be split into portrait pages ' - 'from right to left.')), - OptionRecommendation(name='despeckle', recommended_value=False, - help=_('Enable Despeckle. Reduces speckle noise. ' - 'May greatly increase processing time.')), - OptionRecommendation(name='no_sort', recommended_value=False, - help=_("Don't sort the files found in the comic " - "alphabetically by name. Instead use the order they were " - "added to the comic.")), - OptionRecommendation(name='output_format', choices=['png', 'jpg'], - recommended_value='png', help=_('The format that images in the created ebook ' - 'are converted to. You can experiment to see which format gives ' - 'you optimal size and look on your device.')), - OptionRecommendation(name='no_process', recommended_value=False, - help=_("Apply no processing to the image")), - OptionRecommendation(name='dont_grayscale', recommended_value=False, - help=_('Do not convert the image to grayscale (black and white)')), - OptionRecommendation(name='comic_image_size', recommended_value=None, - help=_('Specify the image size as widthxheight pixels. Normally,' - ' an image size is automatically calculated from the output ' - 'profile, this option overrides it.')), - OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False, - help=_('When converting a CBC do not add links to each page to' - ' the TOC. Note this only applies if the TOC has more than one' - ' section')), - ]) - - recommendations = set([ - ('margin_left', 0, OptionRecommendation.HIGH), - ('margin_top', 0, OptionRecommendation.HIGH), - ('margin_right', 0, OptionRecommendation.HIGH), - ('margin_bottom', 0, OptionRecommendation.HIGH), - ('insert_blank_line', False, OptionRecommendation.HIGH), - ('remove_paragraph_spacing', False, OptionRecommendation.HIGH), - ('change_justification', 'left', OptionRecommendation.HIGH), - ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH), - ('chapter', None, OptionRecommendation.HIGH), - ('page_breaks_brefore', None, OptionRecommendation.HIGH), - ('use_auto_toc', False, OptionRecommendation.HIGH), - ('page_breaks_before', None, OptionRecommendation.HIGH), - ('disable_font_rescaling', True, OptionRecommendation.HIGH), - ('linearize_tables', False, OptionRecommendation.HIGH), - ]) - - def get_comics_from_collection(self, stream): - from calibre.libunzip import extract as zipextract - tdir = PersistentTemporaryDirectory('_comic_collection') - zipextract(stream, tdir) - comics = [] - with CurrentDir(tdir): - if not os.path.exists('comics.txt'): - raise ValueError(( - '%s is not a valid comic collection' - ' no comics.txt was found in the file') - %stream.name) - raw = open('comics.txt', 'rb').read() - if raw.startswith(codecs.BOM_UTF16_BE): - raw = raw.decode('utf-16-be')[1:] - elif raw.startswith(codecs.BOM_UTF16_LE): - raw = raw.decode('utf-16-le')[1:] - elif raw.startswith(codecs.BOM_UTF8): - raw = raw.decode('utf-8')[1:] - else: - raw = raw.decode('utf-8') - for line in raw.splitlines(): - line = line.strip() - if not line: - continue - fname, title = line.partition(':')[0], line.partition(':')[-1] - fname = fname.replace('#', '_') - fname = os.path.join(tdir, *fname.split('/')) - if not title: - title = os.path.basename(fname).rpartition('.')[0] - if os.access(fname, os.R_OK): - comics.append([title, fname]) - if not comics: - raise ValueError('%s has no comics'%stream.name) - return comics - - def get_pages(self, comic, tdir2): - tdir = extract_comic(comic) - new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort, - verbose=self.opts.verbose) - thumbnail = None - if not new_pages: - raise ValueError('Could not find any pages in the comic: %s' - %comic) - if self.opts.no_process: - n2 = [] - for page in new_pages: - n2.append(os.path.join(tdir2, os.path.basename(page))) - shutil.copyfile(page, n2[-1]) - new_pages = n2 - else: - new_pages, failures = process_pages(new_pages, self.opts, - self.report_progress, tdir2) - if failures: - self.log.warning('Could not process the following pages ' - '(run with --verbose to see why):') - for f in failures: - self.log.warning('\t', f) - if not new_pages: - raise ValueError('Could not find any valid pages in comic: %s' - % comic) - thumbnail = os.path.join(tdir2, - 'thumbnail.'+self.opts.output_format.lower()) - if not os.access(thumbnail, os.R_OK): - thumbnail = None - return new_pages - - def get_images(self): - return self._images - - def convert(self, stream, opts, file_ext, log, accelerators): - from calibre.ebooks.metadata import MetaInformation - from calibre.ebooks.metadata.opf2 import OPFCreator - from calibre.ebooks.metadata.toc import TOC - - self.opts, self.log= opts, log - if file_ext == 'cbc': - comics_ = self.get_comics_from_collection(stream) - else: - comics_ = [['Comic', os.path.abspath(stream.name)]] - stream.close() - comics = [] - for i, x in enumerate(comics_): - title, fname = x - cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.' - cdir = os.path.abspath(cdir) - if not os.path.exists(cdir): - os.makedirs(cdir) - pages = self.get_pages(fname, cdir) - if not pages: continue - wrappers = self.create_wrappers(pages) - comics.append((title, pages, wrappers)) - - if not comics: - raise ValueError('No comic pages found in %s'%stream.name) - - mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0], - [_('Unknown')]) - opf = OPFCreator(os.path.abspath('.'), mi) - entries = [] - - def href(x): - if len(comics) == 1: return os.path.basename(x) - return '/'.join(x.split(os.sep)[-2:]) - - for comic in comics: - pages, wrappers = comic[1:] - entries += [(w, None) for w in map(href, wrappers)] + \ - [(x, None) for x in map(href, pages)] - opf.create_manifest(entries) - spine = [] - for comic in comics: - spine.extend(map(href, comic[2])) - self._images = [] - for comic in comics: - self._images.extend(comic[1]) - opf.create_spine(spine) - toc = TOC() - if len(comics) == 1: - wrappers = comics[0][2] - for i, x in enumerate(wrappers): - toc.add_item(href(x), None, _('Page')+' %d'%(i+1), - play_order=i) - else: - po = 0 - for comic in comics: - po += 1 - wrappers = comic[2] - stoc = toc.add_item(href(wrappers[0]), - None, comic[0], play_order=po) - if not opts.dont_add_comic_pages_to_toc: - for i, x in enumerate(wrappers): - stoc.add_item(href(x), None, - _('Page')+' %d'%(i+1), play_order=po) - po += 1 - opf.set_toc(toc) - m, n = open('metadata.opf', 'wb'), open('toc.ncx', 'wb') - opf.render(m, n, 'toc.ncx') - return os.path.abspath('metadata.opf') - - def create_wrappers(self, pages): - from calibre.ebooks.oeb.base import XHTML_NS - wrappers = [] - WRAPPER = textwrap.dedent('''\ - - - Page #%d - - - -
- comic page #%d -
- - - ''') - dir = os.path.dirname(pages[0]) - for i, page in enumerate(pages): - wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1) - page = os.path.join(dir, 'page_%d.xhtml'%(i+1)) - open(page, 'wb').write(wrapper) - wrappers.append(page) - return wrappers diff --git a/src/calibre/ebooks/conversion/plugins/__init__.py b/src/calibre/ebooks/conversion/plugins/__init__.py new file mode 100644 index 0000000000..dd9615356c --- /dev/null +++ b/src/calibre/ebooks/conversion/plugins/__init__.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + + diff --git a/src/calibre/ebooks/azw4/input.py b/src/calibre/ebooks/conversion/plugins/azw4_input.py similarity index 84% rename from src/calibre/ebooks/azw4/input.py rename to src/calibre/ebooks/conversion/plugins/azw4_input.py index 1ac7657342..6d2b2a917e 100644 --- a/src/calibre/ebooks/azw4/input.py +++ b/src/calibre/ebooks/conversion/plugins/azw4_input.py @@ -7,8 +7,6 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.pdb.header import PdbHeaderReader -from calibre.ebooks.azw4.reader import Reader class AZW4Input(InputFormatPlugin): @@ -19,6 +17,9 @@ class AZW4Input(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): + from calibre.ebooks.pdb.header import PdbHeaderReader + from calibre.ebooks.azw4.reader import Reader + header = PdbHeaderReader(stream) reader = Reader(header, stream, log, options) opf = reader.extract_content(os.getcwd()) diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/conversion/plugins/chm_input.py similarity index 98% rename from src/calibre/ebooks/chm/input.py rename to src/calibre/ebooks/conversion/plugins/chm_input.py index f36685bd91..a674735f1d 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/conversion/plugins/chm_input.py @@ -3,9 +3,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ,' \ ' and Alex Bramley .' -import os, uuid - -from lxml import html +import os from calibre.customize.conversion import InputFormatPlugin from calibre.ptempfile import TemporaryDirectory @@ -77,7 +75,7 @@ class CHMInput(InputFormatPlugin): def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi): # use HTMLInput plugin to generate book - from calibre.ebooks.html.input import HTMLInput + from calibre.customize.builtins import HTMLInput opts.breadth_first = True htmlinput = HTMLInput(None) oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi) @@ -85,6 +83,8 @@ class CHMInput(InputFormatPlugin): def _create_oebbook(self, hhcpath, basedir, opts, log, mi): + import uuid + from lxml import html from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import DirContainer oeb = create_oebbook(log, None, opts, @@ -142,6 +142,7 @@ class CHMInput(InputFormatPlugin): return oeb def _create_html_root(self, hhcpath, log): + from lxml import html hhcdata = self._read_file(hhcpath) hhcroot = html.fromstring(hhcdata) chapters = self._process_nodes(hhcroot) diff --git a/src/calibre/ebooks/conversion/plugins/comic_input.py b/src/calibre/ebooks/conversion/plugins/comic_input.py new file mode 100644 index 0000000000..77ae7d8086 --- /dev/null +++ b/src/calibre/ebooks/conversion/plugins/comic_input.py @@ -0,0 +1,259 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Based on ideas from comiclrf created by FangornUK. +''' + +import shutil, textwrap, codecs, os + +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre import CurrentDir +from calibre.ptempfile import PersistentTemporaryDirectory + +class ComicInput(InputFormatPlugin): + + name = 'Comic Input' + author = 'Kovid Goyal' + description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices' + file_types = set(['cbz', 'cbr', 'cbc']) + is_image_collection = True + core_usage = -1 + + options = set([ + OptionRecommendation(name='colors', recommended_value=256, + help=_('Number of colors for grayscale image conversion. Default: ' + '%default. Values of less than 256 may result in blurred text ' + 'on your device if you are creating your comics in EPUB format.')), + OptionRecommendation(name='dont_normalize', recommended_value=False, + help=_('Disable normalize (improve contrast) color range ' + 'for pictures. Default: False')), + OptionRecommendation(name='keep_aspect_ratio', recommended_value=False, + help=_('Maintain picture aspect ratio. Default is to fill the screen.')), + OptionRecommendation(name='dont_sharpen', recommended_value=False, + help=_('Disable sharpening.')), + OptionRecommendation(name='disable_trim', recommended_value=False, + help=_('Disable trimming of comic pages. For some comics, ' + 'trimming might remove content as well as borders.')), + OptionRecommendation(name='landscape', recommended_value=False, + help=_("Don't split landscape images into two portrait images")), + OptionRecommendation(name='wide', recommended_value=False, + help=_("Keep aspect ratio and scale image using screen height as " + "image width for viewing in landscape mode.")), + OptionRecommendation(name='right2left', recommended_value=False, + help=_('Used for right-to-left publications like manga. ' + 'Causes landscape pages to be split into portrait pages ' + 'from right to left.')), + OptionRecommendation(name='despeckle', recommended_value=False, + help=_('Enable Despeckle. Reduces speckle noise. ' + 'May greatly increase processing time.')), + OptionRecommendation(name='no_sort', recommended_value=False, + help=_("Don't sort the files found in the comic " + "alphabetically by name. Instead use the order they were " + "added to the comic.")), + OptionRecommendation(name='output_format', choices=['png', 'jpg'], + recommended_value='png', help=_('The format that images in the created ebook ' + 'are converted to. You can experiment to see which format gives ' + 'you optimal size and look on your device.')), + OptionRecommendation(name='no_process', recommended_value=False, + help=_("Apply no processing to the image")), + OptionRecommendation(name='dont_grayscale', recommended_value=False, + help=_('Do not convert the image to grayscale (black and white)')), + OptionRecommendation(name='comic_image_size', recommended_value=None, + help=_('Specify the image size as widthxheight pixels. Normally,' + ' an image size is automatically calculated from the output ' + 'profile, this option overrides it.')), + OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False, + help=_('When converting a CBC do not add links to each page to' + ' the TOC. Note this only applies if the TOC has more than one' + ' section')), + ]) + + recommendations = set([ + ('margin_left', 0, OptionRecommendation.HIGH), + ('margin_top', 0, OptionRecommendation.HIGH), + ('margin_right', 0, OptionRecommendation.HIGH), + ('margin_bottom', 0, OptionRecommendation.HIGH), + ('insert_blank_line', False, OptionRecommendation.HIGH), + ('remove_paragraph_spacing', False, OptionRecommendation.HIGH), + ('change_justification', 'left', OptionRecommendation.HIGH), + ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH), + ('chapter', None, OptionRecommendation.HIGH), + ('page_breaks_brefore', None, OptionRecommendation.HIGH), + ('use_auto_toc', False, OptionRecommendation.HIGH), + ('page_breaks_before', None, OptionRecommendation.HIGH), + ('disable_font_rescaling', True, OptionRecommendation.HIGH), + ('linearize_tables', False, OptionRecommendation.HIGH), + ]) + + def get_comics_from_collection(self, stream): + from calibre.libunzip import extract as zipextract + tdir = PersistentTemporaryDirectory('_comic_collection') + zipextract(stream, tdir) + comics = [] + with CurrentDir(tdir): + if not os.path.exists('comics.txt'): + raise ValueError(( + '%s is not a valid comic collection' + ' no comics.txt was found in the file') + %stream.name) + raw = open('comics.txt', 'rb').read() + if raw.startswith(codecs.BOM_UTF16_BE): + raw = raw.decode('utf-16-be')[1:] + elif raw.startswith(codecs.BOM_UTF16_LE): + raw = raw.decode('utf-16-le')[1:] + elif raw.startswith(codecs.BOM_UTF8): + raw = raw.decode('utf-8')[1:] + else: + raw = raw.decode('utf-8') + for line in raw.splitlines(): + line = line.strip() + if not line: + continue + fname, title = line.partition(':')[0], line.partition(':')[-1] + fname = fname.replace('#', '_') + fname = os.path.join(tdir, *fname.split('/')) + if not title: + title = os.path.basename(fname).rpartition('.')[0] + if os.access(fname, os.R_OK): + comics.append([title, fname]) + if not comics: + raise ValueError('%s has no comics'%stream.name) + return comics + + def get_pages(self, comic, tdir2): + from calibre.ebooks.comic.input import (extract_comic, process_pages, + find_pages) + tdir = extract_comic(comic) + new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort, + verbose=self.opts.verbose) + thumbnail = None + if not new_pages: + raise ValueError('Could not find any pages in the comic: %s' + %comic) + if self.opts.no_process: + n2 = [] + for page in new_pages: + n2.append(os.path.join(tdir2, os.path.basename(page))) + shutil.copyfile(page, n2[-1]) + new_pages = n2 + else: + new_pages, failures = process_pages(new_pages, self.opts, + self.report_progress, tdir2) + if failures: + self.log.warning('Could not process the following pages ' + '(run with --verbose to see why):') + for f in failures: + self.log.warning('\t', f) + if not new_pages: + raise ValueError('Could not find any valid pages in comic: %s' + % comic) + thumbnail = os.path.join(tdir2, + 'thumbnail.'+self.opts.output_format.lower()) + if not os.access(thumbnail, os.R_OK): + thumbnail = None + return new_pages + + def get_images(self): + return self._images + + def convert(self, stream, opts, file_ext, log, accelerators): + from calibre.ebooks.metadata import MetaInformation + from calibre.ebooks.metadata.opf2 import OPFCreator + from calibre.ebooks.metadata.toc import TOC + + self.opts, self.log= opts, log + if file_ext == 'cbc': + comics_ = self.get_comics_from_collection(stream) + else: + comics_ = [['Comic', os.path.abspath(stream.name)]] + stream.close() + comics = [] + for i, x in enumerate(comics_): + title, fname = x + cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.' + cdir = os.path.abspath(cdir) + if not os.path.exists(cdir): + os.makedirs(cdir) + pages = self.get_pages(fname, cdir) + if not pages: continue + wrappers = self.create_wrappers(pages) + comics.append((title, pages, wrappers)) + + if not comics: + raise ValueError('No comic pages found in %s'%stream.name) + + mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0], + [_('Unknown')]) + opf = OPFCreator(os.path.abspath('.'), mi) + entries = [] + + def href(x): + if len(comics) == 1: return os.path.basename(x) + return '/'.join(x.split(os.sep)[-2:]) + + for comic in comics: + pages, wrappers = comic[1:] + entries += [(w, None) for w in map(href, wrappers)] + \ + [(x, None) for x in map(href, pages)] + opf.create_manifest(entries) + spine = [] + for comic in comics: + spine.extend(map(href, comic[2])) + self._images = [] + for comic in comics: + self._images.extend(comic[1]) + opf.create_spine(spine) + toc = TOC() + if len(comics) == 1: + wrappers = comics[0][2] + for i, x in enumerate(wrappers): + toc.add_item(href(x), None, _('Page')+' %d'%(i+1), + play_order=i) + else: + po = 0 + for comic in comics: + po += 1 + wrappers = comic[2] + stoc = toc.add_item(href(wrappers[0]), + None, comic[0], play_order=po) + if not opts.dont_add_comic_pages_to_toc: + for i, x in enumerate(wrappers): + stoc.add_item(href(x), None, + _('Page')+' %d'%(i+1), play_order=po) + po += 1 + opf.set_toc(toc) + m, n = open('metadata.opf', 'wb'), open('toc.ncx', 'wb') + opf.render(m, n, 'toc.ncx') + return os.path.abspath('metadata.opf') + + def create_wrappers(self, pages): + from calibre.ebooks.oeb.base import XHTML_NS + wrappers = [] + WRAPPER = textwrap.dedent('''\ + + + Page #%d + + + +
+ comic page #%d +
+ + + ''') + dir = os.path.dirname(pages[0]) + for i, page in enumerate(pages): + wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1) + page = os.path.join(dir, 'page_%d.xhtml'%(i+1)) + open(page, 'wb').write(wrapper) + wrappers.append(page) + return wrappers + diff --git a/src/calibre/ebooks/djvu/input.py b/src/calibre/ebooks/conversion/plugins/djvu_input.py similarity index 98% rename from src/calibre/ebooks/djvu/input.py rename to src/calibre/ebooks/conversion/plugins/djvu_input.py index 70dbf97f5d..936ef1a702 100644 --- a/src/calibre/ebooks/djvu/input.py +++ b/src/calibre/ebooks/conversion/plugins/djvu_input.py @@ -12,7 +12,6 @@ from subprocess import Popen, PIPE from cStringIO import StringIO from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.txt.processor import convert_basic class DJVUInput(InputFormatPlugin): @@ -28,6 +27,8 @@ class DJVUInput(InputFormatPlugin): ]) def convert(self, stream, options, file_ext, log, accelerators): + from calibre.ebooks.txt.processor import convert_basic + stdout = StringIO() ppdjvu = True # using djvutxt is MUCH faster, should make it an option diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/conversion/plugins/epub_input.py similarity index 98% rename from src/calibre/ebooks/epub/input.py rename to src/calibre/ebooks/conversion/plugins/epub_input.py index c2cfedd7d4..47356dbd1f 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/conversion/plugins/epub_input.py @@ -3,11 +3,9 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os, uuid +import os from itertools import cycle -from lxml import etree - from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation class EPUBInput(InputFormatPlugin): @@ -30,6 +28,8 @@ class EPUBInput(InputFormatPlugin): f.write(raw[1024:]) def process_encryption(self, encfile, opf, log): + from lxml import etree + import uuid key = None for item in opf.identifier_iter(): scheme = None @@ -65,6 +65,7 @@ class EPUBInput(InputFormatPlugin): return False def rationalize_cover(self, opf, log): + from lxml import etree guide_cover, guide_elem = None, None for guide_elem in opf.iterguide(): if guide_elem.get('type', '').lower() == 'cover': @@ -110,6 +111,7 @@ class EPUBInput(InputFormatPlugin): renderer) def find_opf(self): + from lxml import etree def attr(n, attr): for k, v in n.attrib.items(): if k.endswith(attr): diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/conversion/plugins/fb2_input.py similarity index 99% rename from src/calibre/ebooks/fb2/input.py rename to src/calibre/ebooks/conversion/plugins/fb2_input.py index 147e940eb4..747f8f19d8 100644 --- a/src/calibre/ebooks/fb2/input.py +++ b/src/calibre/ebooks/conversion/plugins/fb2_input.py @@ -6,7 +6,6 @@ Convert .fb2 files to .lrf """ import os, re from base64 import b64decode -from lxml import etree from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre import guess_type @@ -38,6 +37,7 @@ class FB2Input(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): + from lxml import etree from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER diff --git a/src/calibre/ebooks/conversion/plugins/html_input.py b/src/calibre/ebooks/conversion/plugins/html_input.py new file mode 100644 index 0000000000..cfd2ebf8cf --- /dev/null +++ b/src/calibre/ebooks/conversion/plugins/html_input.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re, tempfile, os +from functools import partial +from itertools import izip +from urllib import quote + +from calibre.constants import islinux, isbsd +from calibre.customize.conversion import (InputFormatPlugin, + OptionRecommendation) +from calibre.utils.localization import get_lang +from calibre.utils.filenames import ascii_filename + + +class HTMLInput(InputFormatPlugin): + + name = 'HTML Input' + author = 'Kovid Goyal' + description = 'Convert HTML and OPF files to an OEB' + file_types = set(['opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml']) + + options = set([ + OptionRecommendation(name='breadth_first', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Traverse links in HTML files breadth first. Normally, ' + 'they are traversed depth first.' + ) + ), + + OptionRecommendation(name='max_levels', + recommended_value=5, level=OptionRecommendation.LOW, + help=_('Maximum levels of recursion when following links in ' + 'HTML files. Must be non-negative. 0 implies that no ' + 'links in the root HTML file are followed. Default is ' + '%default.' + ) + ), + + OptionRecommendation(name='dont_package', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Normally this input plugin re-arranges all the input ' + 'files into a standard folder hierarchy. Only use this option ' + 'if you know what you are doing as it can result in various ' + 'nasty side effects in the rest of the conversion pipeline.' + ) + ), + + ]) + + def convert(self, stream, opts, file_ext, log, + accelerators): + self._is_case_sensitive = None + basedir = os.getcwd() + self.opts = opts + + fname = None + if hasattr(stream, 'name'): + basedir = os.path.dirname(stream.name) + fname = os.path.basename(stream.name) + + if file_ext != 'opf': + if opts.dont_package: + raise ValueError('The --dont-package option is not supported for an HTML input file') + from calibre.ebooks.metadata.html import get_metadata + mi = get_metadata(stream) + if fname: + from calibre.ebooks.metadata.meta import metadata_from_filename + fmi = metadata_from_filename(fname) + fmi.smart_update(mi) + mi = fmi + oeb = self.create_oebbook(stream.name, basedir, opts, log, mi) + return oeb + + from calibre.ebooks.conversion.plumber import create_oebbook + return create_oebbook(log, stream.name, opts, + encoding=opts.input_encoding) + + def is_case_sensitive(self, path): + if getattr(self, '_is_case_sensitive', None) is not None: + return self._is_case_sensitive + if not path or not os.path.exists(path): + return islinux or isbsd + self._is_case_sensitive = not (os.path.exists(path.lower()) \ + and os.path.exists(path.upper())) + return self._is_case_sensitive + + def create_oebbook(self, htmlpath, basedir, opts, log, mi): + import uuid + from calibre.ebooks.conversion.plumber import create_oebbook + from calibre.ebooks.oeb.base import (DirContainer, + rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, + xpath) + from calibre import guess_type + from calibre.ebooks.oeb.transforms.metadata import \ + meta_info_to_oeb_metadata + from calibre.ebooks.html.input import get_filelist + import cssutils, logging + cssutils.log.setLevel(logging.WARN) + self.OEB_STYLES = OEB_STYLES + oeb = create_oebbook(log, None, opts, self, + encoding=opts.input_encoding, populate=False) + self.oeb = oeb + + metadata = oeb.metadata + meta_info_to_oeb_metadata(mi, metadata, log) + if not metadata.language: + oeb.logger.warn(u'Language not specified') + metadata.add('language', get_lang().replace('_', '-')) + if not metadata.creator: + oeb.logger.warn('Creator not specified') + metadata.add('creator', self.oeb.translate(__('Unknown'))) + if not metadata.title: + oeb.logger.warn('Title not specified') + metadata.add('title', self.oeb.translate(__('Unknown'))) + bookid = str(uuid.uuid4()) + metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') + for ident in metadata.identifier: + if 'id' in ident.attrib: + self.oeb.uid = metadata.identifier[0] + break + + filelist = get_filelist(htmlpath, basedir, opts, log) + filelist = [f for f in filelist if not f.is_binary] + htmlfile_map = {} + for f in filelist: + path = f.path + oeb.container = DirContainer(os.path.dirname(path), log, + ignore_opf=True) + bname = os.path.basename(path) + id, href = oeb.manifest.generate(id='html', + href=ascii_filename(bname)) + htmlfile_map[path] = href + item = oeb.manifest.add(id, href, 'text/html') + item.html_input_href = bname + oeb.spine.add(item, True) + + self.added_resources = {} + self.log = log + self.log('Normalizing filename cases') + for path, href in htmlfile_map.items(): + if not self.is_case_sensitive(path): + path = path.lower() + self.added_resources[path] = href + self.urlnormalize, self.DirContainer = urlnormalize, DirContainer + self.urldefrag = urldefrag + self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME + + self.log('Rewriting HTML links') + for f in filelist: + path = f.path + dpath = os.path.dirname(path) + oeb.container = DirContainer(dpath, log, ignore_opf=True) + item = oeb.manifest.hrefs[htmlfile_map[path]] + rewrite_links(item.data, partial(self.resource_adder, base=dpath)) + + for item in oeb.manifest.values(): + if item.media_type in self.OEB_STYLES: + dpath = None + for path, href in self.added_resources.items(): + if href == item.href: + dpath = os.path.dirname(path) + break + cssutils.replaceUrls(item.data, + partial(self.resource_adder, base=dpath)) + + toc = self.oeb.toc + self.oeb.auto_generated_toc = True + titles = [] + headers = [] + for item in self.oeb.spine: + if not item.linear: continue + html = item.data + title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) + title = re.sub(r'\s+', ' ', title.strip()) + if title: + titles.append(title) + headers.append('(unlabled)') + for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): + expr = '/h:html/h:body//h:%s[position()=1]/text()' + header = ''.join(xpath(html, expr % tag)) + header = re.sub(r'\s+', ' ', header.strip()) + if header: + headers[-1] = header + break + use = titles + if len(titles) > len(set(titles)): + use = headers + for title, item in izip(use, self.oeb.spine): + if not item.linear: continue + toc.add(title, item.href) + + oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True) + return oeb + + def link_to_local_path(self, link_, base=None): + from calibre.ebooks.html.input import Link + if not isinstance(link_, unicode): + try: + link_ = link_.decode('utf-8', 'error') + except: + self.log.warn('Failed to decode link %r. Ignoring'%link_) + return None, None + try: + l = Link(link_, base if base else os.getcwdu()) + except: + self.log.exception('Failed to process link: %r'%link_) + return None, None + if l.path is None: + # Not a local resource + return None, None + link = l.path.replace('/', os.sep).strip() + frag = l.fragment + if not link: + return None, None + return link, frag + + def resource_adder(self, link_, base=None): + link, frag = self.link_to_local_path(link_, base=base) + if link is None: + return link_ + try: + if base and not os.path.isabs(link): + link = os.path.join(base, link) + link = os.path.abspath(link) + except: + return link_ + if not os.access(link, os.R_OK): + return link_ + if os.path.isdir(link): + self.log.warn(link_, 'is a link to a directory. Ignoring.') + return link_ + if not self.is_case_sensitive(tempfile.gettempdir()): + link = link.lower() + if link not in self.added_resources: + bhref = os.path.basename(link) + id, href = self.oeb.manifest.generate(id='added', + href=bhref) + guessed = self.guess_type(href)[0] + media_type = guessed or self.BINARY_MIME + if media_type == 'text/plain': + self.log.warn('Ignoring link to text file %r'%link_) + return None + + self.oeb.log.debug('Added', link) + self.oeb.container = self.DirContainer(os.path.dirname(link), + self.oeb.log, ignore_opf=True) + # Load into memory + item = self.oeb.manifest.add(id, href, media_type) + # bhref refers to an already existing file. The read() method of + # DirContainer will call unquote on it before trying to read the + # file, therefore we quote it here. + if isinstance(bhref, unicode): + bhref = bhref.encode('utf-8') + item.html_input_href = quote(bhref).decode('utf-8') + if guessed in self.OEB_STYLES: + item.override_css_fetch = partial( + self.css_import_handler, os.path.dirname(link)) + item.data + self.added_resources[link] = href + + nlink = self.added_resources[link] + if frag: + nlink = '#'.join((nlink, frag)) + return nlink + + def css_import_handler(self, base, href): + link, frag = self.link_to_local_path(href, base=base) + if link is None or not os.access(link, os.R_OK) or os.path.isdir(link): + return (None, None) + try: + raw = open(link, 'rb').read().decode('utf-8', 'replace') + raw = self.oeb.css_preprocessor(raw, add_namespace=True) + except: + self.log.exception('Failed to read CSS file: %r'%link) + return (None, None) + return (None, raw) diff --git a/src/calibre/ebooks/htmlz/input.py b/src/calibre/ebooks/conversion/plugins/htmlz_input.py similarity index 96% rename from src/calibre/ebooks/htmlz/input.py rename to src/calibre/ebooks/conversion/plugins/htmlz_input.py index f0f45f72fe..e9fbb1d7c2 100644 --- a/src/calibre/ebooks/htmlz/input.py +++ b/src/calibre/ebooks/conversion/plugins/htmlz_input.py @@ -10,9 +10,6 @@ import os from calibre import guess_type from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.chardet import xml_to_unicode -from calibre.ebooks.metadata.opf2 import OPF -from calibre.utils.zipfile import ZipFile class HTMLZInput(InputFormatPlugin): @@ -23,6 +20,10 @@ class HTMLZInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): + from calibre.ebooks.chardet import xml_to_unicode + from calibre.ebooks.metadata.opf2 import OPF + from calibre.utils.zipfile import ZipFile + self.log = log html = u'' top_levels = [] diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/conversion/plugins/lit_input.py similarity index 100% rename from src/calibre/ebooks/lit/input.py rename to src/calibre/ebooks/conversion/plugins/lit_input.py diff --git a/src/calibre/ebooks/conversion/plugins/lrf_input.py b/src/calibre/ebooks/conversion/plugins/lrf_input.py new file mode 100644 index 0000000000..63af39e1e0 --- /dev/null +++ b/src/calibre/ebooks/conversion/plugins/lrf_input.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, sys +from calibre.customize.conversion import InputFormatPlugin + +class LRFInput(InputFormatPlugin): + + name = 'LRF Input' + author = 'Kovid Goyal' + description = 'Convert LRF files to HTML' + file_types = set(['lrf']) + + def convert(self, stream, options, file_ext, log, + accelerators): + from lxml import etree + from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock, + Canvas, ImageBlock, RuledLine) + self.log = log + self.log('Generating XML') + from calibre.ebooks.lrf.lrfparser import LRFDocument + d = LRFDocument(stream) + d.parse() + xml = d.to_xml(write_files=True) + if options.verbose > 2: + open('lrs.xml', 'wb').write(xml.encode('utf-8')) + parser = etree.XMLParser(no_network=True, huge_tree=True) + try: + doc = etree.fromstring(xml, parser=parser) + except: + self.log.warn('Failed to parse XML. Trying to recover') + parser = etree.XMLParser(no_network=True, huge_tree=True, + recover=True) + doc = etree.fromstring(xml, parser=parser) + + + char_button_map = {} + for x in doc.xpath('//CharButton[@refobj]'): + ro = x.get('refobj') + jump_button = doc.xpath('//*[@objid="%s"]'%ro) + if jump_button: + jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]') + if jump_to: + char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'), + jump_to[0].get('refobj')) + plot_map = {} + for x in doc.xpath('//Plot[@refobj]'): + ro = x.get('refobj') + image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro) + if image: + imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'% + image[0].get('refstream')) + if imgstr: + plot_map[ro] = imgstr[0].get('file') + + self.log('Converting XML to HTML...') + styledoc = etree.fromstring(P('templates/lrf.xsl', data=True)) + media_type = MediaType() + styles = Styles() + text_block = TextBlock(styles, char_button_map, plot_map, log) + canvas = Canvas(doc, styles, text_block, log) + image_block = ImageBlock(canvas) + ruled_line = RuledLine() + extensions = { + ('calibre', 'media-type') : media_type, + ('calibre', 'text-block') : text_block, + ('calibre', 'ruled-line') : ruled_line, + ('calibre', 'styles') : styles, + ('calibre', 'canvas') : canvas, + ('calibre', 'image-block'): image_block, + } + transform = etree.XSLT(styledoc, extensions=extensions) + try: + result = transform(doc) + except RuntimeError: + sys.setrecursionlimit(5000) + result = transform(doc) + + with open('content.opf', 'wb') as f: + f.write(result) + styles.write() + return os.path.abspath('content.opf') diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/conversion/plugins/mobi_input.py similarity index 100% rename from src/calibre/ebooks/mobi/input.py rename to src/calibre/ebooks/conversion/plugins/mobi_input.py diff --git a/src/calibre/ebooks/conversion/plugins/odt_input.py b/src/calibre/ebooks/conversion/plugins/odt_input.py new file mode 100644 index 0000000000..5e92ea5163 --- /dev/null +++ b/src/calibre/ebooks/conversion/plugins/odt_input.py @@ -0,0 +1,25 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Convert an ODT file into a Open Ebook +''' + +from calibre.customize.conversion import InputFormatPlugin + +class ODTInput(InputFormatPlugin): + + name = 'ODT Input' + author = 'Kovid Goyal' + description = 'Convert ODT (OpenOffice) files to HTML' + file_types = set(['odt']) + + + def convert(self, stream, options, file_ext, log, + accelerators): + from calibre.ebooks.odt.input import Extract + return Extract()(stream, '.', log) + + diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/conversion/plugins/pdb_input.py similarity index 87% rename from src/calibre/ebooks/pdb/input.py rename to src/calibre/ebooks/conversion/plugins/pdb_input.py index cd861216af..69984ab268 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/conversion/plugins/pdb_input.py @@ -7,8 +7,6 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.pdb.header import PdbHeaderReader -from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader class PDBInput(InputFormatPlugin): @@ -19,6 +17,9 @@ class PDBInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): + from calibre.ebooks.pdb.header import PdbHeaderReader + from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader + header = PdbHeaderReader(stream) Reader = get_reader(header.ident) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/conversion/plugins/pdf_input.py similarity index 95% rename from src/calibre/ebooks/pdf/input.py rename to src/calibre/ebooks/conversion/plugins/pdf_input.py index 51f44ba502..0a3821c584 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/conversion/plugins/pdf_input.py @@ -7,8 +7,6 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.pdf.pdftohtml import pdftohtml -from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.constants import plugins pdfreflow, pdfreflow_err = plugins['pdfreflow'] @@ -43,6 +41,9 @@ class PDFInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): + from calibre.ebooks.metadata.opf2 import OPFCreator + from calibre.ebooks.pdf.pdftohtml import pdftohtml + log.debug('Converting file to html...') # The main html file will be named index.html self.opts, self.log = options, log diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/conversion/plugins/pml_input.py similarity index 96% rename from src/calibre/ebooks/pml/input.py rename to src/calibre/ebooks/conversion/plugins/pml_input.py index 4d59668b12..1351a5c492 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/conversion/plugins/pml_input.py @@ -11,9 +11,6 @@ import shutil from calibre.customize.conversion import InputFormatPlugin from calibre.ptempfile import TemporaryDirectory from calibre.utils.zipfile import ZipFile -from calibre.ebooks.pml.pmlconverter import PML_HTMLizer -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata.opf2 import OPFCreator class PMLInput(InputFormatPlugin): @@ -24,6 +21,8 @@ class PMLInput(InputFormatPlugin): file_types = set(['pml', 'pmlz']) def process_pml(self, pml_path, html_path, close_all=False): + from calibre.ebooks.pml.pmlconverter import PML_HTMLizer + pclose = False hclose = False @@ -85,6 +84,9 @@ class PMLInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): + from calibre.ebooks.metadata.toc import TOC + from calibre.ebooks.metadata.opf2 import OPFCreator + self.options = options self.log = log pages, images = [], [] diff --git a/src/calibre/ebooks/rb/input.py b/src/calibre/ebooks/conversion/plugins/rb_input.py similarity index 91% rename from src/calibre/ebooks/rb/input.py rename to src/calibre/ebooks/conversion/plugins/rb_input.py index 8b05c1d42e..6a6ca3205a 100644 --- a/src/calibre/ebooks/rb/input.py +++ b/src/calibre/ebooks/conversion/plugins/rb_input.py @@ -6,7 +6,6 @@ __docformat__ = 'restructuredtext en' import os -from calibre.ebooks.rb.reader import Reader from calibre.customize.conversion import InputFormatPlugin class RBInput(InputFormatPlugin): @@ -18,6 +17,8 @@ class RBInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): + from calibre.ebooks.rb.reader import Reader + reader = Reader(stream, log, options.input_encoding) opf = reader.extract_content(os.getcwd()) diff --git a/src/calibre/web/feeds/input.py b/src/calibre/ebooks/conversion/plugins/recipe_input.py similarity index 100% rename from src/calibre/web/feeds/input.py rename to src/calibre/ebooks/conversion/plugins/recipe_input.py diff --git a/src/calibre/ebooks/conversion/plugins/rtf_input.py b/src/calibre/ebooks/conversion/plugins/rtf_input.py new file mode 100644 index 0000000000..91c285c10c --- /dev/null +++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py @@ -0,0 +1,298 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal ' + +import os, glob, re, textwrap + +from calibre.customize.conversion import InputFormatPlugin + +border_style_map = { + 'single' : 'solid', + 'double-thickness-border' : 'double', + 'shadowed-border': 'outset', + 'double-border': 'double', + 'dotted-border': 'dotted', + 'dashed': 'dashed', + 'hairline': 'solid', + 'inset': 'inset', + 'dash-small': 'dashed', + 'dot-dash': 'dotted', + 'dot-dot-dash': 'dotted', + 'outset': 'outset', + 'tripple': 'double', + 'triple': 'double', + 'thick-thin-small': 'solid', + 'thin-thick-small': 'solid', + 'thin-thick-thin-small': 'solid', + 'thick-thin-medium': 'solid', + 'thin-thick-medium': 'solid', + 'thin-thick-thin-medium': 'solid', + 'thick-thin-large': 'solid', + 'thin-thick-thin-large': 'solid', + 'wavy': 'ridge', + 'double-wavy': 'ridge', + 'striped': 'ridge', + 'emboss': 'inset', + 'engrave': 'inset', + 'frame': 'ridge', +} + + +class RTFInput(InputFormatPlugin): + + name = 'RTF Input' + author = 'Kovid Goyal' + description = 'Convert RTF files to HTML' + file_types = set(['rtf']) + + def generate_xml(self, stream): + from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf + ofile = 'dataxml.xml' + run_lev, debug_dir, indent_out = 1, None, 0 + if getattr(self.opts, 'debug_pipeline', None) is not None: + try: + os.mkdir('rtfdebug') + debug_dir = 'rtfdebug' + run_lev = 4 + indent_out = 1 + self.log('Running RTFParser in debug mode') + except: + self.log.warn('Impossible to run RTFParser in debug mode') + parser = ParseRtf( + in_file = stream, + out_file = ofile, + # Convert symbol fonts to unicode equivalents. Default + # is 1 + convert_symbol = 1, + + # Convert Zapf fonts to unicode equivalents. Default + # is 1. + convert_zapf = 1, + + # Convert Wingding fonts to unicode equivalents. + # Default is 1. + convert_wingdings = 1, + + # Convert RTF caps to real caps. + # Default is 1. + convert_caps = 1, + + # Indent resulting XML. + # Default is 0 (no indent). + indent = indent_out, + + # Form lists from RTF. Default is 1. + form_lists = 1, + + # Convert headings to sections. Default is 0. + headings_to_sections = 1, + + # Group paragraphs with the same style name. Default is 1. + group_styles = 1, + + # Group borders. Default is 1. + group_borders = 1, + + # Write or do not write paragraphs. Default is 0. + empty_paragraphs = 1, + + #debug + deb_dir = debug_dir, + run_level = run_lev, + ) + parser.parse_rtf() + with open(ofile, 'rb') as f: + return f.read() + + def extract_images(self, picts): + import imghdr + self.log('Extracting images...') + + with open(picts, 'rb') as f: + raw = f.read() + picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw)) + hex = re.compile(r'[^a-fA-F0-9]') + encs = [hex.sub('', pict) for pict in picts] + + count = 0 + imap = {} + for enc in encs: + if len(enc) % 2 == 1: + enc = enc[:-1] + data = enc.decode('hex') + fmt = imghdr.what(None, data) + if fmt is None: + fmt = 'wmf' + count += 1 + name = '%04d.%s' % (count, fmt) + with open(name, 'wb') as f: + f.write(data) + imap[count] = name + # with open(name+'.hex', 'wb') as f: + # f.write(enc) + return self.convert_images(imap) + + def convert_images(self, imap): + self.default_img = None + for count, val in imap.iteritems(): + try: + imap[count] = self.convert_image(val) + except: + self.log.exception('Failed to convert', val) + return imap + + def convert_image(self, name): + if not name.endswith('.wmf'): + return name + try: + return self.rasterize_wmf(name) + except: + self.log.exception('Failed to convert WMF image %r'%name) + return self.replace_wmf(name) + + def replace_wmf(self, name): + from calibre.ebooks import calibre_cover + if self.default_img is None: + self.default_img = calibre_cover('Conversion of WMF images is not supported', + 'Use Microsoft Word or OpenOffice to save this RTF file' + ' as HTML and convert that in calibre.', title_size=36, + author_size=20) + name = name.replace('.wmf', '.jpg') + with open(name, 'wb') as f: + f.write(self.default_img) + return name + + def rasterize_wmf(self, name): + from calibre.utils.wmf.parse import wmf_unwrap + with open(name, 'rb') as f: + data = f.read() + data = wmf_unwrap(data) + name = name.replace('.wmf', '.png') + with open(name, 'wb') as f: + f.write(data) + return name + + + def write_inline_css(self, ic, border_styles): + font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in + enumerate(ic.font_sizes)] + color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in + enumerate(ic.colors)] + css = textwrap.dedent(''' + span.none { + text-decoration: none; font-weight: normal; + font-style: normal; font-variant: normal + } + + span.italics { font-style: italic } + + span.bold { font-weight: bold } + + span.small-caps { font-variant: small-caps } + + span.underlined { text-decoration: underline } + + span.strike-through { text-decoration: line-through } + + ''') + css += '\n'+'\n'.join(font_size_classes) + css += '\n' +'\n'.join(color_classes) + + for cls, val in border_styles.iteritems(): + css += '\n\n.%s {\n%s\n}'%(cls, val) + + with open('styles.css', 'ab') as f: + f.write(css) + + def convert_borders(self, doc): + border_styles = [] + style_map = {} + for elem in doc.xpath(r'//*[local-name()="cell"]'): + style = ['border-style: hidden', 'border-width: 1px', + 'border-color: black'] + for x in ('bottom', 'top', 'left', 'right'): + bs = elem.get('border-cell-%s-style'%x, None) + if bs: + cbs = border_style_map.get(bs, 'solid') + style.append('border-%s-style: %s'%(x, cbs)) + bw = elem.get('border-cell-%s-line-width'%x, None) + if bw: + style.append('border-%s-width: %spt'%(x, bw)) + bc = elem.get('border-cell-%s-color'%x, None) + if bc: + style.append('border-%s-color: %s'%(x, bc)) + style = ';\n'.join(style) + if style not in border_styles: + border_styles.append(style) + idx = border_styles.index(style) + cls = 'border_style%d'%idx + style_map[cls] = style + elem.set('class', cls) + return style_map + + def convert(self, stream, options, file_ext, log, + accelerators): + from lxml import etree + from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.metadata.opf2 import OPFCreator + from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException + from calibre.ebooks.rtf.input import InlineClass + self.opts = options + self.log = log + self.log('Converting RTF to XML...') + try: + xml = self.generate_xml(stream.name) + except RtfInvalidCodeException as e: + raise ValueError(_('This RTF file has a feature calibre does not ' + 'support. Convert it to HTML first and then try it.\n%s')%e) + + d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) + if d: + imap = {} + try: + imap = self.extract_images(d[0]) + except: + self.log.exception('Failed to extract images...') + + self.log('Parsing XML...') + parser = etree.XMLParser(recover=True, no_network=True) + doc = etree.fromstring(xml, parser=parser) + border_styles = self.convert_borders(doc) + for pict in doc.xpath('//rtf:pict[@num]', + namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}): + num = int(pict.get('num')) + name = imap.get(num, None) + if name is not None: + pict.set('num', name) + + self.log('Converting XML to HTML...') + inline_class = InlineClass(self.log) + styledoc = etree.fromstring(P('templates/rtf.xsl', data=True)) + extensions = { ('calibre', 'inline-class') : inline_class } + transform = etree.XSLT(styledoc, extensions=extensions) + result = transform(doc) + html = 'index.xhtml' + with open(html, 'wb') as f: + res = transform.tostring(result) + # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] + #clean multiple \n + res = re.sub('\n+', '\n', res) + # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines + # res = re.sub('\s*', '', res) + # res = re.sub('(?<=\n)\n{2}', + # u'

\u00a0

\n'.encode('utf-8'), res) + f.write(res) + self.write_inline_css(inline_class, border_styles) + stream.seek(0) + mi = get_metadata(stream, 'rtf') + if not mi.title: + mi.title = _('Unknown') + if not mi.authors: + mi.authors = [_('Unknown')] + opf = OPFCreator(os.getcwd(), mi) + opf.create_manifest([('index.xhtml', None)]) + opf.create_spine(['index.xhtml']) + opf.render(open('metadata.opf', 'wb')) + return os.path.abspath('metadata.opf') + + diff --git a/src/calibre/ebooks/snb/input.py b/src/calibre/ebooks/conversion/plugins/snb_input.py similarity index 97% rename from src/calibre/ebooks/snb/input.py rename to src/calibre/ebooks/conversion/plugins/snb_input.py index 13b1ca45f9..ae3ab0033c 100755 --- a/src/calibre/ebooks/snb/input.py +++ b/src/calibre/ebooks/conversion/plugins/snb_input.py @@ -4,13 +4,11 @@ __license__ = 'GPL 3' __copyright__ = '2010, Li Fanxi ' __docformat__ = 'restructuredtext en' -import os, uuid +import os from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.snb.snbfile import SNBFile from calibre.ptempfile import TemporaryDirectory from calibre.utils.filenames import ascii_filename -from lxml import etree HTML_TEMPLATE = u'%s\n%s\n' @@ -29,7 +27,12 @@ class SNBInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): + import uuid + from lxml import etree + from calibre.ebooks.oeb.base import DirContainer + from calibre.ebooks.snb.snbfile import SNBFile + log.debug("Parsing SNB file...") snbFile = SNBFile() try: diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/conversion/plugins/tcr_input.py similarity index 94% rename from src/calibre/ebooks/tcr/input.py rename to src/calibre/ebooks/conversion/plugins/tcr_input.py index 4d15fd0923..de4f3f5f40 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/conversion/plugins/tcr_input.py @@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en' from cStringIO import StringIO from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.compression.tcr import decompress class TCRInput(InputFormatPlugin): @@ -17,6 +16,8 @@ class TCRInput(InputFormatPlugin): file_types = set(['tcr']) def convert(self, stream, options, file_ext, log, accelerators): + from calibre.ebooks.compression.tcr import decompress + log.info('Decompressing text...') raw_txt = decompress(stream) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/conversion/plugins/txt_input.py similarity index 94% rename from src/calibre/ebooks/txt/input.py rename to src/calibre/ebooks/conversion/plugins/txt_input.py index 49c8a2129d..e916b30c29 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/conversion/plugins/txt_input.py @@ -8,14 +8,6 @@ import os from calibre import _ent_pat, walk, xml_entity_to_unicode from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator -from calibre.ebooks.chardet import detect -from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \ - separate_hard_scene_breaks -from calibre.utils.zipfile import ZipFile class TXTInput(InputFormatPlugin): @@ -61,6 +53,17 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): + from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator + from calibre.ebooks.chardet import detect + from calibre.utils.zipfile import ZipFile + from calibre.ebooks.txt.processor import (convert_basic, + convert_markdown, separate_paragraphs_single_line, + separate_paragraphs_print_formatted, preserve_spaces, + detect_paragraph_type, detect_formatting_type, + normalize_line_endings, convert_textile, remove_indents, + block_to_single_line, separate_hard_scene_breaks) + + self.log = log txt = '' log.debug('Reading text from file...') diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index d303dd66a5..6cacb34edc 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -11,19 +11,13 @@ __docformat__ = 'restructuredtext en' Input plugin for HTML or OPF ebooks. ''' -import os, re, sys, uuid, tempfile, errno as gerrno +import os, re, sys, errno as gerrno from urlparse import urlparse, urlunparse -from urllib import unquote, quote -from functools import partial -from itertools import izip +from urllib import unquote -from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.chardet import detect_xml_encoding -from calibre.customize.conversion import OptionRecommendation -from calibre.constants import islinux, isbsd, iswindows +from calibre.constants import iswindows from calibre import unicode_path, as_unicode -from calibre.utils.localization import get_lang -from calibre.utils.filenames import ascii_filename class Link(object): ''' @@ -241,262 +235,4 @@ def get_filelist(htmlfile, dir, opts, log): return filelist -class HTMLInput(InputFormatPlugin): - name = 'HTML Input' - author = 'Kovid Goyal' - description = 'Convert HTML and OPF files to an OEB' - file_types = set(['opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml']) - - options = set([ - OptionRecommendation(name='breadth_first', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Traverse links in HTML files breadth first. Normally, ' - 'they are traversed depth first.' - ) - ), - - OptionRecommendation(name='max_levels', - recommended_value=5, level=OptionRecommendation.LOW, - help=_('Maximum levels of recursion when following links in ' - 'HTML files. Must be non-negative. 0 implies that no ' - 'links in the root HTML file are followed. Default is ' - '%default.' - ) - ), - - OptionRecommendation(name='dont_package', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Normally this input plugin re-arranges all the input ' - 'files into a standard folder hierarchy. Only use this option ' - 'if you know what you are doing as it can result in various ' - 'nasty side effects in the rest of the conversion pipeline.' - ) - ), - - ]) - - def convert(self, stream, opts, file_ext, log, - accelerators): - self._is_case_sensitive = None - basedir = os.getcwd() - self.opts = opts - - fname = None - if hasattr(stream, 'name'): - basedir = os.path.dirname(stream.name) - fname = os.path.basename(stream.name) - - if file_ext != 'opf': - if opts.dont_package: - raise ValueError('The --dont-package option is not supported for an HTML input file') - from calibre.ebooks.metadata.html import get_metadata - mi = get_metadata(stream) - if fname: - from calibre.ebooks.metadata.meta import metadata_from_filename - fmi = metadata_from_filename(fname) - fmi.smart_update(mi) - mi = fmi - oeb = self.create_oebbook(stream.name, basedir, opts, log, mi) - return oeb - - from calibre.ebooks.conversion.plumber import create_oebbook - return create_oebbook(log, stream.name, opts, - encoding=opts.input_encoding) - - def is_case_sensitive(self, path): - if getattr(self, '_is_case_sensitive', None) is not None: - return self._is_case_sensitive - if not path or not os.path.exists(path): - return islinux or isbsd - self._is_case_sensitive = not (os.path.exists(path.lower()) \ - and os.path.exists(path.upper())) - return self._is_case_sensitive - - def create_oebbook(self, htmlpath, basedir, opts, log, mi): - from calibre.ebooks.conversion.plumber import create_oebbook - from calibre.ebooks.oeb.base import (DirContainer, - rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, - xpath) - from calibre import guess_type - from calibre.ebooks.oeb.transforms.metadata import \ - meta_info_to_oeb_metadata - import cssutils, logging - cssutils.log.setLevel(logging.WARN) - self.OEB_STYLES = OEB_STYLES - oeb = create_oebbook(log, None, opts, self, - encoding=opts.input_encoding, populate=False) - self.oeb = oeb - - metadata = oeb.metadata - meta_info_to_oeb_metadata(mi, metadata, log) - if not metadata.language: - oeb.logger.warn(u'Language not specified') - metadata.add('language', get_lang().replace('_', '-')) - if not metadata.creator: - oeb.logger.warn('Creator not specified') - metadata.add('creator', self.oeb.translate(__('Unknown'))) - if not metadata.title: - oeb.logger.warn('Title not specified') - metadata.add('title', self.oeb.translate(__('Unknown'))) - bookid = str(uuid.uuid4()) - metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') - for ident in metadata.identifier: - if 'id' in ident.attrib: - self.oeb.uid = metadata.identifier[0] - break - - filelist = get_filelist(htmlpath, basedir, opts, log) - filelist = [f for f in filelist if not f.is_binary] - htmlfile_map = {} - for f in filelist: - path = f.path - oeb.container = DirContainer(os.path.dirname(path), log, - ignore_opf=True) - bname = os.path.basename(path) - id, href = oeb.manifest.generate(id='html', - href=ascii_filename(bname)) - htmlfile_map[path] = href - item = oeb.manifest.add(id, href, 'text/html') - item.html_input_href = bname - oeb.spine.add(item, True) - - self.added_resources = {} - self.log = log - self.log('Normalizing filename cases') - for path, href in htmlfile_map.items(): - if not self.is_case_sensitive(path): - path = path.lower() - self.added_resources[path] = href - self.urlnormalize, self.DirContainer = urlnormalize, DirContainer - self.urldefrag = urldefrag - self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME - - self.log('Rewriting HTML links') - for f in filelist: - path = f.path - dpath = os.path.dirname(path) - oeb.container = DirContainer(dpath, log, ignore_opf=True) - item = oeb.manifest.hrefs[htmlfile_map[path]] - rewrite_links(item.data, partial(self.resource_adder, base=dpath)) - - for item in oeb.manifest.values(): - if item.media_type in self.OEB_STYLES: - dpath = None - for path, href in self.added_resources.items(): - if href == item.href: - dpath = os.path.dirname(path) - break - cssutils.replaceUrls(item.data, - partial(self.resource_adder, base=dpath)) - - toc = self.oeb.toc - self.oeb.auto_generated_toc = True - titles = [] - headers = [] - for item in self.oeb.spine: - if not item.linear: continue - html = item.data - title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) - title = re.sub(r'\s+', ' ', title.strip()) - if title: - titles.append(title) - headers.append('(unlabled)') - for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): - expr = '/h:html/h:body//h:%s[position()=1]/text()' - header = ''.join(xpath(html, expr % tag)) - header = re.sub(r'\s+', ' ', header.strip()) - if header: - headers[-1] = header - break - use = titles - if len(titles) > len(set(titles)): - use = headers - for title, item in izip(use, self.oeb.spine): - if not item.linear: continue - toc.add(title, item.href) - - oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True) - return oeb - - def link_to_local_path(self, link_, base=None): - if not isinstance(link_, unicode): - try: - link_ = link_.decode('utf-8', 'error') - except: - self.log.warn('Failed to decode link %r. Ignoring'%link_) - return None, None - try: - l = Link(link_, base if base else os.getcwdu()) - except: - self.log.exception('Failed to process link: %r'%link_) - return None, None - if l.path is None: - # Not a local resource - return None, None - link = l.path.replace('/', os.sep).strip() - frag = l.fragment - if not link: - return None, None - return link, frag - - def resource_adder(self, link_, base=None): - link, frag = self.link_to_local_path(link_, base=base) - if link is None: - return link_ - try: - if base and not os.path.isabs(link): - link = os.path.join(base, link) - link = os.path.abspath(link) - except: - return link_ - if not os.access(link, os.R_OK): - return link_ - if os.path.isdir(link): - self.log.warn(link_, 'is a link to a directory. Ignoring.') - return link_ - if not self.is_case_sensitive(tempfile.gettempdir()): - link = link.lower() - if link not in self.added_resources: - bhref = os.path.basename(link) - id, href = self.oeb.manifest.generate(id='added', - href=bhref) - guessed = self.guess_type(href)[0] - media_type = guessed or self.BINARY_MIME - if media_type == 'text/plain': - self.log.warn('Ignoring link to text file %r'%link_) - return None - - self.oeb.log.debug('Added', link) - self.oeb.container = self.DirContainer(os.path.dirname(link), - self.oeb.log, ignore_opf=True) - # Load into memory - item = self.oeb.manifest.add(id, href, media_type) - # bhref refers to an already existing file. The read() method of - # DirContainer will call unquote on it before trying to read the - # file, therefore we quote it here. - if isinstance(bhref, unicode): - bhref = bhref.encode('utf-8') - item.html_input_href = quote(bhref).decode('utf-8') - if guessed in self.OEB_STYLES: - item.override_css_fetch = partial( - self.css_import_handler, os.path.dirname(link)) - item.data - self.added_resources[link] = href - - nlink = self.added_resources[link] - if frag: - nlink = '#'.join((nlink, frag)) - return nlink - - def css_import_handler(self, base, href): - link, frag = self.link_to_local_path(href, base=base) - if link is None or not os.access(link, os.R_OK) or os.path.isdir(link): - return (None, None) - try: - raw = open(link, 'rb').read().decode('utf-8', 'replace') - raw = self.oeb.css_preprocessor(raw, add_namespace=True) - except: - self.log.exception('Failed to read CSS file: %r'%link) - return (None, None) - return (None, raw) diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py index 9777a8a998..e9bf42c6bd 100644 --- a/src/calibre/ebooks/lrf/input.py +++ b/src/calibre/ebooks/lrf/input.py @@ -6,12 +6,11 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os, textwrap, sys, operator +import textwrap, operator from copy import deepcopy, copy from lxml import etree -from calibre.customize.conversion import InputFormatPlugin from calibre import guess_type class Canvas(etree.XSLTExtension): @@ -406,76 +405,4 @@ class Styles(etree.XSLTExtension): -class LRFInput(InputFormatPlugin): - name = 'LRF Input' - author = 'Kovid Goyal' - description = 'Convert LRF files to HTML' - file_types = set(['lrf']) - - def convert(self, stream, options, file_ext, log, - accelerators): - self.log = log - self.log('Generating XML') - from calibre.ebooks.lrf.lrfparser import LRFDocument - d = LRFDocument(stream) - d.parse() - xml = d.to_xml(write_files=True) - if options.verbose > 2: - open('lrs.xml', 'wb').write(xml.encode('utf-8')) - parser = etree.XMLParser(no_network=True, huge_tree=True) - try: - doc = etree.fromstring(xml, parser=parser) - except: - self.log.warn('Failed to parse XML. Trying to recover') - parser = etree.XMLParser(no_network=True, huge_tree=True, - recover=True) - doc = etree.fromstring(xml, parser=parser) - - - char_button_map = {} - for x in doc.xpath('//CharButton[@refobj]'): - ro = x.get('refobj') - jump_button = doc.xpath('//*[@objid="%s"]'%ro) - if jump_button: - jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]') - if jump_to: - char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'), - jump_to[0].get('refobj')) - plot_map = {} - for x in doc.xpath('//Plot[@refobj]'): - ro = x.get('refobj') - image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro) - if image: - imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'% - image[0].get('refstream')) - if imgstr: - plot_map[ro] = imgstr[0].get('file') - - self.log('Converting XML to HTML...') - styledoc = etree.fromstring(P('templates/lrf.xsl', data=True)) - media_type = MediaType() - styles = Styles() - text_block = TextBlock(styles, char_button_map, plot_map, log) - canvas = Canvas(doc, styles, text_block, log) - image_block = ImageBlock(canvas) - ruled_line = RuledLine() - extensions = { - ('calibre', 'media-type') : media_type, - ('calibre', 'text-block') : text_block, - ('calibre', 'ruled-line') : ruled_line, - ('calibre', 'styles') : styles, - ('calibre', 'canvas') : canvas, - ('calibre', 'image-block'): image_block, - } - transform = etree.XSLT(styledoc, extensions=extensions) - try: - result = transform(doc) - except RuntimeError: - sys.setrecursionlimit(5000) - result = transform(doc) - - with open('content.opf', 'wb') as f: - f.write(result) - styles.write() - return os.path.abspath('content.opf') diff --git a/src/calibre/ebooks/odt/input.py b/src/calibre/ebooks/odt/input.py index 214a40c29b..430d95b31f 100644 --- a/src/calibre/ebooks/odt/input.py +++ b/src/calibre/ebooks/odt/input.py @@ -12,7 +12,6 @@ from lxml import etree from odf.odf2xhtml import ODF2XHTML from calibre import CurrentDir, walk -from calibre.customize.conversion import InputFormatPlugin class Extract(ODF2XHTML): @@ -178,16 +177,4 @@ class Extract(ODF2XHTML): return os.path.abspath('metadata.opf') -class ODTInput(InputFormatPlugin): - - name = 'ODT Input' - author = 'Kovid Goyal' - description = 'Convert ODT (OpenOffice) files to HTML' - file_types = set(['odt']) - - - def convert(self, stream, options, file_ext, log, - accelerators): - return Extract()(stream, '.', log) - diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 5858824434..8e1a5ac775 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -2,42 +2,9 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import os, glob, re, textwrap from lxml import etree -from calibre.customize.conversion import InputFormatPlugin - -border_style_map = { - 'single' : 'solid', - 'double-thickness-border' : 'double', - 'shadowed-border': 'outset', - 'double-border': 'double', - 'dotted-border': 'dotted', - 'dashed': 'dashed', - 'hairline': 'solid', - 'inset': 'inset', - 'dash-small': 'dashed', - 'dot-dash': 'dotted', - 'dot-dot-dash': 'dotted', - 'outset': 'outset', - 'tripple': 'double', - 'triple': 'double', - 'thick-thin-small': 'solid', - 'thin-thick-small': 'solid', - 'thin-thick-thin-small': 'solid', - 'thick-thin-medium': 'solid', - 'thin-thick-medium': 'solid', - 'thin-thick-thin-medium': 'solid', - 'thick-thin-large': 'solid', - 'thin-thick-thin-large': 'solid', - 'wavy': 'ridge', - 'double-wavy': 'ridge', - 'striped': 'ridge', - 'emboss': 'inset', - 'engrave': 'inset', - 'frame': 'ridge', -} class InlineClass(etree.XSLTExtension): @@ -71,261 +38,3 @@ class InlineClass(etree.XSLTExtension): output_parent.text = ' '.join(classes) -class RTFInput(InputFormatPlugin): - - name = 'RTF Input' - author = 'Kovid Goyal' - description = 'Convert RTF files to HTML' - file_types = set(['rtf']) - - def generate_xml(self, stream): - from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf - ofile = 'dataxml.xml' - run_lev, debug_dir, indent_out = 1, None, 0 - if getattr(self.opts, 'debug_pipeline', None) is not None: - try: - os.mkdir('rtfdebug') - debug_dir = 'rtfdebug' - run_lev = 4 - indent_out = 1 - self.log('Running RTFParser in debug mode') - except: - self.log.warn('Impossible to run RTFParser in debug mode') - parser = ParseRtf( - in_file = stream, - out_file = ofile, - # Convert symbol fonts to unicode equivalents. Default - # is 1 - convert_symbol = 1, - - # Convert Zapf fonts to unicode equivalents. Default - # is 1. - convert_zapf = 1, - - # Convert Wingding fonts to unicode equivalents. - # Default is 1. - convert_wingdings = 1, - - # Convert RTF caps to real caps. - # Default is 1. - convert_caps = 1, - - # Indent resulting XML. - # Default is 0 (no indent). - indent = indent_out, - - # Form lists from RTF. Default is 1. - form_lists = 1, - - # Convert headings to sections. Default is 0. - headings_to_sections = 1, - - # Group paragraphs with the same style name. Default is 1. - group_styles = 1, - - # Group borders. Default is 1. - group_borders = 1, - - # Write or do not write paragraphs. Default is 0. - empty_paragraphs = 1, - - #debug - deb_dir = debug_dir, - run_level = run_lev, - ) - parser.parse_rtf() - with open(ofile, 'rb') as f: - return f.read() - - def extract_images(self, picts): - import imghdr - self.log('Extracting images...') - - with open(picts, 'rb') as f: - raw = f.read() - picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw)) - hex = re.compile(r'[^a-fA-F0-9]') - encs = [hex.sub('', pict) for pict in picts] - - count = 0 - imap = {} - for enc in encs: - if len(enc) % 2 == 1: - enc = enc[:-1] - data = enc.decode('hex') - fmt = imghdr.what(None, data) - if fmt is None: - fmt = 'wmf' - count += 1 - name = '%04d.%s' % (count, fmt) - with open(name, 'wb') as f: - f.write(data) - imap[count] = name - # with open(name+'.hex', 'wb') as f: - # f.write(enc) - return self.convert_images(imap) - - def convert_images(self, imap): - self.default_img = None - for count, val in imap.iteritems(): - try: - imap[count] = self.convert_image(val) - except: - self.log.exception('Failed to convert', val) - return imap - - def convert_image(self, name): - if not name.endswith('.wmf'): - return name - try: - return self.rasterize_wmf(name) - except: - self.log.exception('Failed to convert WMF image %r'%name) - return self.replace_wmf(name) - - def replace_wmf(self, name): - from calibre.ebooks import calibre_cover - if self.default_img is None: - self.default_img = calibre_cover('Conversion of WMF images is not supported', - 'Use Microsoft Word or OpenOffice to save this RTF file' - ' as HTML and convert that in calibre.', title_size=36, - author_size=20) - name = name.replace('.wmf', '.jpg') - with open(name, 'wb') as f: - f.write(self.default_img) - return name - - def rasterize_wmf(self, name): - from calibre.utils.wmf.parse import wmf_unwrap - with open(name, 'rb') as f: - data = f.read() - data = wmf_unwrap(data) - name = name.replace('.wmf', '.png') - with open(name, 'wb') as f: - f.write(data) - return name - - - def write_inline_css(self, ic, border_styles): - font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in - enumerate(ic.font_sizes)] - color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in - enumerate(ic.colors)] - css = textwrap.dedent(''' - span.none { - text-decoration: none; font-weight: normal; - font-style: normal; font-variant: normal - } - - span.italics { font-style: italic } - - span.bold { font-weight: bold } - - span.small-caps { font-variant: small-caps } - - span.underlined { text-decoration: underline } - - span.strike-through { text-decoration: line-through } - - ''') - css += '\n'+'\n'.join(font_size_classes) - css += '\n' +'\n'.join(color_classes) - - for cls, val in border_styles.iteritems(): - css += '\n\n.%s {\n%s\n}'%(cls, val) - - with open('styles.css', 'ab') as f: - f.write(css) - - def convert_borders(self, doc): - border_styles = [] - style_map = {} - for elem in doc.xpath(r'//*[local-name()="cell"]'): - style = ['border-style: hidden', 'border-width: 1px', - 'border-color: black'] - for x in ('bottom', 'top', 'left', 'right'): - bs = elem.get('border-cell-%s-style'%x, None) - if bs: - cbs = border_style_map.get(bs, 'solid') - style.append('border-%s-style: %s'%(x, cbs)) - bw = elem.get('border-cell-%s-line-width'%x, None) - if bw: - style.append('border-%s-width: %spt'%(x, bw)) - bc = elem.get('border-cell-%s-color'%x, None) - if bc: - style.append('border-%s-color: %s'%(x, bc)) - style = ';\n'.join(style) - if style not in border_styles: - border_styles.append(style) - idx = border_styles.index(style) - cls = 'border_style%d'%idx - style_map[cls] = style - elem.set('class', cls) - return style_map - - def convert(self, stream, options, file_ext, log, - accelerators): - from calibre.ebooks.metadata.meta import get_metadata - from calibre.ebooks.metadata.opf2 import OPFCreator - from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException - self.opts = options - self.log = log - self.log('Converting RTF to XML...') - try: - xml = self.generate_xml(stream.name) - except RtfInvalidCodeException as e: - raise ValueError(_('This RTF file has a feature calibre does not ' - 'support. Convert it to HTML first and then try it.\n%s')%e) - - d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) - if d: - imap = {} - try: - imap = self.extract_images(d[0]) - except: - self.log.exception('Failed to extract images...') - - self.log('Parsing XML...') - parser = etree.XMLParser(recover=True, no_network=True) - doc = etree.fromstring(xml, parser=parser) - border_styles = self.convert_borders(doc) - for pict in doc.xpath('//rtf:pict[@num]', - namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}): - num = int(pict.get('num')) - name = imap.get(num, None) - if name is not None: - pict.set('num', name) - - self.log('Converting XML to HTML...') - inline_class = InlineClass(self.log) - styledoc = etree.fromstring(P('templates/rtf.xsl', data=True)) - extensions = { ('calibre', 'inline-class') : inline_class } - transform = etree.XSLT(styledoc, extensions=extensions) - result = transform(doc) - html = 'index.xhtml' - with open(html, 'wb') as f: - res = transform.tostring(result) - # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] - #clean multiple \n - res = re.sub('\n+', '\n', res) - # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines - # res = re.sub('\s*', '', res) - # res = re.sub('(?<=\n)\n{2}', - # u'

\u00a0

\n'.encode('utf-8'), res) - f.write(res) - self.write_inline_css(inline_class, border_styles) - stream.seek(0) - mi = get_metadata(stream, 'rtf') - if not mi.title: - mi.title = _('Unknown') - if not mi.authors: - mi.authors = [_('Unknown')] - opf = OPFCreator(os.getcwd(), mi) - opf.create_manifest([('index.xhtml', None)]) - opf.create_spine(['index.xhtml']) - opf.render(open('metadata.opf', 'wb')) - return os.path.abspath('metadata.opf') - -#ebook-convert "bad.rtf" test.epub -v -d "E:\Mes eBooks\Developpement\debug" -# os.makedirs("E:\\Mes eBooks\\Developpement\\rtfdebug") -# debug_dir = "E:\\Mes eBooks\\Developpement\\rtfdebug"