From 36fd295ca12540e73ee7bcde2b3e896a5da53478 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 20 Mar 2009 19:39:26 -0400 Subject: [PATCH 01/16] any2txt converter --- src/calibre/ebooks/htmlsymbols.py | 219 ++++++++++++++++++++++++++ src/calibre/ebooks/txt/__init__.py | 9 ++ src/calibre/ebooks/txt/from_any.py | 74 +++++++++ src/calibre/ebooks/txt/writer.py | 237 +++++++++++++++++++++++++++++ 4 files changed, 539 insertions(+) create mode 100644 src/calibre/ebooks/htmlsymbols.py create mode 100644 src/calibre/ebooks/txt/__init__.py create mode 100644 src/calibre/ebooks/txt/from_any.py create mode 100644 src/calibre/ebooks/txt/writer.py diff --git a/src/calibre/ebooks/htmlsymbols.py b/src/calibre/ebooks/htmlsymbols.py new file mode 100644 index 0000000000..9b50f20fcd --- /dev/null +++ b/src/calibre/ebooks/htmlsymbols.py @@ -0,0 +1,219 @@ +# -*- coding: utf-8 -*- +''' +Maping of non-acii symbols and their corresponding html entity number and name +''' +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' + +# http://www.w3schools.com/tags/ref_symbols.asp +HTML_SYMBOLS = { + # Math Symbols + u'∀' : ['∀', '∀'], # for all + u'∂' : ['∂', '∂'], # part + u'∃' : ['∃', '&exists;'], # exists + u'∅' : ['∅', '∅'], # empty + u'∇' : ['∇', '∇'], # nabla + u'∈' : ['∈', '∈'], # isin + u'∉' : ['∉', '∉'], # notin + u'∋' : ['∋', '∋'], # ni + u'∏' : ['∏', '∏'], # prod + u'∑' : ['∑', '∑'], # sum + u'−' : ['−', '−'], # minus + u'∗' : ['∗', '∗'], # lowast + u'√' : ['√', '√'], # square root + u'∝' : ['∝', '∝'], # proportional to + u'∞' : ['∞', '∞'], # infinity + u'∠' : ['∠', '∠'], # angle + u'∧' : ['∧', '∧'], # and + u'∨' : ['∨', '∨'], # or + u'∩' : ['∩', '∩'], # cap + u'∪' : ['∪', '∪'], # cup + u'∫' : ['∫', '∫'], # integral + u'∴' : ['∴', '∴'], # therefore + u'∼' : ['∼', '∼'], # simular to + u'≅' : ['≅', '≅'], # approximately equal + u'≈' : ['≈', '≈'], # almost equal + u'≠' : ['≠', '≠'], # not equal + u'≡' : ['≡', '≡'], # equivalent + u'≤' : ['≤', '≤'], # less or equal + 
u'≥' : ['≥', '≥'], # greater or equal + u'⊂' : ['⊂', '⊂'], # subset of + u'⊃' : ['⊃', '⊃'], # superset of + u'⊄' : ['⊄', '⊄'], # not subset of + u'⊆' : ['⊆', '⊆'], # subset or equal + u'⊇' : ['⊇', '⊇'], # superset or equal + u'⊕' : ['⊕', '⊕'], # circled plus + u'⊗' : ['⊗', '⊗'], # cirled times + u'⊥' : ['⊥', '⊥'], # perpendicular + u'⋅' : ['⋅', '⋅'], # dot operator + # Greek Letters + u'Α' : ['Α', 'Α'], # Alpha + u'Β' : ['Β', 'Β'], # Beta + u'Γ' : ['Γ', 'Γ'], # Gamma + u'Δ' : ['Δ', 'Δ'], # Delta + u'Ε' : ['Ε', 'Ε'], # Epsilon + u'Ζ' : ['Ζ', 'Ζ'], # Zeta + u'Η' : ['Η', 'Η'], # Eta + u'Θ' : ['Θ', 'Θ'], # Theta + u'Ι' : ['Ι', 'Ι'], # Iota + u'Κ' : ['Κ', 'Κ'], # Kappa + u'Λ' : ['Λ', 'Λ'], # Lambda + u'Μ' : ['Μ', 'Μ'], # Mu + u'Ν' : ['Ν', 'Ν'], # Nu + u'Ξ' : ['Ξ', 'Ξ'], # Xi + u'Ο' : ['Ο', 'Ο'], # Omicron + u'Π' : ['Π', 'Π'], # Pi + u'Ρ' : ['Ρ', 'Ρ'], # Rho + u'Σ' : ['Σ', 'Σ'], # Sigma + u'Τ' : ['Τ', 'Τ'], # Tau + u'Υ' : ['Υ', 'Υ'], # Upsilon + u'Φ' : ['Φ', 'Φ'], # Phi + u'Χ' : ['Χ', 'Χ'], # Chi + u'Ψ' : ['Ψ', 'Ψ'], # Psi + u'ω' : ['ω', 'ω'], # omega + u'ϑ' : ['ϑ', 'ϑ'], # theta symbol + u'ϒ' : ['ϒ', 'ϒ'], # upsilon symbol + u'ϖ' : ['ϖ', 'ϖ'], # pi symbol + # Other + u'Œ' : ['Œ', 'Œ'], # capital ligature OE + u'œ' : ['œ', 'œ'], # small ligature oe + u'Š' : ['Š', 'Š'], # capital S with caron + u'š' : ['š', 'š'], # small S with caron + u'Ÿ' : ['Ÿ', 'Ÿ'], # capital Y with diaeres + u'ƒ' : ['ƒ', 'ƒ'], # f with hook + u'ˆ' : ['ˆ', 'ˆ'], # modifier letter circumflex accent + u'˜' : ['˜', '˜'], # small tilde + u'–' : ['–', '–'], # en dash + u'—' : ['—', '—'], # em dash + u'‘' : ['‘', '‘'], # left single quotation mark + u'’' : ['’', '’'], # right single quotation mark + u'‚' : ['‚', '‚'], # single low-9 quotation mark + u'“' : ['“', '“'], # left double quotation mark + u'”' : ['”', '”'], # right double quotation mark + u'„' : ['„', '„'], # double low-9 quotation mark + u'†' : ['†', '†'], # dagger + u'‡' : ['‡', '‡'], # double dagger + u'•' : ['•', '•'], # bullet + u'…' : ['…', 
'…'], # horizontal ellipsis + u'‰' : ['‰', '‰'], # per mille + u'′' : ['′', '′'], # minutes + u'″' : ['″', '″'], # seconds + u'‹' : ['‹', '‹'], # single left angle quotation + u'›' : ['›', '›'], # single right angle quotation + u'‾' : ['‾', '‾'], # overline + u'€' : ['€', '€'], # euro + u'™' : ['™', '™'], # trademark + u'←' : ['←', '←'], # left arrow + u'↑' : ['↑', '↑'], # up arrow + u'→' : ['→', '→'], # right arrow + u'↓' : ['↓', '↓'], # down arrow + u'↔' : ['↔', '↔'], # left right arrow + u'↵' : ['↵', '↵'], # carriage return arrow + u'⌈' : ['⌈', '⌈'], # left ceiling + u'⌉' : ['⌉', '⌉'], # right ceiling + u'⌊' : ['⌊', '⌊'], # left floor + u'⌋' : ['⌋', '⌋'], # right floor + u'◊' : ['◊', '◊'], # lozenge + u'♠' : ['♠', '♠'], # spade + u'♣' : ['♣', '♣'], # club + u'♥' : ['♥', '♥'], # heart + u'♦' : ['♦', '♦'], # diamond + # Extra http://www.ascii.cl/htmlcodes.htm + u'<' : ['<', '<'], # less than sign + u'>' : ['>', '>'], # greater than sign + u'¡' : ['¡', '¡'], # inverted exclamation mark + u'¢' : ['¢', '¢'], # cent sign + u'£' : ['£', '£'], # pound sign + u'¤' : ['¤', '¤'], # currency sign + u'¥' : ['¥', '¥'], # yen sign + u'¦' : ['¦', '¦'], # broken vertical bar + u'§' : ['§', '§'], # section sign + u'¨' : ['¨', '¨'], # spacing diaeresis - umlaut + u'©' : ['©', '©'], # copyright sign + u'ª' : ['ª', 'ª'], # feminine ordinal indicator + u'«' : ['«', '«'], # left double angle quotes + u'¬' : ['¬', '¬'], # not sign + u'®' : ['®', '®'], # registered trade mark sign + u'¯' : ['¯', '¯'], # spacing macron - overline + u'°' : ['°', '°'], # degree sign + u'±' : ['±', '±'], # plus-or-minus sign + u'²' : ['²', '²'], # superscript two - squared + u'³' : ['³', '³'], # superscript three - cubed + u'´' : ['´', '´'], # acute accent - spacing acute + u'µ' : ['µ', 'µ'], # micro sign + u'¶' : ['¶', '¶'], # pilcrow sign - paragraph sign + u'·' : ['·', '·'], # middle dot - Georgian comma + u'¸' : ['¸', '¸'], # spacing cedilla + u'¹' : ['¹', '¹'], # superscript one + u'º' : ['º', 'º'], # 
masculine ordinal indicator + u'»' : ['»', '»'], # right double angle quotes + u'¼' : ['¼', '¼'], # fraction one quarter + u'½' : ['½', '½'], # fraction one half + u'¾' : ['¾', '¾'], # fraction three quarters + u'¿' : ['¿', '¿'], # inverted question mark + u'À' : ['À', 'À'], # latin capital letter A with grave + u'Á' : ['Á', 'Á'], # latin capital letter A with acute + u'Â' : ['Â', 'Â'], # latin capital letter A with circumflex + u'Ã' : ['Ã', 'Ã'], # latin capital letter A with tilde + u'Ä' : ['Ä', 'Ä'], # latin capital letter A with diaeresis + u'Å' : ['Å', 'Å'], # latin capital letter A with ring above + u'Æ' : ['Æ', 'Æ'], # latin capital letter AE + u'Ç' : ['Ç', 'Ç'], # latin capital letter C with cedilla + u'È' : ['È', 'È'], # latin capital letter E with grave + u'É' : ['É', 'É'], # latin capital letter E with acute + u'Ê' : ['Ê', 'Ê'], # latin capital letter E with circumflex + u'Ë' : ['Ë', 'Ë'], # latin capital letter E with diaeresis + u'Ì' : ['Ì', 'Ì'], # latin capital letter I with grave + u'Í' : ['Í', 'Í'], # latin capital letter I with acute + u'Î' : ['Î', 'Î'], # latin capital letter I with circumflex + u'Ï' : ['Ï', 'Ï'], # latin capital letter I with diaeresis + u'Ð' : ['Ð', 'Ð'], # latin capital letter ETH + u'Ñ' : ['Ñ', 'Ñ'], # latin capital letter N with tilde + u'Ò' : ['Ò', 'Ò'], # latin capital letter O with grave + u'Ó' : ['Ó', 'Ó'], # latin capital letter O with acute + u'Ô' : ['Ô', 'Ô'], # latin capital letter O with circumflex + u'Õ' : ['Õ', 'Õ'], # latin capital letter O with tilde + u'Ö' : ['Ö', 'Ö'], # latin capital letter O with diaeresis + u'×' : ['×', '×'], # multiplication sign + u'Ø' : ['Ø', 'Ø'], # latin capital letter O with slash + u'Ù' : ['Ù', 'Ù'], # latin capital letter U with grave + u'Ú' : ['Ú', 'Ú'], # latin capital letter U with acute + u'Û' : ['Û', 'Û'], # latin capital letter U with circumflex + u'Ü' : ['Ü', 'Ü'], # latin capital letter U with diaeresis + u'Ý' : ['Ý', 'Ý'], # latin capital letter Y with acute + u'Þ' : ['Þ', 
'Þ'], # latin capital letter THORN + u'ß' : ['ß', 'ß'], # latin small letter sharp s - ess-zed + u'à' : ['à', 'à'], # latin small letter a with grave + u'á' : ['á', 'á'], # latin small letter a with acute + u'â' : ['â', 'â'], # latin small letter a with circumflex + u'ã' : ['ã', 'ã'], # latin small letter a with tilde + u'ä' : ['ä', 'ä'], # latin small letter a with diaeresis + u'å' : ['å', 'å'], # latin small letter a with ring above + u'æ' : ['æ', 'æ'], # latin small letter ae + u'ç' : ['ç', 'ç'], # latin small letter c with cedilla + u'è' : ['è', 'è'], # latin small letter e with grave + u'é' : ['é', 'é'], # latin small letter e with acute + u'ê' : ['ê', 'ê'], # latin small letter e with circumflex + u'ë' : ['ë', 'ë'], # latin small letter e with diaeresis + u'ì' : ['ì', 'ì'], # latin small letter i with grave + u'í' : ['í', 'í'], # latin small letter i with acute + u'î' : ['î', 'î'], # latin small letter i with circumflex + u'ï' : ['ï', 'ï'], # latin small letter i with diaeresis + u'ð' : ['ð', 'ð'], # latin small letter eth + u'ñ' : ['ñ', 'ñ'], # latin small letter n with tilde + u'ò' : ['ò', 'ò'], # latin small letter o with grave + u'ó' : ['ó', 'ó'], # latin small letter o with acute + u'ô' : ['ô', 'ô'], # latin small letter o with circumflex + u'õ' : ['õ', 'õ'], # latin small letter o with tilde + u'ö' : ['ö', 'ö'], # latin small letter o with diaeresis + u'÷' : ['÷', '÷'], # division sign + u'ø' : ['ø', 'ø'], # latin small letter o with slash + u'ù' : ['ù', 'ù'], # latin small letter u with grave + u'ú' : ['ú', 'ú'], # latin small letter u with acute + u'û' : ['û', 'û'], # latin small letter u with circumflex + u'ü' : ['ü', 'ü'], # latin small letter u with diaeresis + u'ý' : ['ý', 'ý'], # latin small letter y with acute + u'þ' : ['þ', 'þ'], # latin small letter thorn + u'ÿ' : ['ÿ', 'ÿ'], # latin small letter y with diaeresis + } + diff --git a/src/calibre/ebooks/txt/__init__.py b/src/calibre/ebooks/txt/__init__.py new file mode 100644 index 
def config(defaults=None):
    '''
    Build the option set used by the any2txt command line tool.

    Starts from the shared EPUB conversion options, removes the ``profile``
    option and the option groups this tool does not expose, then merges in
    the TXT specific options.

    :param defaults: Passed through unchanged to both underlying config
        factories.
    :return: The combined configuration object.
    '''
    c = common_config(defaults=defaults, name='txt')
    c.remove_opt('profile')
    for group in ('metadata', 'traversal', 'structure detection', 'toc',
            'page layout'):
        del c.option_set.groups[group]
    c.update(txt_config(defaults=defaults))
    return c

def option_parser(usage=USAGE):
    '''
    Return the command line parser for any2txt.

    :param usage: Usage template with two ``%s`` slots: the output format
        name and the list of supported input formats.
    '''
    usage = usage % ('TXT', formats())
    return config().option_parser(usage=usage)

def any2txt(opts, path, notification=None):
    '''
    Convert the ebook at `path` to a TXT file.

    The input is first exploded into an OEB directory by ``any2epub`` and the
    result is then handed to ``oeb2txt``.

    :param opts: Parsed options. ``opts.output`` is replaced by an absolute
        path; ``opts.profile`` and ``opts.dont_split_on_page_breaks`` are
        overwritten for the conversion.
    :param path: Path to the input file. It must have an extension.
    :param notification: Unused by this function.
    :raises ValueError: If `path` has no file extension.
    '''
    if not os.path.splitext(path)[1]:
        raise ValueError('Unknown file type: '+path)

    if opts.output is None:
        opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt'

    opts.output = os.path.abspath(opts.output)
    orig_output = opts.output

    with TemporaryDirectory('_any2txt') as tdir:
        oebdir = os.path.join(tdir, 'oeb')
        os.mkdir(oebdir)
        orig_bfs = opts.base_font_size2
        opts.output = os.path.join(tdir, 'dummy.epub')
        opts.profile = 'None'
        opts.dont_split_on_page_breaks = True
        opts.base_font_size2 = 0
        try:
            any2epub(opts, path, create_epub=False, oeb_cover=False, extract_to=oebdir)
        finally:
            # Put the caller's values back even when the conversion fails so
            # a reused opts object is not left pointing into the temporary
            # directory.
            opts.base_font_size2 = orig_bfs
            opts.output = orig_output
        opf = glob.glob(os.path.join(oebdir, '*.opf'))[0]
        logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...'))
        oeb2txt(opts, opf)

def main(args=sys.argv):
    '''
    Command line entry point.

    :param args: Full argument vector, program name included.
    :return: 1 when no input file was given, 0 otherwise.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
        print('No input file specified.')
        return 1
    any2txt(opts, args[1])
    return 0

if __name__ == '__main__':
    sys.exit(main())
class TXTWriter(object):
    '''
    Write the spine of an unpacked OEB book as a single plain text file.

    :param newline: Newline sequence used in the generated file.
    '''

    # Placeholders substituted for heading tags while whitespace is being
    # normalised. cleanup_text turns them into runs of newlines as its last
    # step so the extra spacing around headings is not collapsed.
    HEADING_START = '-vzxedxy-'
    HEADING_END = '-vlgzxey-'

    def __init__(self, newline):
        self.newline = newline

    def dump(self, oebpath, path, metadata):
        '''
        Convert every spine item of the OPF at `oebpath` and write the result
        to `path` encoded as UTF-8.

        :param oebpath: Path to the OPF file.
        :param path: Output file. An existing file is removed first.
        :param metadata: Object with ``author`` and ``title`` attributes.
            Non-empty values are written upper-cased at the top of the file,
            title first.
        '''
        opf = OPF(oebpath, os.path.dirname(oebpath))
        spine = [SpineItem(i.path) for i in opf.spine]

        parts = []
        for item in spine:
            # Read raw bytes; the item knows its own encoding.
            with open(item, 'rb') as itemf:
                content = itemf.read().decode(item.encoding)
            # Convert newlines to unix style \n for processing. These
            # will be changed to the specified type later in the process.
            content = self.unix_newlines(content)
            content = self.strip_html(content)
            content = self.replace_html_symbols(content)
            content = self.cleanup_text(content)
            content = self.specified_newlines(content)
            parts.append(content)
        tmpout = u''.join(parts)

        # Prepend metadata. The author goes in first so that the title, which
        # is prepended afterwards, ends up above it.
        if metadata.author:
            tmpout = (u'%s%s' % (metadata.author.upper(), self.newline * 3)) + tmpout
        if metadata.title:
            tmpout = (u'%s%s' % (metadata.title.upper(), self.newline * 3)) + tmpout

        # Put two blank lines at end of file
        tmpout = self._pad_trailing_newlines(tmpout)

        # Only remove the old output when there is one; os.remove raises for
        # a path that does not exist.
        if os.path.exists(path):
            os.remove(path)
        with open(path, 'w+b') as out:
            out.write(tmpout.encode('utf-8'))

    def _pad_trailing_newlines(self, text, count=3):
        '''
        Return `text` extended so that it ends with at least `count` newline
        sequences. Only newlines at the very end are counted.
        '''
        trailing = 0
        tail = text
        while trailing < count and self.newline and tail.endswith(self.newline):
            tail = tail[:-len(self.newline)]
            trailing += 1
        return text + self.newline * (count - trailing)

    def strip_html(self, html):
        '''
        Return the text of every ``body`` element in `html` with markup
        removed.

        Script and style elements and comments are dropped with their
        content. Heading tags become placeholders (see cleanup_text), table
        cells are separated by a space, block level closing tags and
        ``hr``/``br`` become an empty line, and every remaining tag is
        deleted.
        '''
        pieces = []

        for dom_tree in BeautifulSoup(html).findAll('body'):
            text = unicode(dom_tree)

            # Remove unnecessary tags together with everything inside them.
            # DOTALL and the non-greedy body let an element span several
            # lines without swallowing the text between two elements.
            for tag in ['script', 'style']:
                text = re.sub(r'(?imsu)<[ ]*%s\b[^>]*>.*?</[ ]*%s[ ]*>' % (tag, tag), '', text)
            text = re.sub(r'(?s)<!--.*?-->', '', text)

            # Headings usually indicate Chapters.
            # We are going to use a marker to insert the proper number of
            # newline characters at the end of cleanup_text because
            # cleanup_text removes excessive (more than 2) newlines.
            for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, self.HEADING_START, text)
                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, self.HEADING_END, text)

            # Separate content with space.
            for tag in ['td']:
                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, ' ', text)

            # Separate content with empty line.
            for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
                text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)

            for tag in ['hr', 'br']:
                text = re.sub('(?imu)<[ ]*%s[ ]*/*?>' % tag, '\n\n', text)

            # Remove any tags that do not need special processing.
            text = re.sub('<.*?>', '', text)

            pieces.append(text)

        return u''.join(pieces)

    def replace_html_symbols(self, content):
        '''
        Replace every entity string listed in HTML_SYMBOLS with the character
        it stands for.
        '''
        for symbol, codes in HTML_SYMBOLS.items():
            for code in codes:
                content = content.replace(code, symbol)
        return content

    def _strip_line_edges(self, text):
        '''
        Remove spaces at the beginning and end of every line.
        '''
        text = re.sub('(?imu)^[ ]+', '', text)
        return re.sub('(?imu)[ ]+$', '', text)

    def cleanup_text(self, text):
        '''
        Normalise whitespace in `text`, which must use ``\\n`` newlines.

        Non-breaking spaces, tabs, vertical tabs and form feeds become a
        single space; a lone newline between two characters joins the lines
        with a space; runs of spaces and of three or more newlines are
        collapsed; heading placeholders are expanded last.
        '''
        # Replace bad characters. A non-breaking space that was decoded with
        # the wrong codec shows up as the pair \xc2\xa0; replace the pair as
        # a unit so a genuine \xc2 character elsewhere is left alone.
        text = text.replace(u'\xc2\xa0', ' ')
        text = text.replace(u'\xa0', ' ')

        # Replace tabs, vertical tabs and form feeds with single space.
        text = re.sub('[\t\v\f]+', ' ', text)

        # Single line paragraph: a newline with a non-newline character on
        # both sides is a soft wrap, not a paragraph break.
        text = re.sub('(?<=[^\n])\n(?=[^\n])', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]+', ' ', text)
        text = self._strip_line_edges(text)

        # Remove excessive newlines.
        text = re.sub('\n{3,}', '\n\n', text)

        # Replace markers with the proper characters.
        text = text.replace(self.HEADING_START, '\n\n\n\n\n')
        text = text.replace(self.HEADING_END, '\n\n\n')

        # Expanding the markers can leave a space next to the new line
        # breaks, so strip the line edges once more.
        return self._strip_line_edges(text)

    def unix_newlines(self, text):
        '''
        Return `text` with ``\\r\\n`` and ``\\r`` converted to ``\\n``.
        '''
        return text.replace('\r\n', '\n').replace('\r', '\n')

    def specified_newlines(self, text):
        '''
        Return `text` with ``\\n`` converted to the configured newline.
        '''
        if self.newline == '\n':
            return text

        return text.replace('\n', self.newline)

class TxtMetadata(object):
    '''
    Metadata that TXTWriter.dump can write at the top of the output.
    '''
    def __init__(self):
        self.author = None
        self.title = None
        self.series = None


class TxtNewlines(object):
    '''
    Resolve a newline type name to the character sequence it stands for.

    :param newline_type: One of the NEWLINE_TYPES keys, case insensitive.
        ``None`` and unknown names fall back to the newline of this OS.
    '''
    NEWLINE_TYPES = {
                        'system'  : os.linesep,
                        'unix'    : '\n',
                        'old_mac' : '\r',
                        'windows' : '\r\n'
                     }

    def __init__(self, newline_type):
        key = (newline_type or 'system').lower()
        self.newline = self.NEWLINE_TYPES.get(key, os.linesep)


def config(defaults=None):
    '''
    Return the configuration object holding the TXT output options.

    :param defaults: When ``None`` a ``Config`` named ``txt`` is created,
        otherwise a ``StringConfig`` built from `defaults`.
    '''
    desc = _('Options to control the conversion to TXT')
    if defaults is None:
        c = Config('txt', desc)
    else:
        c = StringConfig(defaults, desc)

    txt = c.add_group('TXT', _('TXT options.'))

    # Translate the template first and interpolate afterwards; a string that
    # already contains the option list can not be found in a catalog.
    txt('newline', ['--newline'], default='system',
        help=_('Type of newline to use. Options are %s. Default is \'system\'. '
            'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
            'For Mac OS X use \'unix\'. \'system\' will default to the newline '
            'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys()))
    txt('prepend_author', ['--prepend-author'], default='true',
        help=_('Write the author to the beginning of the file. '
            'Default is \'true\'. Use \'false\' to disable.'))
    txt('prepend_title', ['--prepend-title'], default='true',
        help=_('Write the title to the beginning of the file. '
            'Default is \'true\'. Use \'false\' to disable.'))

    return c
def option_parser():
    '''
    Return the command line parser for the stand-alone oeb2txt tool: the TXT
    options plus ``--output`` and ``--verbose``.
    '''
    c = config()
    parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
    parser.add_option(
        '-o', '--output', default=None,
        help=_('Output file. Default is derived from input filename.'))
    parser.add_option(
        '-v', '--verbose', default=0, action='count',
        help=_('Useful for debugging.'))
    return parser

def oeb2txt(opts, inpath):
    '''
    Convert the OEB book described by the OPF file `inpath` to TXT.

    :param opts: Parsed options. Reads ``verbose``, ``output``, ``newline``,
        ``prepend_author`` and ``prepend_title``; ``authors`` and ``title``
        override the values read from the book when they are present and
        non-empty.
    :param inpath: Path to the OPF file.
    '''
    logger = LoggingInterface(logging.getLogger('oeb2txt'))
    logger.setup_cli_handler(opts.verbose)

    outpath = opts.output
    if outpath is None:
        outpath = os.path.splitext(os.path.basename(inpath))[0] + '.txt'

    mi = metadata_from_formats([inpath])
    metadata = TxtMetadata()
    # The parser built by option_parser() in this module defines neither
    # 'authors' nor 'title', so they are looked up defensively instead of
    # failing with AttributeError when run through main().
    if opts.prepend_author.lower() == 'true':
        metadata.author = getattr(opts, 'authors', None) or authors_to_string(mi.authors)
    if opts.prepend_title.lower() == 'true':
        metadata.title = getattr(opts, 'title', None) or mi.title

    newline = TxtNewlines(opts.newline)

    writer = TXTWriter(newline.newline)
    writer.dump(inpath, outpath, metadata)
    run_plugins_on_postprocess(outpath, 'txt')
    logger.log_info(_('Output written to ') + outpath)

def main(argv=sys.argv):
    '''
    Command line entry point.

    :param argv: Full argument vector, program name included.
    :return: 1 unless exactly one input file was given, otherwise the return
        value of oeb2txt.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(argv[1:])
    if len(args) != 1:
        parser.print_help()
        return 1
    return oeb2txt(opts, args[0])

if __name__ == '__main__':
    sys.exit(main())
11013c26657fe56b2581061a8243981dc3ff0d6a Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 21 Mar 2009 17:31:15 -0400 Subject: [PATCH 03/16] More html symbols --- src/calibre/ebooks/htmlsymbols.py | 91 +++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/src/calibre/ebooks/htmlsymbols.py b/src/calibre/ebooks/htmlsymbols.py index 9b50f20fcd..fa10873845 100644 --- a/src/calibre/ebooks/htmlsymbols.py +++ b/src/calibre/ebooks/htmlsymbols.py @@ -119,6 +119,97 @@ HTML_SYMBOLS = { u'♥' : ['♥', '♥'], # heart u'♦' : ['♦', '♦'], # diamond # Extra http://www.ascii.cl/htmlcodes.htm + u' ' : [' '], # space + u'!' : ['!'], # exclamation point + u'#' : ['#'], # number sign + u'$' : ['$'], # dollar sign + u'%' : ['%'], # percent sign + u'\'' : ['''], # single quote + u'(' : ['('], # opening parenthesis + u')' : [')'], # closing parenthesis + u'*' : ['*'], # asterisk + u'+' : ['+'], # plus sign + u',' : [','], # comma + u'-' : ['-'], # minus sign - hyphen + u'.' : ['.'], # period + u'/' : ['/'], # slash + u'0' : ['0'], # zero + u'1' : ['1'], # one + u'2' : ['2'], # two + u'3' : ['3'], # three + u'4' : ['4'], # four + u'5' : ['5'], # five + u'6' : ['6'], # six + u'7' : ['7'], # seven + u'8' : ['8'], # eight + u'9' : ['9'], # nine + u':' : [':'], # colon + u';' : [';'], # semicolon + u'=' : ['='], # equal sign + u'?' 
: ['?'], # question mark + u'@' : ['@'], # at symbol + u'A' : ['A'], # + u'B' : ['B'], # + u'C' : ['C'], # + u'D' : ['D'], # + u'E' : ['E'], # + u'F' : ['F'], # + u'G' : ['G'], # + u'H' : ['H'], # + u'I' : ['I'], # + u'J' : ['J'], # + u'K' : ['K'], # + u'L' : ['L'], # + u'M' : ['M'], # + u'N' : ['N'], # + u'O' : ['O'], # + u'P' : ['P'], # + u'Q' : ['Q'], # + u'R' : ['R'], # + u'S' : ['S'], # + u'T' : ['T'], # + u'U' : ['U'], # + u'V' : ['V'], # + u'W' : ['W'], # + u'X' : ['X'], # + u'Y' : ['Y'], # + u'Z' : ['Z'], # + u'[' : ['['], # opening bracket + u'\\' : ['\'], # backslash + u']' : [']'], # closing bracket + u'^' : ['^'], # caret - circumflex + u'_' : ['_'], # underscore + u'`' : ['`'], # grave accent + u'a' : ['a'], # + u'b' : ['b'], # + u'c' : ['c'], # + u'd' : ['d'], # + u'e' : ['e'], # + u'f' : ['f'], # + u'g' : ['g'], # + u'h' : ['h'], # + u'i' : ['i'], # + u'j' : ['j'], # + u'k' : ['k'], # + u'l' : ['l'], # + u'm' : ['m'], # + u'n' : ['n'], # + u'o' : ['o'], # + u'p' : ['p'], # + u'q' : ['q'], # + u'r' : ['r'], # + u's' : ['s'], # + u't' : ['t'], # + u'u' : ['u'], # + u'v' : ['v'], # + u'w' : ['w'], # + u'x' : ['x'], # + u'y' : ['y'], # + u'z' : ['z'], # + u'{' : ['{'], # opening brace + u'|' : ['|'], # vertical bar + u'}' : ['}'], # closing brace + u'~' : ['~'], # equivalency sign - tilde u'<' : ['<', '<'], # less than sign u'>' : ['>', '>'], # greater than sign u'¡' : ['¡', '¡'], # inverted exclamation mark From 94c5e717a15bf4faf59bf23ab74f7caf6fc161be Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 21 Mar 2009 17:58:53 -0400 Subject: [PATCH 04/16] Txt output: remove more tags, ensure no spaces at beginning and end of lines --- src/calibre/ebooks/txt/writer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 84376ca2e7..205d8423e3 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -68,6 +68,9 @@ class 
TXTWriter(object): for tag in ['script', 'style']: text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)' % (tag, tag), '', text) text = re.sub('', '', text) + text = re.sub('<\?.*?\?>', '', text) + text = re.sub('<@.*?@>', '', text) + text = re.sub('<%.*?%>', '', text) # Headings usually indicate Chapters. # We are going to use a marker to insert the proper number of @@ -107,7 +110,6 @@ class TXTWriter(object): text = text.replace(u'\xa0', ' ') # Replace tabs, vertical tags and form feeds with single space. - #text = re.sub('\xc2\xa0', '', text) text = text.replace('\t+', ' ') text = text.replace('\v+', ' ') text = text.replace('\f+', ' ') @@ -122,8 +124,6 @@ class TXTWriter(object): # Remove multiple spaces. text = re.sub('[ ]+', ' ', text) - text = re.sub('(?imu)^[ ]+', '', text) - text = re.sub('(?imu)[ ]+$', '', text) # Remove excessive newlines. text = re.sub('\n[ ]+\n', '\n\n', text) @@ -133,6 +133,10 @@ class TXTWriter(object): text = text.replace('-vzxedxy-', '\n\n\n\n\n') text = text.replace('-vlgzxey-', '\n\n\n') + # Replace spaces at the beginning and end of lines + text = re.sub('(?imu)^[ ]+', '', text) + text = re.sub('(?imu)[ ]+$', '', text) + return text def unix_newlines(self, text): From 9abca9d60feb6896ef11c0da04a755fd24feb867 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 23 Mar 2009 19:07:14 -0400 Subject: [PATCH 05/16] Do not enable edit rows in device tab for devices that do not support editing ebook metadata. --- src/calibre/devices/interface.py | 2 ++ src/calibre/devices/usbms/driver.py | 1 + src/calibre/gui2/library.py | 12 +++++++++--- src/calibre/gui2/main.py | 2 ++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py index ed51962236..21790e3c46 100644 --- a/src/calibre/devices/interface.py +++ b/src/calibre/devices/interface.py @@ -24,6 +24,8 @@ class Device(object): # it can be a list of the BCD numbers of all devices supported by this driver. 
BCD = None THUMBNAIL_HEIGHT = 68 # Height for thumbnails on device + # Whether the metadata on books can be set via the GUI. + CAN_SET_METADATA = True def __init__(self, key='-1', log_packets=False, report_progress=None) : """ diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index 4285881447..68041a19cd 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -35,6 +35,7 @@ class USBMS(Device): EBOOK_DIR_MAIN = '' EBOOK_DIR_CARD = '' SUPPORTS_SUB_DIRS = False + CAN_SET_METADATA = False def __init__(self, key='-1', log_packets=False, report_progress=None): Device.__init__(self, key=key, log_packets=log_packets, diff --git a/src/calibre/gui2/library.py b/src/calibre/gui2/library.py index d7581bf458..9f82b3b318 100644 --- a/src/calibre/gui2/library.py +++ b/src/calibre/gui2/library.py @@ -708,6 +708,9 @@ class BooksView(TableView): def close(self): self._model.close() + + def set_editable(self, editable): + self._model.set_editable(editable) def connect_to_search_box(self, sb): QObject.connect(sb, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'), @@ -785,7 +788,7 @@ class DeviceBooksModel(BooksModel): self.unknown = str(self.trUtf8('Unknown')) self.marked_for_deletion = {} self.search_engine = OnDeviceSearch(self) - + self.editable = True def mark_for_deletion(self, job, rows): self.marked_for_deletion[job] = self.indices(rows) @@ -793,7 +796,6 @@ class DeviceBooksModel(BooksModel): indices = self.row_indices(row) self.emit(SIGNAL('dataChanged(QModelIndex, QModelIndex)'), indices[0], indices[-1]) - def deletion_done(self, job, succeeded=True): if not self.marked_for_deletion.has_key(job): return @@ -818,7 +820,7 @@ class DeviceBooksModel(BooksModel): if self.map[index.row()] in self.indices_to_be_deleted(): return Qt.ItemIsUserCheckable # Can't figure out how to get the disabled flag in python flags = QAbstractTableModel.flags(self, index) - if index.isValid(): + if index.isValid() and 
self.editable: if index.column() in [0, 1] or (index.column() == 4 and self.db.supports_tags()): flags |= Qt.ItemIsEditable return flags @@ -999,6 +1001,10 @@ class DeviceBooksModel(BooksModel): self.sort(col, self.sorted_on[1]) done = True return done + + def set_editable(self, editable): + self.editable = editable + class SearchBox(QLineEdit): diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index 76775ae9bf..f297d1465c 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -585,7 +585,9 @@ class Main(MainWindow, Ui_MainWindow): return mainlist, cardlist = job.result self.memory_view.set_database(mainlist) + self.memory_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA) self.card_view.set_database(cardlist) + self.card_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA) for view in (self.memory_view, self.card_view): view.sortByColumn(3, Qt.DescendingOrder) if not view.restore_column_widths(): From 4579b1057130ae27d6f9b312b3a94ab1a1e86107 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 28 Mar 2009 14:39:17 -0400 Subject: [PATCH 06/16] PDF merging utility --- src/calibre/ebooks/pdf/pdfmerge.py | 94 ++++++++++++++++++++++++++++++ src/calibre/linux.py | 3 +- 2 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/pdf/pdfmerge.py diff --git a/src/calibre/ebooks/pdf/pdfmerge.py b/src/calibre/ebooks/pdf/pdfmerge.py new file mode 100644 index 0000000000..e8554dbc6b --- /dev/null +++ b/src/calibre/ebooks/pdf/pdfmerge.py @@ -0,0 +1,94 @@ +''' +Merge PDF files into a single PDF document. 
def config(defaults=None):
    '''
    Return the configuration object holding the pdfmerge options.

    :param defaults: When ``None`` a ``Config`` is created, otherwise a
        ``StringConfig`` built from `defaults`.
    '''
    desc = _('Options to control the transformation of pdf')
    # NOTE(review): the name 'trimpdf' looks carried over from the pdftrim
    # tool; confirm whether pdfmerge is meant to have its own name.
    if defaults is None:
        c = Config('trimpdf', desc)
    else:
        c = StringConfig(defaults, desc)
    c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
        help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
    c.add_opt('output', ['-o', '--output'],default='merged.pdf',
        help=_('Path to output file. By default a file is created in the current directory.'))
    return c

def option_parser():
    '''
    Return the command line parser for pdfmerge.
    '''
    c = config()
    return c.option_parser(usage=_('''\
    %prog [options] file1.pdf file2.pdf ...

    Merges individual pdfs. Metadata will be used from the first PDF specified.
    '''))

def merge_files(in_paths, out_path, metadata=None):
    '''
    Append the pages of every PDF in `in_paths`, in order, and write the
    result to `out_path`.

    :param in_paths: Paths of the PDF files to merge.
    :param out_path: Path of the merged PDF.
    :param metadata: Object with ``title`` and ``authors`` attributes used
        for the output. Both default to 'Unknown' when it is ``None``.
    '''
    if metadata is None:
        title = _('Unknown')
        author = _('Unknown')
    else:
        title = metadata.title
        author = authors_to_string(metadata.authors)

    out_pdf = PdfFileWriter(title=title, author=author)

    # The inputs are kept open until the output has been written and are
    # closed afterwards even on failure. Presumably the reader pulls page
    # data from its stream lazily, so closing earlier is not safe - confirm
    # against pyPdf before tightening this.
    in_files = []
    try:
        for pdf_path in in_paths:
            in_file = open(os.path.abspath(pdf_path), 'rb')
            in_files.append(in_file)
            pdf = PdfFileReader(in_file)
            for page in pdf.pages:
                out_pdf.addPage(page)

        with open(out_path, 'wb') as out_file:
            out_pdf.write(out_file)
    finally:
        for in_file in in_files:
            in_file.close()

def verify_files(files):
    '''
    Return the entries of `files` that can not be used as merge input.

    A file is rejected when it can not be opened or parsed, when it is
    encrypted, or when it has no pages.

    :param files: Paths to check.
    :return: List of rejected paths in input order; empty when all are fine.
    '''
    invalid = []

    for pdf_path in files:
        try:
            with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
                pdf = PdfFileReader(pdf_file)
                unusable = pdf.isEncrypted or pdf.numPages <= 0
        except Exception:
            # Any failure to open or parse marks the file as unusable, but
            # KeyboardInterrupt and SystemExit still propagate.
            unusable = True
        if unusable:
            invalid.append(pdf_path)
    return invalid

def main(args=sys.argv):
    '''
    Command line entry point.

    :param args: Full argument vector, program name included.
    :return: 2 when fewer than two files were given or a file is unusable,
        0 after a successful merge.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    args = args[1:]

    if len(args) < 2:
        print('Error: Two or more PDF files are required.\n\n')
        print(parser.get_usage())
        return 2

    bad_pdfs = verify_files(args)
    if bad_pdfs:
        for pdf in bad_pdfs:
            print('Error: Could not read file `%s`. Is it a valid PDF file or is it encrypted/DRMed?' % pdf)
        return 2

    # Metadata for the merged document comes from the first input file.
    mi = metadata_from_formats([args[0]])

    merge_files(args, opts.output, mi)

    return 0

if __name__ == '__main__':
    sys.exit(main())
Can be specified multiple times for greater verbosity.')) - c.add_opt('output', ['-o', '--output'],default='merged.pdf', + c.add_opt('output', ['-o', '--output'], default='merged.pdf', help=_('Path to output file. By default a file is created in the current directory.')) return c @@ -33,7 +33,7 @@ def option_parser(): return c.option_parser(usage=_('''\ %prog [options] file1.pdf file2.pdf ... - Merges individual pdfs. Metadata will be used from the first PDF specified. + Merges individual PDFs. Metadata will be used from the first PDF specified. ''')) def merge_files(in_paths, out_path, metadata=None): diff --git a/src/calibre/ebooks/pdf/pdfsplit.py b/src/calibre/ebooks/pdf/pdfsplit.py new file mode 100644 index 0000000000..460dbef148 --- /dev/null +++ b/src/calibre/ebooks/pdf/pdfsplit.py @@ -0,0 +1,189 @@ +''' +Split PDF file into multiple PDF documents. +''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os, sys, re + +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata import authors_to_string +from calibre.utils.config import Config, StringConfig + +from pyPdf import PdfFileWriter, PdfFileReader + +def config(defaults=None): + desc = _('Options to control the transformation of pdf') + default_crop=10 + if defaults is None: + c = Config('trimpdf', desc) + else: + c = StringConfig(defaults, desc) + c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', + help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) + c.add_opt('output', ['-o', '--output'], default='split.pdf', + help=_('Path to output file. By default a file is created in the current directory. \ + The file name will be the base name for the output.')) + return c + +def option_parser(): + c = config() + return c.option_parser(usage=_('''\ + + %prog [options] file.pdf page_to_split_on ... 
+ %prog [options] file.pdf page_range_to_split_on ... + + Ex. + + %prog file.pdf 6 + %prog file.pdf 6-12 + %prog file.pdf 6-12 8 10 9-20 + + Split a PDF. + ''')) + +def split_pdf(in_path, pages, page_ranges, out_name, metadata=None): + pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb')) + total_pages = pdf.numPages - 1 + + for index in pages+page_ranges: + if index in pages: + write_pdf(pdf, out_name, '%s' % (index + 1), index, total_pages, metadata) + else: + + write_pdf(pdf, out_name, '%s-%s' % (index[0] + 1, index[1] + 1), index[0], index[1], metadata) + +def write_pdf(pdf, name, suffix, start, end, metadata=None): + if metadata == None: + title = _('Unknown') + author = _('Unknown') + else: + title = metadata.title + author = authors_to_string(metadata.authors) + + out_pdf = PdfFileWriter(title=title, author=author) + for page_num in range(start, end + 1): + out_pdf.addPage(pdf.getPage(page_num)) + with open('%s%s.pdf' % (name, suffix), 'wb') as out_file: + out_pdf.write(out_file) + +def split_args(args): + pdf = '' + pages = [] + page_ranges = [] + bad = [] + + for arg in args: + arg = arg.strip() + # Find the pdf input + if re.search('(?iu)^.*?\.pdf[ ]*$', arg) != None: + if pdf == '': + pdf = arg + else: + bad.append(arg) + # Find single indexes + elif re.search('^[ ]*\d+[ ]*$', arg) != None: + pages.append(arg) + # Find index ranges + elif re.search('^[ ]*\d+[ ]*-[ ]*\d+[ ]*$', arg) != None: + mo = re.search('^[ ]*(?P\d+)[ ]*-[ ]*(?P\d+)[ ]*$', arg) + start = mo.group('start') + end = mo.group('end') + + # check to see if the range is really a single index + if start == end: + pages.append(start) + else: + page_ranges.append([start, end]) + else: + bad.append(arg) + + bad = sorted(list(set(bad))) + + return pdf, pages, page_ranges, bad + +# Remove duplicates from pages and page_ranges. +# Set pages higher than the total number of pages in the pdf to the last page. +# Return pages and page_ranges as lists of ints. 
+def clean_page_list(pdf_path, pages, page_ranges): + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + + total_pages = pdf.numPages + sorted_pages = [] + sorted_ranges = [] + + for index in pages: + index = int(index) + if index > total_pages: + sorted_pages.append(total_pages - 1) + else: + sorted_pages.append(index - 1) + + for start, end in page_ranges: + start = int(start) + end = int(end) + + if start > total_pages and end > total_pages: + sorted_pages.append(total_pages - 1) + continue + + if start > total_pages: + start = total_pages + if end > total_pages: + end = total_pages + page_range = sorted([start - 1, end - 1]) + if page_range not in sorted_ranges: + sorted_ranges.append(page_range) + + # Remove duplicates and sort + pages = sorted(list(set(sorted_pages))) + page_ranges = sorted(sorted_ranges) + + return pages, page_ranges + +# Return True if the pdf is valid. +def valid_pdf(pdf_path): + try: + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + if pdf.isEncrypted or pdf.numPages <= 0: + raise Exception + except: + return False + return True + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + + pdf, pages, page_ranges, unknown = split_args(args[1:]) + + if pdf == '' and (pages == [] or page_ranges == []): + print 'Error: PDF and where to split is required.\n\n' + print parser.get_usage() + return 2 + + if unknown != []: + for arg in unknown: + print 'Error: Unknown argument `%s`' % arg + print parser.get_usage() + return 2 + + if not valid_pdf(pdf): + print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' 
% pdf + return 2 + + pages, page_ranges = clean_page_list(pdf, pages, page_ranges) + + mi = metadata_from_formats([pdf]) + + split_pdf(pdf, pages, page_ranges, os.path.splitext(opts.output)[0], mi) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index c7a6099623..3ba6f55bc8 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -41,6 +41,7 @@ entry_points = { 'calibre-customize = calibre.customize.ui:main', 'pdftrim = calibre.ebooks.pdf.pdftrim:main', 'pdfmerge = calibre.ebooks.pdf.pdfmerge:main', + 'pdfsplit = calibre.ebooks.pdf.pdfsplit:main', 'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main', ], 'gui_scripts' : [ From ffa5f36fae29af536d8eb4a8eb8082eefc917f86 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 29 Mar 2009 09:10:47 -0400 Subject: [PATCH 08/16] bzr 'command sub-command' style wrapper for pdf manipulation --- src/calibre/ebooks/pdf/manipulate.py | 67 +++++++++++++++++++ .../ebooks/pdf/{pdfmerge.py => merge.py} | 13 ++-- .../ebooks/pdf/{pdfsplit.py => split.py} | 21 +++--- .../ebooks/pdf/{pdftrim.py => trim.py} | 10 +-- src/calibre/linux.py | 5 +- 5 files changed, 89 insertions(+), 27 deletions(-) create mode 100644 src/calibre/ebooks/pdf/manipulate.py rename src/calibre/ebooks/pdf/{pdfmerge.py => merge.py} (92%) rename src/calibre/ebooks/pdf/{pdfsplit.py => split.py} (93%) rename src/calibre/ebooks/pdf/{pdftrim.py => trim.py} (95%) diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py new file mode 100644 index 0000000000..0e75734bb9 --- /dev/null +++ b/src/calibre/ebooks/pdf/manipulate.py @@ -0,0 +1,67 @@ +''' +Command line interface to run pdf manipulation commands. 
+''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import string, sys + +from calibre.utils.config import Config, StringConfig +from calibre.ebooks.pdf import merge, split, trim + +COMMANDS = { + 'merge' : merge, + 'split' : split, + 'trim' : trim, + } + +def config(defaults=None): + desc = _('Options to control the transformation of pdf') + if defaults is None: + c = Config('trimpdf', desc) + else: + c = StringConfig(defaults, desc) + return c + +def option_parser(): + c = config() + return c.option_parser(usage=_('''\ + + %prog command ... + + command can be one of the following: + [%%commands] + + Use %prog command --help to get more information about a specific command + + Manipulate a PDF. + '''.replace('%%commands', string.join(sorted(COMMANDS.keys()), ', ')))) + +def main(args=sys.argv): + parser = option_parser() + + if len(args) < 2: + print 'Error: No command sepecified.\n' + print parser.get_usage() + return 2 + + command = args[1].lower().strip() + + if command in COMMANDS.keys(): + del args[1] + return COMMANDS[command].main(args, command) + else: + parser.parse_args(args) + print 'Unknown command %s.\n' % command + print parser.get_usage() + return 2 + + # We should never get here. 
+ return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/calibre/ebooks/pdf/pdfmerge.py b/src/calibre/ebooks/pdf/merge.py similarity index 92% rename from src/calibre/ebooks/pdf/pdfmerge.py rename to src/calibre/ebooks/pdf/merge.py index 4a741c4f5a..7ae35d1065 100644 --- a/src/calibre/ebooks/pdf/pdfmerge.py +++ b/src/calibre/ebooks/pdf/merge.py @@ -17,9 +17,8 @@ from pyPdf import PdfFileWriter, PdfFileReader def config(defaults=None): desc = _('Options to control the transformation of pdf') - default_crop=10 if defaults is None: - c = Config('trimpdf', desc) + c = Config('mergepdf', desc) else: c = StringConfig(defaults, desc) c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', @@ -28,13 +27,13 @@ def config(defaults=None): help=_('Path to output file. By default a file is created in the current directory.')) return c -def option_parser(): +def option_parser(name): c = config() return c.option_parser(usage=_('''\ - %prog [options] file1.pdf file2.pdf ... + %prog %%name [options] file1.pdf file2.pdf ... Merges individual PDFs. Metadata will be used from the first PDF specified. 
- ''')) + '''.replace('%%name', name))) def merge_files(in_paths, out_path, metadata=None): if metadata == None: @@ -67,8 +66,8 @@ def verify_files(files): invalid.append(pdf_path) return invalid -def main(args=sys.argv): - parser = option_parser() +def main(args=sys.argv, name=''): + parser = option_parser(name) opts, args = parser.parse_args(args) args = args[1:] diff --git a/src/calibre/ebooks/pdf/pdfsplit.py b/src/calibre/ebooks/pdf/split.py similarity index 93% rename from src/calibre/ebooks/pdf/pdfsplit.py rename to src/calibre/ebooks/pdf/split.py index 460dbef148..36517fb704 100644 --- a/src/calibre/ebooks/pdf/pdfsplit.py +++ b/src/calibre/ebooks/pdf/split.py @@ -17,9 +17,8 @@ from pyPdf import PdfFileWriter, PdfFileReader def config(defaults=None): desc = _('Options to control the transformation of pdf') - default_crop=10 if defaults is None: - c = Config('trimpdf', desc) + c = Config('splitpdf', desc) else: c = StringConfig(defaults, desc) c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', @@ -29,21 +28,21 @@ def config(defaults=None): The file name will be the base name for the output.')) return c -def option_parser(): +def option_parser(name): c = config() return c.option_parser(usage=_('''\ - %prog [options] file.pdf page_to_split_on ... - %prog [options] file.pdf page_range_to_split_on ... + %prog %%name [options] file.pdf page_to_split_on ... + %prog %%name [options] file.pdf page_range_to_split_on ... Ex. - %prog file.pdf 6 - %prog file.pdf 6-12 - %prog file.pdf 6-12 8 10 9-20 + %prog %%name file.pdf 6 + %prog %%name file.pdf 6-12 + %prog %%name file.pdf 6-12 8 10 9-20 Split a PDF. 
- ''')) + '''.replace('%%name', name))) def split_pdf(in_path, pages, page_ranges, out_name, metadata=None): pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb')) @@ -155,8 +154,8 @@ def valid_pdf(pdf_path): return False return True -def main(args=sys.argv): - parser = option_parser() +def main(args=sys.argv, name=''): + parser = option_parser(name) opts, args = parser.parse_args(args) pdf, pages, page_ranges, unknown = split_args(args[1:]) diff --git a/src/calibre/ebooks/pdf/pdftrim.py b/src/calibre/ebooks/pdf/trim.py similarity index 95% rename from src/calibre/ebooks/pdf/pdftrim.py rename to src/calibre/ebooks/pdf/trim.py index c1e8fa2494..c999d24a46 100644 --- a/src/calibre/ebooks/pdf/pdftrim.py +++ b/src/calibre/ebooks/pdf/trim.py @@ -33,16 +33,16 @@ def config(defaults=None): return c -def option_parser(): +def option_parser(name): c = config() return c.option_parser(usage=_('''\ - %prog [options] file.pdf + %prog %%name [options] file.pdf Crops a pdf. - ''')) + '''.replace('%%name', name))) -def main(args=sys.argv): - parser = option_parser() +def main(args=sys.argv, name=''): + parser = option_parser(name) opts, args = parser.parse_args(args) try: source = os.path.abspath(args[1]) diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 3ba6f55bc8..6bfe665557 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -39,10 +39,7 @@ entry_points = { 'calibre-fontconfig = calibre.utils.fontconfig:main', 'calibre-parallel = calibre.parallel:main', 'calibre-customize = calibre.customize.ui:main', - 'pdftrim = calibre.ebooks.pdf.pdftrim:main', - 'pdfmerge = calibre.ebooks.pdf.pdfmerge:main', - 'pdfsplit = calibre.ebooks.pdf.pdfsplit:main', - 'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main', + 'pdfmanipulate = calibre.ebooks.pdf.manipulate:main', ], 'gui_scripts' : [ __appname__+' = calibre.gui2.main:main', From 9a81882d4f9b306289ffb0dd564e2a1f2f006f9e Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 29 Mar 2009 09:34:43 -0400 
Subject: [PATCH 09/16] Remove unnecessary options from pdf manipulation routines --- src/calibre/ebooks/pdf/manipulate.py | 2 +- src/calibre/ebooks/pdf/merge.py | 2 -- src/calibre/ebooks/pdf/split.py | 2 -- src/calibre/ebooks/pdf/trim.py | 2 -- 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py index 0e75734bb9..15c9404e25 100644 --- a/src/calibre/ebooks/pdf/manipulate.py +++ b/src/calibre/ebooks/pdf/manipulate.py @@ -21,7 +21,7 @@ COMMANDS = { def config(defaults=None): desc = _('Options to control the transformation of pdf') if defaults is None: - c = Config('trimpdf', desc) + c = Config('manipulatepdf', desc) else: c = StringConfig(defaults, desc) return c diff --git a/src/calibre/ebooks/pdf/merge.py b/src/calibre/ebooks/pdf/merge.py index 7ae35d1065..c0385080ad 100644 --- a/src/calibre/ebooks/pdf/merge.py +++ b/src/calibre/ebooks/pdf/merge.py @@ -21,8 +21,6 @@ def config(defaults=None): c = Config('mergepdf', desc) else: c = StringConfig(defaults, desc) - c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', - help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) c.add_opt('output', ['-o', '--output'], default='merged.pdf', help=_('Path to output file. By default a file is created in the current directory.')) return c diff --git a/src/calibre/ebooks/pdf/split.py b/src/calibre/ebooks/pdf/split.py index 36517fb704..cc6965dd68 100644 --- a/src/calibre/ebooks/pdf/split.py +++ b/src/calibre/ebooks/pdf/split.py @@ -21,8 +21,6 @@ def config(defaults=None): c = Config('splitpdf', desc) else: c = StringConfig(defaults, desc) - c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', - help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) c.add_opt('output', ['-o', '--output'], default='split.pdf', help=_('Path to output file. 
By default a file is created in the current directory. \ The file name will be the base name for the output.')) diff --git a/src/calibre/ebooks/pdf/trim.py b/src/calibre/ebooks/pdf/trim.py index c999d24a46..b32312fee8 100644 --- a/src/calibre/ebooks/pdf/trim.py +++ b/src/calibre/ebooks/pdf/trim.py @@ -16,8 +16,6 @@ def config(defaults=None): c = Config('trimpdf', desc) else: c = StringConfig(defaults, desc) - c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', - help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) c.add_opt('output', ['-o', '--output'],default='cropped.pdf', help=_('Path to output file. By default a file is created in the current directory.')) c.add_opt('bottom_left_x', [ '-x', '--leftx'], default=default_crop, From 9e15e485883c6b589967a5ebe4d9f8bc58ae0982 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 29 Mar 2009 10:18:29 -0400 Subject: [PATCH 10/16] PDF info command --- src/calibre/ebooks/pdf/info.py | 89 ++++++++++++++++++++++++++++ src/calibre/ebooks/pdf/manipulate.py | 3 +- 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/pdf/info.py diff --git a/src/calibre/ebooks/pdf/info.py b/src/calibre/ebooks/pdf/info.py new file mode 100644 index 0000000000..46f1f11681 --- /dev/null +++ b/src/calibre/ebooks/pdf/info.py @@ -0,0 +1,89 @@ +''' +Merge PDF files into a single PDF document. 
+''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os, re, sys, time + +from calibre.utils.config import Config, StringConfig + +from pyPdf import PdfFileWriter, PdfFileReader + + +def config(defaults=None): + desc = _('Options to control the transformation of pdf') + if defaults is None: + c = Config('manipulatepdf', desc) + else: + c = StringConfig(defaults, desc) + return c + +def option_parser(name): + c = config() + return c.option_parser(usage=_('''\ + %prog %%name [options] file.pdf ... + + Get info about a PDF. + '''.replace('%%name', name))) + +def print_info(pdf_path): + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + print _('Title: %s' % pdf.documentInfo.title) + print _('Author: %s' % pdf.documentInfo.author) + print _('Creator: %s' % pdf.documentInfo.creator) + print _('Producer: %s' % pdf.documentInfo.producer) + print _('Creation Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getctime(pdf_path)))) + print _('Modification Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getmtime(pdf_path)))) + print _('Pages: %s' % pdf.numPages) + print _('Encrypted: %s' % pdf.isEncrypted) + try: + print _('File Size: %s bytes' % os.path.getsize(pdf_path)) + except: pass + try: + pdf_file.seek(0) + vline = pdf_file.readline() + mo = re.search('(?iu)^%...-(?P\d+\.\d+)', vline) + if mo != None: + print _('PDF Version: %s' % mo.group('version')) + except: pass + +def verify_files(files): + invalid = [] + + for pdf_path in files: + try: + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + except: + invalid.append(pdf_path) + return invalid + +def main(args=sys.argv, name=''): + parser = option_parser(name) + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 1: + print 'Error: No PDF sepecified.\n' + print 
parser.get_usage() + return 2 + + bad_pdfs = verify_files(args) + if bad_pdfs != []: + for pdf in bad_pdfs: + print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf + return 2 + + for pdf in args: + print_info(pdf) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py index 15c9404e25..262aaf78d4 100644 --- a/src/calibre/ebooks/pdf/manipulate.py +++ b/src/calibre/ebooks/pdf/manipulate.py @@ -10,9 +10,10 @@ __docformat__ = 'restructuredtext en' import string, sys from calibre.utils.config import Config, StringConfig -from calibre.ebooks.pdf import merge, split, trim +from calibre.ebooks.pdf import info, merge, split, trim COMMANDS = { + 'info' : info, 'merge' : merge, 'split' : split, 'trim' : trim, From 1ed9efeb3904075310e05d3c18b1475617428f19 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 29 Mar 2009 10:20:21 -0400 Subject: [PATCH 11/16] Added subject to pdf info command --- src/calibre/ebooks/pdf/info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/pdf/info.py b/src/calibre/ebooks/pdf/info.py index 46f1f11681..115e411ce4 100644 --- a/src/calibre/ebooks/pdf/info.py +++ b/src/calibre/ebooks/pdf/info.py @@ -35,6 +35,7 @@ def print_info(pdf_path): pdf = PdfFileReader(pdf_file) print _('Title: %s' % pdf.documentInfo.title) print _('Author: %s' % pdf.documentInfo.author) + print _('Subject: %s' % pdf.documentInfo.subject) print _('Creator: %s' % pdf.documentInfo.creator) print _('Producer: %s' % pdf.documentInfo.producer) print _('Creation Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getctime(pdf_path)))) From 87580e27ba9e270e1c104b32b9fdc6d0b41fd283 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 30 Mar 2009 19:03:49 -0400 Subject: [PATCH 12/16] TXT metadata reader --- src/calibre/customize/builtins.py | 10 ++++++++++ src/calibre/ebooks/metadata/txt.py | 30 
++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 src/calibre/ebooks/metadata/txt.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index d37e241891..2cbf036c1f 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -159,6 +159,16 @@ class ODTMetadataReader(MetadataReaderPlugin): def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.odt import get_metadata return get_metadata(stream) + +class TXTMetadataReader(MetaReaderPlugin): + + name = 'Read TXT metadata' + file_types = set(['txt']) + description = _('Read metadata from %s files') % 'TXT' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.txt import get_metadata + return get_metadata(stream) class LRXMetadataReader(MetadataReaderPlugin): diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py new file mode 100644 index 0000000000..5a5ab13ae9 --- /dev/null +++ b/src/calibre/ebooks/metadata/txt.py @@ -0,0 +1,30 @@ +'''Read meta information from TXT files''' + +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' + +import re + +from calibre.ebooks.metadata import MetaInformation + +def get_metadata(stream, extract_cover=True): + """ Return metadata as a L{MetaInfo} object """ + mi = MetaInformation(_('Unknown'), [_('Unknown')]) + stream.seek(0) + + mdata = '' + for x in range(0, 4): + line = stream.readline() + if line == '': + break + else: + mdata += line + + mo = re.search('(?u)^[ ]*(?P.+)[ ]*\n\n\n[ ]*(?P<author>.+)[ ]*\n$', mdata) + if mo != None: + mi.title = mo.group('title') + mi.authors = mo.group('author').split(',') + + return mi From 90362ab56ae0594651571117c0e934e108c7b877 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 31 Mar 2009 18:41:49 -0400 Subject: [PATCH 13/16] txt output now uses new conversion pipeline --- src/calibre/customize/builtins.py | 
5 +- src/calibre/ebooks/conversion/plumber.py | 2 +- src/calibre/ebooks/metadata/txt.py | 2 +- src/calibre/ebooks/txt/from_any.py | 74 ------------- src/calibre/ebooks/txt/output.py | 62 +++++++++++ src/calibre/ebooks/txt/writer.py | 130 ++++------------------- 6 files changed, 90 insertions(+), 185 deletions(-) delete mode 100644 src/calibre/ebooks/txt/from_any.py create mode 100644 src/calibre/ebooks/txt/output.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 2cbf036c1f..acc7ba71ec 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -160,7 +160,7 @@ class ODTMetadataReader(MetadataReaderPlugin): from calibre.ebooks.metadata.odt import get_metadata return get_metadata(stream) -class TXTMetadataReader(MetaReaderPlugin): +class TXTMetadataReader(MetadataReaderPlugin): name = 'Read TXT metadata' file_types = set(['txt']) @@ -266,9 +266,10 @@ class MOBIMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.oeb.output import OEBOutput +from calibre.ebooks.txt.output import TXTOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput] +plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 5393aaf034..da41423750 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -195,7 +195,7 @@ OptionRecommendation(name='language', self.input_fmt = input_fmt self.output_fmt = output_fmt - # Build set of all possible options. 
Two options are equal iff their + # Build set of all possible options. Two options are equal if their # names are the same. self.input_options = self.input_plugin.options.union( self.input_plugin.common_options) diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py index 5a5ab13ae9..6283c72256 100644 --- a/src/calibre/ebooks/metadata/txt.py +++ b/src/calibre/ebooks/metadata/txt.py @@ -22,7 +22,7 @@ def get_metadata(stream, extract_cover=True): else: mdata += line - mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*\n\n\n[ ]*(?P<author>.+)[ ]*\n$', mdata) + mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata) if mo != None: mi.title = mo.group('title') mi.authors = mo.group('author').split(',') diff --git a/src/calibre/ebooks/txt/from_any.py b/src/calibre/ebooks/txt/from_any.py deleted file mode 100644 index caf5364c3c..0000000000 --- a/src/calibre/ebooks/txt/from_any.py +++ /dev/null @@ -1,74 +0,0 @@ -''' -Convert any ebook format to TXT. -''' - -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \ - 'and Marshall T. 
Vandegrift <llasram@gmail.com>' \ - 'and John Schember <john@nachtimwald.com>' -__docformat__ = 'restructuredtext en' - -import sys, os, glob, logging - -from calibre.ebooks.epub.from_any import any2epub, formats, USAGE -from calibre.ebooks.epub import config as common_config -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.txt.writer import oeb2txt, config as txt_config - -def config(defaults=None): - c = common_config(defaults=defaults, name='txt') - c.remove_opt('profile') - del c.option_set.groups['metadata'] - del c.option_set.groups['traversal'] - del c.option_set.groups['structure detection'] - del c.option_set.groups['toc'] - del c.option_set.groups['page layout'] - txtc = txt_config(defaults=defaults) - c.update(txtc) - return c - -def option_parser(usage=USAGE): - usage = usage % ('TXT', formats()) - parser = config().option_parser(usage=usage) - return parser - -def any2txt(opts, path, notification=None): - ext = os.path.splitext(path)[1] - if not ext: - raise ValueError('Unknown file type: '+path) - ext = ext.lower()[1:] - - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt' - - opts.output = os.path.abspath(opts.output) - orig_output = opts.output - - with TemporaryDirectory('_any2txt') as tdir: - oebdir = os.path.join(tdir, 'oeb') - os.mkdir(oebdir) - opts.output = os.path.join(tdir, 'dummy.epub') - opts.profile = 'None' - opts.dont_split_on_page_breaks = True - orig_bfs = opts.base_font_size2 - opts.base_font_size2 = 0 - any2epub(opts, path, create_epub=False, oeb_cover=False, extract_to=oebdir) - opts.base_font_size2 = orig_bfs - opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] - opts.output = orig_output - logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...')) - oeb2txt(opts, opf) - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print 'No input file specified.' 
- return 1 - any2txt(opts, args[1]) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py new file mode 100644 index 0000000000..21498074ac --- /dev/null +++ b/src/calibre/ebooks/txt/output.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import OutputFormatPlugin, \ + OptionRecommendation +from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata +from calibre.ebooks.metadata import authors_to_string + +class TXTOutput(OutputFormatPlugin): + + name = 'TXT Output' + author = 'John Schember' + file_type = 'txt' + + options = set([ + OptionRecommendation(name='newline', recommended_value='system', + level=OptionRecommendation.LOW, long_switch='newline', + short_switch='n', choices=TxtNewlines.NEWLINE_TYPES.keys(), + help=_('Type of newline to use. Options are %s. Default is \'system\'. ' + 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' + 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' + 'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))), + OptionRecommendation(name='prepend_author', recommended_value='true', + level=OptionRecommendation.LOW, long_switch='prepend_author', + choices=['true', 'false'], + help=_('Write the author to the beginning of the file. ' + 'Default is \'true\'. Use \'false\' to disable.')), + OptionRecommendation(name='prepend_title', recommended_value='true', + choices=['true', 'false'], + level=OptionRecommendation.LOW, long_switch='prepend_title', + help=_('Write the title to the beginning of the file. ' + 'Default is \'true\'. 
Use \'false\' to disable.')) + ]) + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + metadata = TxtMetadata() + if opts.prepend_author.lower() == 'true': + metadata.author = opts.authors if opts.authors else authors_to_string(oeb_book.metadata.authors) + if opts.prepend_title.lower() == 'true': + metadata.title = opts.title if opts.title else oeb_book.metadata.title + + writer = TxtWriter(TxtNewlines(opts.newline).newline, log) + txt = writer.dump(oeb_book.spine, metadata) + + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.write(txt) + + if close: + out_stream.close() diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 205d8423e3..eabc2d64ed 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -1,34 +1,26 @@ # -*- coding: utf-8 -*- +from __future__ import with_statement ''' Write content to TXT. 
''' -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' -import os, logging, re, sys +import os, re, sys + +from calibre.ebooks.htmlsymbols import HTML_SYMBOLS from BeautifulSoup import BeautifulSoup -from calibre import LoggingInterface -from calibre.ebooks.htmlsymbols import HTML_SYMBOLS -from calibre.ebooks.epub.iterator import SpineItem -from calibre.ebooks.metadata import authors_to_string -from calibre.ebooks.metadata.meta import metadata_from_formats -from calibre.ebooks.metadata.opf2 import OPF -from calibre.customize.ui import run_plugins_on_postprocess -from calibre.utils.config import Config, StringConfig - -class TXTWriter(object): - def __init__(self, newline): +class TxtWriter(object): + def __init__(self, newline, log): self.newline = newline + self.log = log - def dump(self, oebpath, path, metadata): - opf = OPF(oebpath, os.path.dirname(oebpath)) - spine = [SpineItem(i.path) for i in opf.spine] - - tmpout = '' + def dump(self, spine, metadata): + out = u'' for item in spine: with open(item, 'r') as itemf: content = itemf.read().decode(item.encoding) @@ -39,25 +31,21 @@ class TXTWriter(object): content = self.replace_html_symbols(content) content = self.cleanup_text(content) content = self.specified_newlines(content) - tmpout = tmpout + content + out += content # Prepend metadata if metadata.author != None and metadata.author != '': - tmpout = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + tmpout + out = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + out if metadata.title != None and metadata.title != '': - tmpout = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + tmpout + out = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + out # Put two blank lines at end of file - - end = tmpout[-3 * 
len(self.newline):] + end = out[-3 * len(self.newline):] for i in range(3 - end.count(self.newline)): - tmpout = tmpout + self.newline + out += self.newline + + return out - if os.path.exists(path): - os.remove(path) - with open(path, 'w+b') as out: - out.write(tmpout.encode('utf-8')) - def strip_html(self, html): stripped = u'' @@ -149,14 +137,8 @@ class TXTWriter(object): if self.newline == '\n': return text - return text.replace('\n', self.newline) - -class TxtMetadata(object): - def __init__(self): - self.author = None - self.title = None - self.series = None - + return text.replace('\n', self.newline) + class TxtNewlines(object): NEWLINE_TYPES = { @@ -170,73 +152,7 @@ class TxtNewlines(object): self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) -def config(defaults=None): - desc = _('Options to control the conversion to TXT') - if defaults is None: - c = Config('txt', desc) - else: - c = StringConfig(defaults, desc) - - txt = c.add_group('TXT', _('TXT options.')) - - txt('newline', ['--newline'], default='system', - help=_('Type of newline to use. Options are %s. Default is \'system\'. ' - 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' - 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' - 'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))) - txt('prepend_author', ['--prepend-author'], default='true', - help=_('Write the author to the beginning of the file. ' - 'Default is \'true\'. Use \'false\' to disable.')) - txt('prepend_title', ['--prepend-title'], default='true', - help=_('Write the title to the beginning of the file. ' - 'Default is \'true\'. Use \'false\' to disable.')) - - return c - -def option_parser(): - c = config() - parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') - parser.add_option( - '-o', '--output', default=None, - help=_('Output file. 
Default is derived from input filename.')) - parser.add_option( - '-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def oeb2txt(opts, inpath): - logger = LoggingInterface(logging.getLogger('oeb2txt')) - logger.setup_cli_handler(opts.verbose) - - outpath = opts.output - if outpath is None: - outpath = os.path.basename(inpath) - outpath = os.path.splitext(outpath)[0] + '.txt' - - mi = metadata_from_formats([inpath]) - metadata = TxtMetadata() - if opts.prepend_author.lower() == 'true': - metadata.author = opts.authors if opts.authors else authors_to_string(mi.authors) - if opts.prepend_title.lower() == 'true': - metadata.title = opts.title if opts.title else mi.title - - newline = TxtNewlines(opts.newline) - - writer = TXTWriter(newline.newline) - writer.dump(inpath, outpath, metadata) - run_plugins_on_postprocess(outpath, 'txt') - logger.log_info(_('Output written to ') + outpath) - -def main(argv=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(argv[1:]) - if len(args) != 1: - parser.print_help() - return 1 - inpath = args[0] - retval = oeb2txt(opts, inpath) - return retval - -if __name__ == '__main__': - sys.exit(main()) - +class TxtMetadata(object): + def __init__(self): + self.title = None + self.author = None From 79e509eeb48bf7156e62bae9ca9291311dd25778 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 31 Mar 2009 20:23:49 -0400 Subject: [PATCH 14/16] Move PDF output to use new conversion framework --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/pdf/from_any.py | 69 --------------------- src/calibre/ebooks/pdf/output.py | 62 +++++++++++++++++++ src/calibre/ebooks/pdf/writer.py | 99 +++++------------------------- src/calibre/ebooks/txt/output.py | 1 + 5 files changed, 79 insertions(+), 155 deletions(-) delete mode 100644 src/calibre/ebooks/pdf/from_any.py create mode 100644 src/calibre/ebooks/pdf/output.py diff --git 
a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index acc7ba71ec..932261c45d 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -267,9 +267,10 @@ from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput +from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput] +plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput, PDFOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/pdf/from_any.py b/src/calibre/ebooks/pdf/from_any.py deleted file mode 100644 index e4fb937cdb..0000000000 --- a/src/calibre/ebooks/pdf/from_any.py +++ /dev/null @@ -1,69 +0,0 @@ -''' -Convert any ebook format to PDF. -''' - -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \ - 'and Marshall T. 
Vandegrift <llasram@gmail.com>' \ - 'and John Schember <john@nachtimwald.com>' -__docformat__ = 'restructuredtext en' - -import sys, os, glob, logging - -from calibre.ebooks.epub.from_any import any2epub, formats, USAGE -from calibre.ebooks.epub import config as common_config -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.pdf.writer import oeb2pdf, config as pdf_config - -def config(defaults=None): - c = common_config(defaults=defaults, name='pdf') - c.remove_opt('profile') - pdfc = pdf_config(defaults=defaults) - c.update(pdfc) - return c - -def option_parser(usage=USAGE): - usage = usage % ('PDF', formats()) - parser = config().option_parser(usage=usage) - return parser - -def any2pdf(opts, path, notification=None): - ext = os.path.splitext(path)[1] - if not ext: - raise ValueError('Unknown file type: '+path) - ext = ext.lower()[1:] - - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(path))[0]+'.pdf' - - opts.output = os.path.abspath(opts.output) - orig_output = opts.output - - with TemporaryDirectory('_any2pdf') as tdir: - oebdir = os.path.join(tdir, 'oeb') - os.mkdir(oebdir) - opts.output = os.path.join(tdir, 'dummy.epub') - opts.profile = 'None' - opts.dont_split_on_page_breaks = True - orig_bfs = opts.base_font_size2 - opts.base_font_size2 = 0 - any2epub(opts, path, create_epub=False, oeb_cover=True, extract_to=oebdir) - opts.base_font_size2 = orig_bfs - opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] - opts.output = orig_output - logging.getLogger('html2epub').info(_('Creating PDF file from EPUB...')) - oeb2pdf(opts, opf) - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print 'No input file specified.' 
- return 1 - any2pdf(opts, args[1]) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py new file mode 100644 index 0000000000..71bd77ee73 --- /dev/null +++ b/src/calibre/ebooks/pdf/output.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Convert OEB ebook format to PDF. +''' + +#unit, papersize, orientation, custom_size, profile + +import os + +from calibre.customize.conversion import OutputFormatPlugin, \ + OptionRecommendation +from calibre.ebooks.pdf.writer import PDFWriter, PDFMargins + +class PDFOutput(OutputFormatPlugin): + + name = 'PDF Output' + author = 'John Schember' + file_type = 'pdf' + + options = set([ + OptionRecommendation(name='margin_top', recommended_value='1', + level=OptionRecommendation.LOW, long_switch='margin_top', + help=_('The top margin around the document.')), + OptionRecommendation(name='margin_bottom', recommended_value='1', + level=OptionRecommendation.LOW, long_switch='margin_bottom', + help=_('The bottom margin around the document.')), + OptionRecommendation(name='margin_left', recommended_value='1', + level=OptionRecommendation.LOW, long_switch='margin_left', + help=_('The left margin around the document.')), + OptionRecommendation(name='margin_right', recommended_value='1', + level=OptionRecommendation.LOW, long_switch='margin_right', + help=_('The right margin around the document.')), + ]) + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + margins = PDFMargins() + margins.top = opts.margin_top + margins.bottom = opts.margin_bottom + margins.left = opts.margin_left + margins.right = opts.margin_right + + writer = PDFWriter(log, margins) + + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + 
os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.truncate() + writer.dump(oeb_book.spine, out_stream) + + if close: + out_stream.close() diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index c189407dac..511c968a20 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -1,20 +1,17 @@ -''' -Write content to PDF. -''' +# -*- coding: utf-8 -*- from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' -import os, logging, shutil, sys +''' +Write content to PDF. +''' + +import os, shutil, sys -from calibre import LoggingInterface -from calibre.ebooks.epub.iterator import SpineItem -from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.customize.ui import run_plugins_on_postprocess -from calibre.utils.config import Config, StringConfig - from PyQt4 import QtCore from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \ QMetaObject, Qt @@ -29,13 +26,14 @@ class PDFMargins: self.left = margin self.right = margin + class PDFWriter(QObject): - def __init__(self, margins=PDFMargins()): + def __init__(self, log, margins=PDFMargins()): if QApplication.instance() is None: QApplication([]) QObject.__init__(self) - self.logger = logging.getLogger('oeb2pdf') + self.logger = log self.loop = QEventLoop() self.view = QWebView() @@ -45,13 +43,12 @@ class PDFWriter(QObject): self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') self.margins = margins - def dump(self, oebpath, path): + def dump(self, spine, out_stream): self._delete_tmpdir() - opf = OPF(oebpath, os.path.dirname(oebpath)) - self.render_queue = [SpineItem(i.path) for i in opf.spine] + self.render_queue = spine[:] self.combine_queue = [] - self.path = path + 
self.out_stream = out_stream QMetaObject.invokeMethod(self, "_render_book", Qt.QueuedConnection) self.loop.exec_() @@ -98,75 +95,7 @@ class PDFWriter(QObject): inputPDF = PdfFileReader(file(item, 'rb')) for page in inputPDF.pages: outPDF.addPage(page) - outputStream = file(self.path, 'wb') - outPDF.write(outputStream) - outputStream.close() + outPDF.write(self.out_stream) finally: self._delete_tmpdir() self.loop.exit(0) - - -def config(defaults=None): - desc = _('Options to control the conversion to PDF') - if defaults is None: - c = Config('pdf', desc) - else: - c = StringConfig(defaults, desc) - - pdf = c.add_group('PDF', _('PDF options.')) - - pdf('margin_top', ['--margin_top'], default=1, - help=_('The top margin around the document in inches.')) - pdf('margin_bottom', ['--margin_bottom'], default=1, - help=_('The bottom margin around the document in inches.')) - pdf('margin_left', ['--margin_left'], default=1, - help=_('The left margin around the document in inches.')) - pdf('margin_right', ['--margin_right'], default=1, - help=_('The right margin around the document in inches.')) - - return c - -def option_parser(): - c = config() - parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') - parser.add_option( - '-o', '--output', default=None, - help=_('Output file. 
Default is derived from input filename.')) - parser.add_option( - '-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def oeb2pdf(opts, inpath): - logger = LoggingInterface(logging.getLogger('oeb2pdf')) - logger.setup_cli_handler(opts.verbose) - - outpath = opts.output - if outpath is None: - outpath = os.path.basename(inpath) - outpath = os.path.splitext(outpath)[0] + '.pdf' - - margins = PDFMargins() - margins.top = opts.margin_top - margins.bottom = opts.margin_bottom - margins.left = opts.margin_left - margins.right = opts.margin_right - - writer = PDFWriter(margins) - writer.dump(inpath, outpath) - run_plugins_on_postprocess(outpath, 'pdf') - logger.log_info(_('Output written to ') + outpath) - -def main(argv=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(argv[1:]) - if len(args) != 1: - parser.print_help() - return 1 - inpath = args[0] - retval = oeb2pdf(opts, inpath) - return retval - -if __name__ == '__main__': - sys.exit(main()) - diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 21498074ac..7d44172b3f 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -56,6 +56,7 @@ class TXTOutput(OutputFormatPlugin): out_stream = output_path out_stream.seek(0) + out_stream.truncate() out_stream.write(txt) if close: From 596e3f71388cef57c1e7593c796431a984e66233 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 1 Apr 2009 07:39:41 -0400 Subject: [PATCH 15/16] More robust pdf output --- src/calibre/ebooks/pdf/output.py | 37 ++++++++-- src/calibre/ebooks/pdf/pageoptions.py | 98 +++++++++++++++++++++++++++ src/calibre/ebooks/pdf/writer.py | 18 ++--- 3 files changed, 135 insertions(+), 18 deletions(-) create mode 100644 src/calibre/ebooks/pdf/pageoptions.py diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 71bd77ee73..5af4e4bed7 100644 --- 
a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -13,7 +13,9 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation -from calibre.ebooks.pdf.writer import PDFWriter, PDFMargins +from calibre.ebooks.pdf.writer import PDFWriter +from calibre.ebooks.pdf.pageoptions import UNITS, unit, PAPER_SIZES, \ + paper_size, ORIENTATIONS, orientation, PageOptions class PDFOutput(OutputFormatPlugin): @@ -34,16 +36,37 @@ class PDFOutput(OutputFormatPlugin): OptionRecommendation(name='margin_right', recommended_value='1', level=OptionRecommendation.LOW, long_switch='margin_right', help=_('The right margin around the document.')), + + OptionRecommendation(name='unit', recommended_value='inch', + level=OptionRecommendation.LOW, short_switch='u', + long_switch='unit', choices=UNITS.keys(), + help=_('The unit of measure. Default is inch. Choices ' + 'are %s' % UNITS.keys())), + OptionRecommendation(name='paper_size', recommended_value='letter', + level=OptionRecommendation.LOW, + long_switch='paper_size', choices=PAPER_SIZES.keys(), + help=_('The size of the paper. Default is letter. Choices ' + 'are %s' % PAPER_SIZES.keys())), + OptionRecommendation(name='orientation', recommended_value='portrait', + level=OptionRecommendation.LOW, + long_switch='orientation', choices=ORIENTATIONS.keys(), + help=_('The orientation of the page. Default is portrait. 
Choices ' + 'are %s' % ORIENTATIONS.keys())), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - margins = PDFMargins() - margins.top = opts.margin_top - margins.bottom = opts.margin_bottom - margins.left = opts.margin_left - margins.right = opts.margin_right + popts = PageOptions() + + popts.set_margin_top(opts.margin_top) + popts.set_margin_bottom(opts.margin_bottom) + popts.set_margin_left(opts.margin_left) + popts.set_margin_right(opts.margin_right) + + popts.unit = unit(opts.unit) + popts.paper_size = paper_size(opts.paper_size) + popts.orientation = orientation(opts.orientation) - writer = PDFWriter(log, margins) + writer = PDFWriter(log, popts) close = False if not hasattr(output_path, 'write'): diff --git a/src/calibre/ebooks/pdf/pageoptions.py b/src/calibre/ebooks/pdf/pageoptions.py new file mode 100644 index 0000000000..26fae81662 --- /dev/null +++ b/src/calibre/ebooks/pdf/pageoptions.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +from PyQt4.Qt import QPrinter + +UNITS = { + 'millimeter' : QPrinter.Millimeter, + 'point' : QPrinter.Point, + 'inch' : QPrinter.Inch, + 'pica' : QPrinter.Pica, + 'didot' : QPrinter.Didot, + 'cicero' : QPrinter.Cicero, + 'devicepixel' : QPrinter.DevicePixel, + } + +def unit(unit): + return UNITS.get(unit, QPrinter.Inch) + +PAPER_SIZES = { + 'a0' : QPrinter.A0, # 841 x 1189 mm + 'a1' : QPrinter.A1, # 594 x 841 mm + 'a2' : QPrinter.A2, # 420 x 594 mm + 'a3' : QPrinter.A3, # 297 x 420 mm + 'a4' : QPrinter.A4, # 210 x 297 mm, 8.26 x 11.69 inches + 'a5' : QPrinter.A5, # 148 x 210 mm + 'a6' : QPrinter.A6, # 105 x 148 mm + 'a7' : QPrinter.A7, # 74 x 105 mm + 'a8' : QPrinter.A8, # 52 x 74 mm + 'a9' : QPrinter.A9, # 37 x 52 mm + 'b0' : QPrinter.B0, # 1030 x 1456 mm + 'b1' : QPrinter.B1, # 728 x 1030 mm + 'b2' : QPrinter.B2, # 515 x 728 mm + 'b3' : QPrinter.B3, # 364 x 515 mm + 'b4' : 
QPrinter.B4, # 257 x 364 mm + 'b5' : QPrinter.B5, # 182 x 257 mm, 7.17 x 10.13 inches + 'b6' : QPrinter.B6, # 128 x 182 mm + 'b7' : QPrinter.B7, # 91 x 128 mm + 'b8' : QPrinter.B8, # 64 x 91 mm + 'b9' : QPrinter.B9, # 45 x 64 mm + 'b10' : QPrinter.B10, # 32 x 45 mm + 'c5e' : QPrinter.C5E, # 163 x 229 mm + 'comm10e' : QPrinter.Comm10E, # 105 x 241 mm, U.S. Common 10 Envelope + 'dle' : QPrinter.DLE, # 110 x 220 mm + 'executive' : QPrinter.Executive, # 7.5 x 10 inches, 191 x 254 mm + 'folio' : QPrinter.Folio, # 210 x 330 mm + 'ledger' : QPrinter.Ledger, # 432 x 279 mm + 'legal' : QPrinter.Legal, # 8.5 x 14 inches, 216 x 356 mm + 'letter' : QPrinter.Letter, # 8.5 x 11 inches, 216 x 279 mm + 'tabloid' : QPrinter.Tabloid, # 279 x 432 mm + #'custom' : QPrinter.Custom, # Unknown, or a user defined size. + } + +def paper_size(size): + return PAPER_SIZES.get(size, QPrinter.Letter) + +ORIENTATIONS = { + 'portrait' : QPrinter.Portrait, + 'landscape' : QPrinter.Landscape, + } + +def orientation(orientation): + return ORIENTATIONS.get(orientation, QPrinter.Portrait) + + +class PageOptions(object): + margin_top = 1 + margin_bottom = 1 + margin_left = 1 + margin_right = 1 + unit = QPrinter.Inch + paper_size = QPrinter.Letter + orientation = QPrinter.Portrait + + def set_margin_top(self, size): + try: + self.margin_top = int(size) + except: + self.margin_top = 1 + + def set_margin_bottom(self, size): + try: + self.margin_bottom = int(size) + except: + self.margin_bottom = 1 + + def set_margin_left(self, size): + try: + self.margin_left = int(size) + except: + self.margin_left = 1 + + def set_margin_right(self, size): + try: + self.margin_right = int(size) + except: + self.margin_right = 1 diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 511c968a20..cf77aebc14 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -12,23 +12,17 @@ Write content to PDF. 
import os, shutil, sys from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.ebooks.pdf.pageoptions import PageOptions + from PyQt4 import QtCore from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \ QMetaObject, Qt from PyQt4.QtWebKit import QWebView from pyPdf import PdfFileWriter, PdfFileReader - -class PDFMargins: - def __init__(self, margin=1): - self.top = margin - self.bottom = margin - self.left = margin - self.right = margin - class PDFWriter(QObject): - def __init__(self, log, margins=PDFMargins()): + def __init__(self, log, popts=PageOptions()): if QApplication.instance() is None: QApplication([]) QObject.__init__(self) @@ -41,7 +35,7 @@ class PDFWriter(QObject): self.render_queue = [] self.combine_queue = [] self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') - self.margins = margins + self.popts = popts def dump(self, spine, out_stream): self._delete_tmpdir() @@ -75,7 +69,9 @@ class PDFWriter(QObject): self.logger.debug('\tRendering item as %s' % item_path) printer = QPrinter(QPrinter.HighResolution) - printer.setPageMargins(self.margins.left, self.margins.top, self.margins.right, self.margins.bottom, QPrinter.Inch) + printer.setPageMargins(self.popts.margin_left, self.popts.margin_top, self.popts.margin_right, self.popts.margin_bottom, self.popts.unit) + printer.setPaperSize(self.popts.paper_size) + printer.setOrientation(self.popts.orientation) printer.setOutputFormat(QPrinter.PdfFormat) printer.setOutputFileName(item_path) self.view.print_(printer) From 118fd6ece0625f9bb95657df74401abe46f775ad Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 1 Apr 2009 08:08:03 -0400 Subject: [PATCH 16/16] reverse pdfmanipulate command --- src/calibre/ebooks/pdf/manipulate.py | 11 ++-- src/calibre/ebooks/pdf/reverse.py | 88 ++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 5 deletions(-) create mode 100644 src/calibre/ebooks/pdf/reverse.py diff --git 
a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py index 262aaf78d4..8c49650730 100644 --- a/src/calibre/ebooks/pdf/manipulate.py +++ b/src/calibre/ebooks/pdf/manipulate.py @@ -10,13 +10,14 @@ __docformat__ = 'restructuredtext en' import string, sys from calibre.utils.config import Config, StringConfig -from calibre.ebooks.pdf import info, merge, split, trim +from calibre.ebooks.pdf import info, merge, reverse, split, trim COMMANDS = { - 'info' : info, - 'merge' : merge, - 'split' : split, - 'trim' : trim, + 'info' : info, + 'merge' : merge, + 'reverse' : reverse, + 'split' : split, + 'trim' : trim, } def config(defaults=None): diff --git a/src/calibre/ebooks/pdf/reverse.py b/src/calibre/ebooks/pdf/reverse.py new file mode 100644 index 0000000000..87bb9018c1 --- /dev/null +++ b/src/calibre/ebooks/pdf/reverse.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Reverse content of PDF. +''' + +import os, sys + +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata import authors_to_string +from calibre.utils.config import Config, StringConfig + +from pyPdf import PdfFileWriter, PdfFileReader + +def config(defaults=None): + desc = _('Options to control the transformation of pdf') + if defaults is None: + c = Config('reversepdf', desc) + else: + c = StringConfig(defaults, desc) + c.add_opt('output', ['-o', '--output'], default='reversed.pdf', + help=_('Path to output file. By default a file is created in the current directory.')) + return c + +def option_parser(name): + c = config() + return c.option_parser(usage=_('''\ + %prog %%name [options] file1.pdf + + Reverse PDF. 
+ '''.replace('%%name', name))) + +def reverse(pdf_path, out_path, metadata=None): + if metadata == None: + title = _('Unknown') + author = _('Unknown') + else: + title = metadata.title + author = authors_to_string(metadata.authors) + + out_pdf = PdfFileWriter(title=title, author=author) + + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + for page in reversed(pdf.pages): + out_pdf.addPage(page) + + with open(out_path, 'wb') as out_file: + out_pdf.write(out_file) + +# Return True if the pdf is valid. +def valid_pdf(pdf_path): + try: + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + if pdf.isEncrypted or pdf.numPages <= 0: + raise Exception + except: + return False + return True + + +def main(args=sys.argv, name=''): + parser = option_parser(name) + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 1: + print 'Error: A PDF file is required.\n\n' + print parser.get_usage() + return 2 + + if not valid_pdf(args[0]): + print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % args[0] + return 2 + + mi = metadata_from_formats([args[0]]) + + reverse(args[0], opts.output, mi) + + return 0 + +if __name__ == '__main__': + sys.exit(main())