From 9c82e833acb21f074d4373d246dd21842703240b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 13 Aug 2007 22:13:45 +0000 Subject: [PATCH] Implemented any2lrf --- setup.py | 1 + src/libprs500/__init__.py | 3 +- src/libprs500/devices/interface.py | 2 +- src/libprs500/ebooks/lrf/any/__init__.py | 14 ++ src/libprs500/ebooks/lrf/any/convert_from.py | 136 +++++++++++++++ src/libprs500/ebooks/lrf/html/convert_from.py | 164 +++++------------- src/libprs500/ebooks/lrf/lit/convert_from.py | 53 +++--- src/libprs500/ebooks/lrf/meta.py | 26 ++- src/libprs500/ebooks/lrf/pdf/convert_from.py | 43 +++-- src/libprs500/ebooks/lrf/rtf/convert_from.py | 52 +++--- src/libprs500/ebooks/lrf/txt/convert_from.py | 37 ++-- src/libprs500/ebooks/metadata/__init__.py | 14 +- src/libprs500/ebooks/metadata/opf.py | 155 +++++++++++++++++ src/libprs500/ebooks/metadata/rtf.py | 5 + src/libprs500/gui/database.py | 2 - src/libprs500/gui2/dialogs/__init__.py | 1 + src/libprs500/gui2/main.py | 2 +- src/libprs500/linux.py | 4 +- 18 files changed, 506 insertions(+), 208 deletions(-) create mode 100644 src/libprs500/ebooks/lrf/any/__init__.py create mode 100644 src/libprs500/ebooks/lrf/any/convert_from.py create mode 100644 src/libprs500/ebooks/metadata/opf.py diff --git a/setup.py b/setup.py index 386ce97789..d8ab326511 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ entry_points = { 'web2disk = libprs500.web.fetch.simple:main',\ 'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',\ 'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',\ + 'any2lrf = libprs500.ebooks.lrf.any.convert_from:main',\ 'libprs500-beta = libprs500.gui2.main:main',\ ], 'gui_scripts' : [ APPNAME+' = libprs500.gui.main:main'] diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py index 8fbe6f627e..0b63eb6d91 100644 --- a/src/libprs500/__init__.py +++ b/src/libprs500/__init__.py @@ -64,14 +64,13 @@ def filename_to_utf8(name): return name.decode(codec, 'replace').encode('utf8') def extract(path, dir): - 
import os ext = os.path.splitext(path)[1][1:].lower() extractor = None if ext == 'zip': from libprs500.libunzip import extract extractor = extract elif ext == 'rar': - from libprs500.libunrar import extract + from libprs500.libunrar import extract # In case the dll is not found extractor = extract if not extractor: raise Exception('Unknown archive type') diff --git a/src/libprs500/devices/interface.py b/src/libprs500/devices/interface.py index bcaed5e495..ad9dcc3afa 100644 --- a/src/libprs500/devices/interface.py +++ b/src/libprs500/devices/interface.py @@ -131,7 +131,7 @@ class Device(object): keys C{title}, C{authors}, C{cover}, C{tags}. The value of the C{cover} element can be None or a three element tuple (width, height, data) where data is the image data in JPEG format as a string. C{tags} must be - a possibly empty list of strings. + a possibly empty list of strings. C{authors} must be a string. @param booklists: A tuple containing the result of calls to (L{books}(oncard=False), L{books}(oncard=True)). ''' diff --git a/src/libprs500/ebooks/lrf/any/__init__.py b/src/libprs500/ebooks/lrf/any/__init__.py new file mode 100644 index 0000000000..aaf49de99e --- /dev/null +++ b/src/libprs500/ebooks/lrf/any/__init__.py @@ -0,0 +1,14 @@ +## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. 
+## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. diff --git a/src/libprs500/ebooks/lrf/any/convert_from.py b/src/libprs500/ebooks/lrf/any/convert_from.py new file mode 100644 index 0000000000..23c863afda --- /dev/null +++ b/src/libprs500/ebooks/lrf/any/convert_from.py @@ -0,0 +1,136 @@ +## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+'''Convert any ebook file into a LRF file.''' + +import sys, os, logging, shutil, tempfile, glob + +from libprs500.ebooks.lrf import option_parser +from libprs500 import __appname__, setup_cli_handlers, extract +from libprs500.ebooks.lrf.lit.convert_from import process_file as lit2lrf +from libprs500.ebooks.lrf.pdf.convert_from import process_file as pdf2lrf +from libprs500.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf +from libprs500.ebooks.lrf.txt.convert_from import process_file as txt2lrf +from libprs500.ebooks.lrf.html.convert_from import process_file as html2lrf + +def largest_file(files): + maxsize, file = 0, None + for f in files: + size = os.stat(f).st_size + if size > maxsize: + maxsize = size + file = f + return file + +def find_htmlfile(dir): + for pair in (('*toc*.htm*', '*toc*.xhtm*'), ('*.htm*', '*.xhtm*')): + files = glob.glob(os.path.join(dir, pair[0])) + files += glob.glob(os.path.join(dir, pair[1])) + file = largest_file(files) + if file: + return file + + +def handle_archive(path): + tdir = tempfile.mkdtemp(prefix=__appname__+'_') + extract(path, tdir) + files = [] + cdir = tdir + temp = os.listdir(tdir) + file = None + if len(temp) == 1 and os.path.isdir(os.path.join(tdir, temp[0])): + cdir = os.path.join(tdir, temp[0]) + for ext in ('lit', 'rtf', 'pdf', 'txt'): + pat = os.path.join(cdir, '*.'+ext) + files.extend(glob.glob(pat)) + file = largest_file(files) + if file: + return tdir, file + file = find_htmlfile(cdir) + return tdir, file + +def process_file(path, options, logger=None): + path = os.path.abspath(os.path.expanduser(path)) + tdir = None + if logger is None: + level = logging.DEBUG if options.verbose else logging.INFO + logger = logging.getLogger('any2lrf') + setup_cli_handlers(logger, level) + if not os.access(path, os.R_OK): + logger.critical('Cannot read from %s', path) + return 1 + ext = os.path.splitext(path)[1] + if not ext or ext == '.': + logger.critical('Unknown file type: %s', path) + return 1 + ext = 
ext[1:].lower() + cwd = os.getcwd() + if not options.output: + fmt = '.lrs' if options.lrs else '.lrf' + options.output = os.path.splitext(os.path.basename(path))[0] + fmt + options.output = os.path.abspath(os.path.expanduser(options.output)) + if ext in ['zip', 'rar']: + newpath = None + try: + tdir, newpath = handle_archive(path) + except: + logger.exception(' ') + if not newpath: + logger.critical('Could not find ebook in archive') + return 1 + path = newpath + logger.info('Found ebook in archive: %s', path) + try: + ext = os.path.splitext(path)[1][1:].lower() + convertor = None + if 'htm' in ext: + convertor = html2lrf + elif 'lit' == ext: + convertor = lit2lrf + elif 'pdf' == ext: + convertor = pdf2lrf + elif 'rtf' == ext: + convertor = rtf2lrf + elif 'txt' == ext: + convertor = txt2lrf + convertor(path, options, logger) + finally: + os.chdir(cwd) + if tdir and os.path.exists(tdir): + shutil.rmtree(tdir) + + +def main(args=sys.argv, logger=None): + parser = option_parser('''\ +any2lrf myfile + +Convert any ebook format into LRF. Supported formats are: +LIT, RTF, TXT, HTML and PDF. any2lrf will also process a RAR or +ZIP archive. + ''') + options, args = parser.parse_args(args) + if len(args) != 2: + parser.print_help() + print + print 'No file to convert specified.' + return 1 + + process_file(args[1], options, logger) + + + + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 901c0c1137..3949531b4a 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -20,20 +20,18 @@ Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. 
""" -import os, re, sys, shutil, copy, glob, logging +import os, re, sys, copy, glob, logging from htmlentitydefs import name2codepoint from urllib import unquote from urlparse import urlparse -from tempfile import mkdtemp -from operator import itemgetter from math import ceil, floor try: from PIL import Image as PILImage except ImportError: import Image as PILImage -from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \ - Comment, Tag, NavigableString, Declaration, ProcessingInstruction +from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \ + NavigableString, Declaration, ProcessingInstruction from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ @@ -43,8 +41,9 @@ from libprs500.ebooks.lrf import Book from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks import ConversionError from libprs500.ebooks.lrf.html.table import Table -from libprs500 import extract, filename_to_utf8, setup_cli_handlers +from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__ from libprs500.ptempfile import PersistentTemporaryFile +from libprs500.ebooks.metadata.opf import OPFReader class Span(_Span): replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] @@ -643,7 +642,7 @@ class HTMLConverter(object): except Exception: self.logger.warning('Unable to process %s', path) if self.verbose: - self.logger.exception('') + self.logger.exception(' ') continue finally: os.chdir(cwd) @@ -1291,15 +1290,13 @@ def process_file(path, options, logger=None): logger = logging.getLogger('html2lrf') setup_cli_handlers(logger, level) cwd = os.getcwd() - dirpath = None default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0]) + dirpath = os.path.dirname(path) try: - dirpath, path = get_path(path) cpath, tpath = '', '' 
try_opf(path, options, logger) - if options.cover: - dp = dirpath if dirpath else os.path.dirname(path) - cpath = os.path.join(dp, os.path.basename(options.cover)) + if options.cover: + cpath = os.path.join(dirpath, os.path.basename(options.cover)) if not os.path.exists(cpath): cpath = os.path.abspath(os.path.expanduser(options.cover)) options.cover = cpath @@ -1309,7 +1306,7 @@ def process_file(path, options, logger=None): cim = im.resize((options.profile.screen_width, options.profile.screen_height), PILImage.BICUBIC).convert('RGB') - cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg") + cf = PersistentTemporaryFile(prefix=__appname__+"_", suffix=".jpg") cf.close() cim.save(cf.name) cpath = cf.name @@ -1376,70 +1373,57 @@ def process_file(path, options, logger=None): return oname finally: os.chdir(cwd) - if dirpath: - shutil.rmtree(dirpath, True) def try_opf(path, options, logger): try: opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0] except IndexError: return - soup = BeautifulStoneSoup(open(opf).read()) + opf = OPFReader(open(opf, 'rb')) try: - title = soup.package.metadata.find('dc:title') + title = opf.title if title and not options.title: - options.title = title.string - creators = soup.package.metadata.findAll('dc:creator') + options.title = title if options.author == 'Unknown': - for author in creators: - role = author.get('role') - if not role: - role = author.get('opf:role') - if role == 'aut': - options.author = author.string - fa = author.get('file-as') - if fa: - options.author_sort = fa + if opf.authors: + options.author = ', '.join(opf.authors) + if opf.author_sort: + options.author_sort = opf.author_sort if options.publisher == 'Unknown': - publisher = soup.package.metadata.find('dc:publisher') + publisher = opf.publisher if publisher: - options.publisher = publisher.string - if not options.category.strip(): - category = soup.package.metadata.find('dc:type') + options.publisher = publisher + if not options.category: + 
category = opf.category if category: - options.category = category.string - isbn = [] - for item in soup.package.metadata.findAll('dc:identifier'): - scheme = item.get('scheme') - if not scheme: - scheme = item.get('opf:scheme') - isbn.append((scheme, item.string)) - if not options.cover: - for item in isbn: - src = item[1].replace('-', '') - matches = glob.glob(os.path.join(os.path.dirname(path), src+'.*')) - for match in matches: - test = os.path.splitext(match)[1].lower() - if test in ['.jpeg', '.jpg', '.gif', '.png']: - options.cover = match - break - - + options.category = category if not options.cover: - # Search for cover image in opf as created by convertlit - ref = soup.package.find('reference', {'type':'other.ms-coverimage-standard'}) - if ref: - try: - options.cover = os.path.join(os.path.dirname(path), ref.get('href')) - if not os.access(options.cover, os.R_OK): - options.cover = None - except: - logger.exception('Could not load cover') + cover = opf.cover + if cover: + cover = os.path.join(os.path.dirname(path), cover) + if os.access(cover, os.R_OK): + try: + PILImage.open(cover) + options.cover = cover + except: + pass + if not options.cover: + for prefix in opf.possible_cover_prefixes(): + if options.cover: + break + for suffix in ['.jpg', '.jpeg', '.gif', '.png', '.bmp']: + cpath = os.path.join(os.path.dirname(path), prefix+suffix) + try: + PILImage.open(cpath) + options.cover = cpath + break + except: + continue except Exception: logger.exception('Failed to process opf file') def option_parser(): - return lrf_option_parser('''Usage: %prog [options] mybook.[html|rar|zip]\n\n''' + return lrf_option_parser('''Usage: %prog [options] mybook.html\n\n''' '''%prog converts mybook.html to mybook.lrf''') def main(args=sys.argv): @@ -1461,66 +1445,6 @@ def main(args=sys.argv): process_file(src, options) return 0 -def console_query(dirpath, candidate, docs): - if len(docs) == 1: - return 0 - try: - import readline - except ImportError: - pass - i = 0 - for doc 
in docs: - prefix = '>' if i == candidate else '' - print prefix+str(i)+'.\t', doc[0] - i += 1 - print - while True: - try: - choice = raw_input('Choose file to convert (0-'+str(i-1) + \ - '). Current choice is ['+ str(candidate) + ']:') - if not choice: - return candidate - choice = int(choice) - if choice < 0 or choice >= i: - continue - candidate = choice - except EOFError, KeyboardInterrupt: - sys.exit() - except: - continue - break - return candidate - - -def get_path(path, query=console_query): - path = os.path.abspath(os.path.expanduser(path)) - ext = os.path.splitext(path)[1][1:].lower() - if ext in ['htm', 'html', 'xhtml', 'php']: - return None, path - dirpath = mkdtemp('','html2lrf') - extract(path, dirpath) - candidate, docs = None, [] - for root, dirs, files in os.walk(dirpath): - for name in files: - ext = os.path.splitext(name)[1][1:].lower() - if ext not in ['html', 'xhtml', 'htm', 'xhtm']: - continue - docs.append((name, root, os.stat(os.path.join(root, name)).st_size)) - if 'toc' in name.lower(): - candidate = name - docs.sort(key=itemgetter(2)) - if candidate: - for i in range(len(docs)): - if docs[i][0] == candidate: - candidate = i - break - else: - candidate = len(docs) - 1 - if len(docs) == 0: - raise ConversionError('No suitable files found in archive') - if len(docs) > 0: - candidate = query(dirpath, candidate, docs) - return dirpath, os.path.join(docs[candidate][1], docs[candidate][0]) if __name__ == '__main__': diff --git a/src/libprs500/ebooks/lrf/lit/convert_from.py b/src/libprs500/ebooks/lrf/lit/convert_from.py index 994abde005..6d6d9d5cbd 100644 --- a/src/libprs500/ebooks/lrf/lit/convert_from.py +++ b/src/libprs500/ebooks/lrf/lit/convert_from.py @@ -12,13 +12,13 @@ ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-import os, sys, shutil, glob +import os, sys, shutil, glob, logging from tempfile import mkdtemp from subprocess import Popen, PIPE from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks import ConversionError -from libprs500.ebooks.lrf.html.convert_from import process_file -from libprs500 import isosx, __appname__ +from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file +from libprs500 import isosx, __appname__, setup_cli_handlers CLIT = 'clit' if isosx and hasattr(sys, 'frameworks_dir'): CLIT = os.path.join(sys.frameworks_dir, CLIT) @@ -29,29 +29,27 @@ def option_parser(): '''%prog converts mybook.lit to mybook.lrf''' ) -def generate_html(pathtolit): +def generate_html(pathtolit, logger): if not os.access(pathtolit, os.R_OK): raise ConversionError, 'Cannot read from ' + pathtolit tdir = mkdtemp(prefix=__appname__+'_') cmd = ' '.join([CLIT, '"'+pathtolit+'"', tdir]) - p = Popen(cmd, shell=True, stderr=PIPE) + p = Popen(cmd, shell=True, stderr=PIPE, stdout=PIPE) ret = p.wait() + logger.info(p.stdout.read()) if ret != 0: shutil.rmtree(tdir) err = p.stderr.read() raise ConversionError, err return tdir -def main(args=sys.argv): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No lit file specified' - return 1 - lit = os.path.abspath(os.path.expanduser(args[1])) - tdir = generate_html(lit) +def process_file(path, options, logger=None): + if logger is None: + level = logging.DEBUG if options.verbose else logging.INFO + logger = logging.getLogger('lit2lrf') + setup_cli_handlers(logger, level) + lit = os.path.abspath(os.path.expanduser(path)) + tdir = generate_html(lit, logger) try: l = glob.glob(os.path.join(tdir, '*toc*.htm*')) if not l: @@ -61,7 +59,9 @@ def main(args=sys.argv): if not l: l = glob.glob(os.path.join(tdir, '*.htm*')) if not l: - raise ConversionError, 'Conversion of lit to html failed. Cannot find html file.' 
+ l = glob.glob(os.path.join(tdir, '*.txt*')) # Some lit files apparently have .txt files in them + if not l: + raise ConversionError('Conversion of lit to html failed. Cannot find html file.') maxsize, htmlfile = 0, None for c in l: sz = os.path.getsize(c) @@ -71,13 +71,24 @@ def main(args=sys.argv): htmlfile = l[0] if not options.output: ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext) - else: - options.output = os.path.abspath(options.output) - process_file(htmlfile, options) + options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) + options.output = os.path.abspath(os.path.expanduser(options.output)) + html_process_file(htmlfile, options, logger=logger) finally: shutil.rmtree(tdir) - + + +def main(args=sys.argv, logger=None): + parser = option_parser() + options, args = parser.parse_args(args) + if len(args) != 2: + parser.print_help() + print + print 'No lit file specified' + return 1 + process_file(args[1], options, logger) + return 0 + if __name__ == '__main__': sys.exit(main()) diff --git a/src/libprs500/ebooks/lrf/meta.py b/src/libprs500/ebooks/lrf/meta.py index dc757e51a3..4713830392 100644 --- a/src/libprs500/ebooks/lrf/meta.py +++ b/src/libprs500/ebooks/lrf/meta.py @@ -254,19 +254,35 @@ def get_metadata(stream): L{MetaInformation} object. 
""" lrf = LRFMetaFile(stream) - mi = MetaInformation(lrf.title.strip(), lrf.author.strip()) + au = lrf.author.strip().split(',') + authors = [] + for i in au: + authors.extend(i.split('&')) + mi = MetaInformation(lrf.title.strip(), authors) + mi.author = lrf.author.strip() mi.comments = lrf.free_text.strip() - mi.category = lrf.category.strip() - mi.classification = lrf.classification.strip() + mi.category = lrf.category.strip()+', '+lrf.classification.strip() mi.publisher = lrf.publisher.strip() + try: + mi.title_sort = lrf.title_reading.strip() + if not mi.title_sort: + mi.title_sort = None + except: + pass + try: + mi.author_sort = lrf.author_reading.strip() + if not mi.author_sort: + mi.author_sort = None + except: + pass if not mi.title or 'unknown' in mi.title.lower(): mi.title = None + if not mi.authors: + mi.authors = None if not mi.author or 'unknown' in mi.author.lower(): mi.author = None if not mi.category or 'unknown' in mi.category.lower(): mi.category = None - if not mi.classification or 'unknown' in mi.classification.lower(): - mi.classification = None if not mi.publisher or 'unknown' in mi.publisher.lower() or \ 'some publisher' in mi.publisher.lower(): mi.publisher = None diff --git a/src/libprs500/ebooks/lrf/pdf/convert_from.py b/src/libprs500/ebooks/lrf/pdf/convert_from.py index 823b6f7aae..cc3c6e77a7 100644 --- a/src/libprs500/ebooks/lrf/pdf/convert_from.py +++ b/src/libprs500/ebooks/lrf/pdf/convert_from.py @@ -15,19 +15,19 @@ from libprs500 import filename_to_utf8 '''''' -import sys, os, subprocess -from libprs500 import isosx +import sys, os, subprocess, logging +from libprs500 import isosx, setup_cli_handlers from libprs500.ebooks import ConversionError from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ebooks.lrf import option_parser as lrf_option_parser -from libprs500.ebooks.lrf.html.convert_from import process_file +from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file PDFTOHTML = 
'pdftohtml' if isosx and hasattr(sys, 'frameworks_dir'): PDFTOHTML = os.path.join(sys.frameworks_dir, PDFTOHTML) -def generate_html(pathtopdf): +def generate_html(pathtopdf, logger): ''' Convert the pdf into html. @return: A closed PersistentTemporaryFile. @@ -41,8 +41,10 @@ def generate_html(pathtopdf): cwd = os.getcwd() try: os.chdir(os.path.dirname(pf.name)) - p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE) + p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE, + stdout=subprocess.PIPE) ret = p.wait() + logger.info(p.stdout.read()) if ret != 0: err = p.stderr.read() raise ConversionError, err @@ -56,8 +58,25 @@ def option_parser(): '''%prog converts mybook.pdf to mybook.lrf\n\n''' ) +def process_file(path, options, logger=None): + if logger is None: + level = logging.DEBUG if options.verbose else logging.INFO + logger = logging.getLogger('pdf2lrf') + setup_cli_handlers(logger, level) + pdf = os.path.abspath(os.path.expanduser(path)) + htmlfile = generate_html(pdf, logger) + if not options.output: + ext = '.lrs' if options.lrs else '.lrf' + options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) + else: + options.output = os.path.abspath(options.output) + options.pdftohtml = True + if not options.title: + options.title = filename_to_utf8(os.path.splitext(os.path.basename(options.output))[0]) + html_process_file(htmlfile.name, options, logger) -def main(args=sys.argv): + +def main(args=sys.argv, logger=None): parser = option_parser() options, args = parser.parse_args(args) if len(args) != 2: @@ -65,17 +84,7 @@ def main(args=sys.argv): print print 'No pdf file specified' return 1 - pdf = os.path.abspath(os.path.expanduser(args[1])) - htmlfile = generate_html(pdf) - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext) - else: - options.output = os.path.abspath(options.output) - options.pdftohtml = True - if not 
options.title: - options.title = filename_to_utf8(os.path.splitext(os.path.basename(options.output))[0]) - process_file(htmlfile.name, options) + process_file(args[1], options, logger) return 0 if __name__ == '__main__': diff --git a/src/libprs500/ebooks/lrf/rtf/convert_from.py b/src/libprs500/ebooks/lrf/rtf/convert_from.py index b1004c2727..fe797def63 100644 --- a/src/libprs500/ebooks/lrf/rtf/convert_from.py +++ b/src/libprs500/ebooks/lrf/rtf/convert_from.py @@ -12,13 +12,13 @@ ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -import os, sys, tempfile, subprocess, shutil +import os, sys, tempfile, subprocess, shutil, logging from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks.metadata.meta import get_metadata -from libprs500.ebooks.lrf.html.convert_from import process_file +from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file from libprs500.ebooks import ConversionError -from libprs500 import isosx +from libprs500 import isosx, setup_cli_handlers UNRTF = 'unrtf' if isosx and hasattr(sys, 'frameworks_dir'): @@ -30,50 +30,47 @@ def option_parser(): '''%prog converts mybook.rtf to mybook.lrf''' ) -def generate_html(rtfpath): +def generate_html(rtfpath, logger): tdir = tempfile.mkdtemp(prefix='rtf2lrf_') cwd = os.path.abspath(os.getcwd()) os.chdir(tdir) try: - print 'Converting to HTML...', + logger.info('Converting to HTML...') sys.stdout.flush() handle, path = tempfile.mkstemp(dir=tdir, suffix='.html') file = os.fdopen(handle, 'wb') cmd = ' '.join([UNRTF, '"'+rtfpath+'"']) - p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) file.write(p.stdout.read()) ret = p.wait() if ret != 0: if isosx and ret == -11: #unrtf segfaults on OSX but seems 
to convert most of the file. file.write('\n') else: + logger.critical(p.stderr.read()) raise ConversionError, 'unrtf failed with error code: %d'%(ret,) - print 'done' file.close() return path finally: os.chdir(cwd) -def main(args=sys.argv): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No rtf file specified' - return 1 - rtf = os.path.abspath(os.path.expanduser(args[1])) +def process_file(path, options, logger=None): + if logger is None: + level = logging.DEBUG if options.verbose else logging.INFO + logger = logging.getLogger('rtf2lrf') + setup_cli_handlers(logger, level) + rtf = os.path.abspath(os.path.expanduser(path)) f = open(rtf, 'rb') mi = get_metadata(f, 'rtf') f.close() - html = generate_html(rtf) + html = generate_html(rtf, logger) tdir = os.path.dirname(html) try: if not options.output: ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext) - else: - options.output = os.path.abspath(options.output) + options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) + options.output = os.path.abspath(os.path.expanduser(options.output)) if (not options.title or options.title == 'Unknown') and mi.title: sys.argv.append('-t') sys.argv.append('"'+mi.title+'"') @@ -86,9 +83,22 @@ def main(args=sys.argv): if (not options.freetext or options.freetext == 'Unknown') and mi.comments: sys.argv.append('--comment') sys.argv.append('"'+mi.comments+'"') - process_file(html, options) + html_process_file(html, options, logger) finally: shutil.rmtree(tdir) + +def main(args=sys.argv, logger=None): + parser = option_parser() + options, args = parser.parse_args(args) + if len(args) != 2: + parser.print_help() + print + print 'No rtf file specified' + return 1 + process_file(args[1], options, logger) + return 0 + + if __name__ == '__main__': sys.exit(main()) diff --git 
a/src/libprs500/ebooks/lrf/txt/convert_from.py b/src/libprs500/ebooks/lrf/txt/convert_from.py index a1056d41d8..00a65f127e 100644 --- a/src/libprs500/ebooks/lrf/txt/convert_from.py +++ b/src/libprs500/ebooks/lrf/txt/convert_from.py @@ -15,14 +15,14 @@ """ Convert .txt files to .lrf """ -import os, sys, codecs +import os, sys, codecs, logging -from libprs500 import iswindows from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks import ConversionError -from libprs500.ebooks.lrf.html.convert_from import process_file +from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file from libprs500.ebooks.markdown import markdown +from libprs500 import setup_cli_handlers def option_parser(): parser = lrf_option_parser('''Usage: %prog [options] mybook.txt\n\n''' @@ -65,7 +65,24 @@ def generate_html(txtfile, encoding): codecs.open(p.name, 'wb', enc).write(html) return p -def main(args=sys.argv): +def process_file(path, options, logger=None): + if logger is None: + level = logging.DEBUG if options.verbose else logging.INFO + logger = logging.getLogger('txt2lrf') + setup_cli_handlers(logger, level) + txt = os.path.abspath(os.path.expanduser(path)) + if not hasattr(options, 'encoding'): + options.encoding = None + htmlfile = generate_html(txt, options.encoding) + options.force_page_break = 'h2' + if not options.output: + ext = '.lrs' if options.lrs else '.lrf' + options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) + options.output = os.path.abspath(os.path.expanduser(options.output)) + + html_process_file(htmlfile.name, options, logger) + +def main(args=sys.argv, logger=None): parser = option_parser() options, args = parser.parse_args(args) if len(args) != 2: @@ -73,16 +90,8 @@ def main(args=sys.argv): print print 'No txt file specified' return 1 - txt = os.path.abspath(os.path.expanduser(args[1])) - htmlfile = generate_html(txt, 
options.encoding) - options.force_page_break = 'h2' - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext) - else: - options.output = os.path.abspath(options.output) - - process_file(htmlfile.name, options) + process_file(args[1], options, logger) + return 0 if __name__ == '__main__': sys.exit(main()) \ No newline at end of file diff --git a/src/libprs500/ebooks/metadata/__init__.py b/src/libprs500/ebooks/metadata/__init__.py index 009b263455..bd05fe1940 100644 --- a/src/libprs500/ebooks/metadata/__init__.py +++ b/src/libprs500/ebooks/metadata/__init__.py @@ -40,12 +40,20 @@ def get_parser(extension): class MetaInformation(object): '''Convenient encapsulation of book metadata''' - def __init__(self, title, author): + def __init__(self, title, authors): + ''' + @param title: title or "Unknown" + @param authors: List of strings or [] + ''' self.title = title - self.author = author + self.author = authors # Needed for backward compatibility + #: List of strings or [] + self.authors = authors + #: Sort text for author + self.author_sort = None + self.title_sort = None self.comments = None self.category = None - self.classification = None self.publisher = None self.series = None self.series_index = None diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py new file mode 100644 index 0000000000..883e0745fe --- /dev/null +++ b/src/libprs500/ebooks/metadata/opf.py @@ -0,0 +1,155 @@ +## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. 
+## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +'''Read/Write metadata from Open Packaging Format (.opf) files.''' + +import sys + +from libprs500.ebooks.metadata import MetaInformation +from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup + +class OPFReader(MetaInformation): + + def __init__(self, stream): + self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown' + if hasattr(stream, 'seek'): + stream.seek(0) + self.soup = BeautifulStoneSoup(stream.read()) + self.series = self.series_index = self.rating = None + + @apply + def title(): + doc = '''title''' + def fget(self): + title = self.soup.package.metadata.find('dc:title') + if title: + return title.string + return self.default_title + return property(doc=doc, fget=fget) + + @apply + def authors(): + doc = '''authors''' + def fget(self): + creators = self.soup.package.metadata.findAll('dc:creator') + for elem in creators: + role = elem.get('role') + if not role: + role = elem.get('opf:role') + if role == 'aut': + au = elem.string.split(',') + ans = [] + for i in au: + ans.extend(i.split('&')) + return ans + return None + return property(doc=doc, fget=fget) + + @apply + def author_sort(): + doc = '''author sort''' + def fget(self): + creators = self.soup.package.metadata.findAll('dc:creator') + for elem in creators: + role = elem.get('role') + if not role: + role = elem.get('opf:role') + if role == 'aut': + fa = elem.get('file-as') + return fa if fa else None + return property(doc=doc, fget=fget) + + @apply + def title_sort(): + doc = 'title sort' + def fget(self): + 
return None + return property(doc=doc, fget=fget) + + @apply + def comments(): + doc = 'comments' + def fget(self): + comments = self.soup.find('dc:description') + if comments: + return comments.string + return None + return property(doc=doc, fget=fget) + + @apply + def category(): + doc = 'category' + def fget(self): + category = self.soup.find('dc:type') + if category: + return category.string + return None + return property(doc=doc, fget=fget) + + @apply + def publisher(): + doc = 'publisher' + def fget(self): + publisher = self.soup.find('dc:publisher') + if publisher: + return publisher.string + return None + return property(doc=doc, fget=fget) + + @apply + def isbn(): + doc = 'ISBN number' + def fget(self): + for item in self.soup.package.metadata.findAll('dc:identifier'): + scheme = item.get('scheme') + if not scheme: + scheme = item.get('opf:scheme') + if scheme.lower() == 'isbn': + return item.string + return None + return property(doc=doc, fget=fget) + + @apply + def cover(): + doc = 'cover' + def fget(self): + guide = self.soup.package.find('guide') + if guide: + references = guide.findAll('reference') + for reference in references: + type = reference.get('type') + if not type: + continue + if type.lower() in ['cover', 'other.ms-coverimage-standard']: + return reference.get('href') + return None + return property(doc=doc, fget=fget) + + def possible_cover_prefixes(self): + isbn, ans = [], [] + for item in self.soup.package.metadata.findAll('dc:identifier'): + scheme = item.get('scheme') + if not scheme: + scheme = item.get('opf:scheme') + isbn.append((scheme, item.string)) + for item in isbn: + ans.append(item[1].replace('-', '')) + return ans + + +def main(args=sys.argv): + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/src/libprs500/ebooks/metadata/rtf.py b/src/libprs500/ebooks/metadata/rtf.py index 03bde7ccfb..e617e33feb 100644 --- a/src/libprs500/ebooks/metadata/rtf.py +++ 
b/src/libprs500/ebooks/metadata/rtf.py @@ -86,6 +86,11 @@ def get_metadata(stream): if category_match: category = category_match.group(1).strip() mi = MetaInformation(title, author) + if author: + au = author.split(',') + mi.authors = [] + for i in au: + mi.authors.extend(i.split('&')) mi.comments = comment mi.category = category return mi diff --git a/src/libprs500/gui/database.py b/src/libprs500/gui/database.py index 112903c81e..409973c038 100644 --- a/src/libprs500/gui/database.py +++ b/src/libprs500/gui/database.py @@ -68,8 +68,6 @@ class LibraryDatabase(object): mi.title = title if mi.category: tags.append(mi.category) - if mi.classification: - tags.append(mi.classification) if tags: tags = ', '.join(tags) else: diff --git a/src/libprs500/gui2/dialogs/__init__.py b/src/libprs500/gui2/dialogs/__init__.py index 537ea16197..0a634b3b4e 100644 --- a/src/libprs500/gui2/dialogs/__init__.py +++ b/src/libprs500/gui2/dialogs/__init__.py @@ -23,6 +23,7 @@ class Dialog(QObject): self.dialog = QDialog(window) self.accept = self.dialog.accept self.reject = self.dialog.reject + self._close_event = self.dialog.closeEvent self.dialog.closeEvent = self.close_event self.window = window self.isVisible = self.dialog.isVisible diff --git a/src/libprs500/gui2/main.py b/src/libprs500/gui2/main.py index 76705a42ee..566e15a705 100644 --- a/src/libprs500/gui2/main.py +++ b/src/libprs500/gui2/main.py @@ -265,7 +265,7 @@ class Main(QObject, Ui_MainWindow): formats.append(format) metadata.append(mi) names.append(os.path.basename(book)) - infos.append({'title':mi.title, 'authors':mi.author, + infos.append({'title':mi.title, 'authors':', '.join(mi.authors), 'cover':self.default_thumbnail, 'tags':[]}) if not to_device: diff --git a/src/libprs500/linux.py b/src/libprs500/linux.py index f0ee2145a1..2b0446b189 100644 --- a/src/libprs500/linux.py +++ b/src/libprs500/linux.py @@ -81,7 +81,9 @@ def setup_completion(): f.write(opts_and_exts('lit2lrf', htmlop, ['lit'])) 
f.write(opts_and_exts('rtf2lrf', htmlop, ['rtf'])) f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf'])) - f.write(opts_and_exts('lrf-meta', metaop, ['lrf'])) + f.write(opts_and_exts('any2lrf', htmlop, + ['htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', 'txt', 'lit', 'rtf', 'pdf'])) + f.write(opts_and_exts('lrf-meta', metaop, ['lrf'])) f.write(''' _prs500_ls() {