Implemented any2lrf

This commit is contained in:
Kovid Goyal 2007-08-13 22:13:45 +00:00
parent 709dd81a08
commit 9c82e833ac
18 changed files with 506 additions and 208 deletions

View File

@ -31,6 +31,7 @@ entry_points = {
'web2disk = libprs500.web.fetch.simple:main',\ 'web2disk = libprs500.web.fetch.simple:main',\
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',\ 'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',\
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',\ 'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',\
'any2lrf = libprs500.ebooks.lrf.any.convert_from:main',\
'libprs500-beta = libprs500.gui2.main:main',\ 'libprs500-beta = libprs500.gui2.main:main',\
], ],
'gui_scripts' : [ APPNAME+' = libprs500.gui.main:main'] 'gui_scripts' : [ APPNAME+' = libprs500.gui.main:main']

View File

@ -64,14 +64,13 @@ def filename_to_utf8(name):
return name.decode(codec, 'replace').encode('utf8') return name.decode(codec, 'replace').encode('utf8')
def extract(path, dir): def extract(path, dir):
import os
ext = os.path.splitext(path)[1][1:].lower() ext = os.path.splitext(path)[1][1:].lower()
extractor = None extractor = None
if ext == 'zip': if ext == 'zip':
from libprs500.libunzip import extract from libprs500.libunzip import extract
extractor = extract extractor = extract
elif ext == 'rar': elif ext == 'rar':
from libprs500.libunrar import extract from libprs500.libunrar import extract # In case the dll is not found
extractor = extract extractor = extract
if not extractor: if not extractor:
raise Exception('Unknown archive type') raise Exception('Unknown archive type')

View File

@ -131,7 +131,7 @@ class Device(object):
keys C{title}, C{authors}, C{cover}, C{tags}. The value of the C{cover} keys C{title}, C{authors}, C{cover}, C{tags}. The value of the C{cover}
element can be None or a three element tuple (width, height, data) element can be None or a three element tuple (width, height, data)
where data is the image data in JPEG format as a string. C{tags} must be where data is the image data in JPEG format as a string. C{tags} must be
a possibly empty list of strings. a possibly empty list of strings. C{authors} must be a string.
@param booklists: A tuple containing the result of calls to @param booklists: A tuple containing the result of calls to
(L{books}(oncard=False), L{books}(oncard=True)). (L{books}(oncard=False), L{books}(oncard=True)).
''' '''

View File

@ -0,0 +1,14 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

View File

@ -0,0 +1,136 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Convert any ebook file into a LRF file.'''
import sys, os, logging, shutil, tempfile, glob
from libprs500.ebooks.lrf import option_parser
from libprs500 import __appname__, setup_cli_handlers, extract
from libprs500.ebooks.lrf.lit.convert_from import process_file as lit2lrf
from libprs500.ebooks.lrf.pdf.convert_from import process_file as pdf2lrf
from libprs500.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf
from libprs500.ebooks.lrf.txt.convert_from import process_file as txt2lrf
from libprs500.ebooks.lrf.html.convert_from import process_file as html2lrf
def largest_file(files):
    '''
    Return the path of the largest file among C{files}.

    Every path is stat'ed. Zero length files are never chosen and, when
    several files share the largest size, the one listed first wins.

    @param files: Sequence of file system paths.
    @return: The path with the greatest size or C{None} if C{files} is
             empty or contains only zero length files.
    '''
    sized = [(os.stat(name).st_size, name) for name in files]
    nonempty = [entry for entry in sized if entry[0] > 0]
    if not nonempty:
        return None
    # max() hands back the first of several equal maxima, which matches a
    # strict "greater than" scan over the paths in their given order.
    return max(nonempty, key=lambda entry: entry[0])[1]
def find_htmlfile(dir):
    '''
    Locate the HTML file in C{dir} that should be converted.

    Files whose name contains "toc" are tried first, then any (X)HTML file.
    Within each group the largest file, as chosen by L{largest_file}, wins.

    @param dir: Directory to search (not searched recursively).
    @return: Path to the chosen file or C{None} if nothing matched.
    '''
    pattern_groups = (('*toc*.htm*', '*toc*.xhtm*'), ('*.htm*', '*.xhtm*'))
    for html_pattern, xhtml_pattern in pattern_groups:
        candidates = glob.glob(os.path.join(dir, html_pattern)) + \
                     glob.glob(os.path.join(dir, xhtml_pattern))
        chosen = largest_file(candidates)
        if chosen:
            return chosen
    return None
def handle_archive(path):
    '''
    Extract the archive at C{path} into a new temporary directory and pick
    the file inside it that is to be converted.

    If the archive contains nothing but a single top level directory, the
    search happens inside that directory. The largest LIT, RTF, PDF or TXT
    file is preferred; if there is none, L{find_htmlfile} is consulted.

    @param path: Path to the archive, passed unchanged to C{extract}.
    @return: A tuple C{(tdir, file)}. C{tdir} is the temporary directory,
             which the caller is responsible for deleting. C{file} is the
             path of the chosen file or C{None} if none was found.
    @raise Exception: Anything raised while extracting or searching. The
             temporary directory is removed before the error propagates,
             since the caller has no way of learning its name in that case.
    '''
    tdir = tempfile.mkdtemp(prefix=__appname__+'_')
    try:
        extract(path, tdir)
        cdir = tdir
        entries = os.listdir(tdir)
        if len(entries) == 1 and os.path.isdir(os.path.join(tdir, entries[0])):
            # Archive wraps everything in one directory, look inside it
            cdir = os.path.join(tdir, entries[0])
        files = []
        for ext in ('lit', 'rtf', 'pdf', 'txt'):
            files.extend(glob.glob(os.path.join(cdir, '*.'+ext)))
        ebook = largest_file(files)
        if not ebook:
            ebook = find_htmlfile(cdir)
        return tdir, ebook
    except:
        # Cleanup only, the original error is always re-raised
        shutil.rmtree(tdir, True)
        raise
def process_file(path, options, logger=None):
    '''
    Convert the ebook at C{path} into LRF (or LRS) by dispatching to the
    converter that matches its file extension.

    ZIP and RAR archives are unpacked with L{handle_archive} and the file
    found inside is converted instead. Any temporary directory created for
    the archive is removed and the working directory restored before
    returning, whether or not the conversion succeeded.

    @param path: Path to the ebook or archive.
    @param options: Parsed command line options. C{options.output} is
                    filled in from C{path} when empty and is always made
                    absolute.
    @param logger: Logger to report to. If C{None} a logger named
                   C{any2lrf} is created and set up for command line use.
    @return: C{1} if the file cannot be read, has no usable extension, is
             an archive without an ebook or is of an unsupported type.
             C{None} once the converter has run.
    '''
    path = os.path.abspath(os.path.expanduser(path))
    tdir = None
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('any2lrf')
        setup_cli_handlers(logger, level)
    if not os.access(path, os.R_OK):
        logger.critical('Cannot read from %s', path)
        return 1
    ext = os.path.splitext(path)[1]
    if not ext or ext == '.':
        logger.critical('Unknown file type: %s', path)
        return 1
    ext = ext[1:].lower()
    cwd = os.getcwd()
    if not options.output:
        fmt = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.splitext(os.path.basename(path))[0] + fmt
    options.output = os.path.abspath(os.path.expanduser(options.output))
    # Everything that can create tdir lives inside the try block so that the
    # early returns below clean up the extracted archive as well.
    try:
        if ext in ('zip', 'rar'):
            newpath = None
            try:
                tdir, newpath = handle_archive(path)
            except Exception:
                logger.exception(' ')
            if not newpath:
                logger.critical('Could not find ebook in archive')
                return 1
            path = newpath
            logger.info('Found ebook in archive: %s', path)
            ext = os.path.splitext(path)[1][1:].lower()
        if 'htm' in ext:
            # Substring test so that htm, html, xhtml, ... are all accepted
            convertor = html2lrf
        else:
            convertor = {'lit': lit2lrf, 'pdf': pdf2lrf,
                         'rtf': rtf2lrf, 'txt': txt2lrf}.get(ext)
        if convertor is None:
            logger.critical('Unknown file type: %s', path)
            return 1
        convertor(path, options, logger)
    finally:
        os.chdir(cwd)
        if tdir and os.path.exists(tdir):
            shutil.rmtree(tdir)
def main(args=sys.argv, logger=None):
    '''
    Command line entry point for any2lrf.

    @param args: Argument list in the style of C{sys.argv}; exactly one
                 positional argument, the file to convert, is expected
                 after the program name.
    @param logger: Optional logger passed through to L{process_file}.
    @return: C{0} on success, C{1} if no file was specified or if
             L{process_file} reported a failure.
    '''
    parser = option_parser('''\
any2lrf myfile
Convert any ebook format into LRF. Supported formats are:
LIT, RTF, TXT, HTML and PDF. any2lrf will also process a RAR or
ZIP archive.
''')
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print('')
        print('No file to convert specified.')
        return 1
    # process_file returns a true value (1) on failure and nothing on
    # success; pass failures on as the exit status.
    if process_file(args[1], options, logger):
        return 1
    return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -20,20 +20,18 @@ Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs. and to Falstaff for pylrs.
""" """
import os, re, sys, shutil, copy, glob, logging import os, re, sys, copy, glob, logging
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
from urllib import unquote from urllib import unquote
from urlparse import urlparse from urlparse import urlparse
from tempfile import mkdtemp
from operator import itemgetter
from math import ceil, floor from math import ceil, floor
try: try:
from PIL import Image as PILImage from PIL import Image as PILImage
except ImportError: except ImportError:
import Image as PILImage import Image as PILImage
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \ from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
Comment, Tag, NavigableString, Declaration, ProcessingInstruction NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
TextBlock, ImageBlock, JumpButton, CharButton, \ TextBlock, ImageBlock, JumpButton, CharButton, \
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
@ -43,8 +41,9 @@ from libprs500.ebooks.lrf import Book
from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table from libprs500.ebooks.lrf.html.table import Table
from libprs500 import extract, filename_to_utf8, setup_cli_handlers from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__
from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.metadata.opf import OPFReader
class Span(_Span): class Span(_Span):
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
@ -643,7 +642,7 @@ class HTMLConverter(object):
except Exception: except Exception:
self.logger.warning('Unable to process %s', path) self.logger.warning('Unable to process %s', path)
if self.verbose: if self.verbose:
self.logger.exception('') self.logger.exception(' ')
continue continue
finally: finally:
os.chdir(cwd) os.chdir(cwd)
@ -1291,15 +1290,13 @@ def process_file(path, options, logger=None):
logger = logging.getLogger('html2lrf') logger = logging.getLogger('html2lrf')
setup_cli_handlers(logger, level) setup_cli_handlers(logger, level)
cwd = os.getcwd() cwd = os.getcwd()
dirpath = None
default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0]) default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
dirpath = os.path.dirname(path)
try: try:
dirpath, path = get_path(path)
cpath, tpath = '', '' cpath, tpath = '', ''
try_opf(path, options, logger) try_opf(path, options, logger)
if options.cover: if options.cover:
dp = dirpath if dirpath else os.path.dirname(path) cpath = os.path.join(dirpath, os.path.basename(options.cover))
cpath = os.path.join(dp, os.path.basename(options.cover))
if not os.path.exists(cpath): if not os.path.exists(cpath):
cpath = os.path.abspath(os.path.expanduser(options.cover)) cpath = os.path.abspath(os.path.expanduser(options.cover))
options.cover = cpath options.cover = cpath
@ -1309,7 +1306,7 @@ def process_file(path, options, logger=None):
cim = im.resize((options.profile.screen_width, cim = im.resize((options.profile.screen_width,
options.profile.screen_height), options.profile.screen_height),
PILImage.BICUBIC).convert('RGB') PILImage.BICUBIC).convert('RGB')
cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg") cf = PersistentTemporaryFile(prefix=__appname__+"_", suffix=".jpg")
cf.close() cf.close()
cim.save(cf.name) cim.save(cf.name)
cpath = cf.name cpath = cf.name
@ -1376,70 +1373,57 @@ def process_file(path, options, logger=None):
return oname return oname
finally: finally:
os.chdir(cwd) os.chdir(cwd)
if dirpath:
shutil.rmtree(dirpath, True)
def try_opf(path, options, logger): def try_opf(path, options, logger):
try: try:
opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0] opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
except IndexError: except IndexError:
return return
soup = BeautifulStoneSoup(open(opf).read()) opf = OPFReader(open(opf, 'rb'))
try: try:
title = soup.package.metadata.find('dc:title') title = opf.title
if title and not options.title: if title and not options.title:
options.title = title.string options.title = title
creators = soup.package.metadata.findAll('dc:creator')
if options.author == 'Unknown': if options.author == 'Unknown':
for author in creators: if opf.authors:
role = author.get('role') options.author = ', '.join(opf.authors)
if not role: if opf.author_sort:
role = author.get('opf:role') options.author_sort = opf.author_sort
if role == 'aut':
options.author = author.string
fa = author.get('file-as')
if fa:
options.author_sort = fa
if options.publisher == 'Unknown': if options.publisher == 'Unknown':
publisher = soup.package.metadata.find('dc:publisher') publisher = opf.publisher
if publisher: if publisher:
options.publisher = publisher.string options.publisher = publisher
if not options.category.strip(): if not options.category:
category = soup.package.metadata.find('dc:type') category = opf.category
if category: if category:
options.category = category.string options.category = category
isbn = []
for item in soup.package.metadata.findAll('dc:identifier'):
scheme = item.get('scheme')
if not scheme:
scheme = item.get('opf:scheme')
isbn.append((scheme, item.string))
if not options.cover:
for item in isbn:
src = item[1].replace('-', '')
matches = glob.glob(os.path.join(os.path.dirname(path), src+'.*'))
for match in matches:
test = os.path.splitext(match)[1].lower()
if test in ['.jpeg', '.jpg', '.gif', '.png']:
options.cover = match
break
if not options.cover: if not options.cover:
# Search for cover image in opf as created by convertlit cover = opf.cover
ref = soup.package.find('reference', {'type':'other.ms-coverimage-standard'}) if cover:
if ref: cover = os.path.join(os.path.dirname(path), cover)
try: if os.access(cover, os.R_OK):
options.cover = os.path.join(os.path.dirname(path), ref.get('href')) try:
if not os.access(options.cover, os.R_OK): PILImage.open(cover)
options.cover = None options.cover = cover
except: except:
logger.exception('Could not load cover') pass
if not options.cover:
for prefix in opf.possible_cover_prefixes():
if options.cover:
break
for suffix in ['.jpg', '.jpeg', '.gif', '.png', '.bmp']:
cpath = os.path.join(os.path.dirname(path), prefix+suffix)
try:
PILImage.open(cpath)
options.cover = cpath
break
except:
continue
except Exception: except Exception:
logger.exception('Failed to process opf file') logger.exception('Failed to process opf file')
def option_parser(): def option_parser():
return lrf_option_parser('''Usage: %prog [options] mybook.[html|rar|zip]\n\n''' return lrf_option_parser('''Usage: %prog [options] mybook.html\n\n'''
'''%prog converts mybook.html to mybook.lrf''') '''%prog converts mybook.html to mybook.lrf''')
def main(args=sys.argv): def main(args=sys.argv):
@ -1461,66 +1445,6 @@ def main(args=sys.argv):
process_file(src, options) process_file(src, options)
return 0 return 0
def console_query(dirpath, candidate, docs):
if len(docs) == 1:
return 0
try:
import readline
except ImportError:
pass
i = 0
for doc in docs:
prefix = '>' if i == candidate else ''
print prefix+str(i)+'.\t', doc[0]
i += 1
print
while True:
try:
choice = raw_input('Choose file to convert (0-'+str(i-1) + \
'). Current choice is ['+ str(candidate) + ']:')
if not choice:
return candidate
choice = int(choice)
if choice < 0 or choice >= i:
continue
candidate = choice
except EOFError, KeyboardInterrupt:
sys.exit()
except:
continue
break
return candidate
def get_path(path, query=console_query):
path = os.path.abspath(os.path.expanduser(path))
ext = os.path.splitext(path)[1][1:].lower()
if ext in ['htm', 'html', 'xhtml', 'php']:
return None, path
dirpath = mkdtemp('','html2lrf')
extract(path, dirpath)
candidate, docs = None, []
for root, dirs, files in os.walk(dirpath):
for name in files:
ext = os.path.splitext(name)[1][1:].lower()
if ext not in ['html', 'xhtml', 'htm', 'xhtm']:
continue
docs.append((name, root, os.stat(os.path.join(root, name)).st_size))
if 'toc' in name.lower():
candidate = name
docs.sort(key=itemgetter(2))
if candidate:
for i in range(len(docs)):
if docs[i][0] == candidate:
candidate = i
break
else:
candidate = len(docs) - 1
if len(docs) == 0:
raise ConversionError('No suitable files found in archive')
if len(docs) > 0:
candidate = query(dirpath, candidate, docs)
return dirpath, os.path.join(docs[candidate][1], docs[candidate][0])
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -12,13 +12,13 @@
## You should have received a copy of the GNU General Public License along ## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, sys, shutil, glob import os, sys, shutil, glob, logging
from tempfile import mkdtemp from tempfile import mkdtemp
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.convert_from import process_file from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
from libprs500 import isosx, __appname__ from libprs500 import isosx, __appname__, setup_cli_handlers
CLIT = 'clit' CLIT = 'clit'
if isosx and hasattr(sys, 'frameworks_dir'): if isosx and hasattr(sys, 'frameworks_dir'):
CLIT = os.path.join(sys.frameworks_dir, CLIT) CLIT = os.path.join(sys.frameworks_dir, CLIT)
@ -29,29 +29,27 @@ def option_parser():
'''%prog converts mybook.lit to mybook.lrf''' '''%prog converts mybook.lit to mybook.lrf'''
) )
def generate_html(pathtolit): def generate_html(pathtolit, logger):
if not os.access(pathtolit, os.R_OK): if not os.access(pathtolit, os.R_OK):
raise ConversionError, 'Cannot read from ' + pathtolit raise ConversionError, 'Cannot read from ' + pathtolit
tdir = mkdtemp(prefix=__appname__+'_') tdir = mkdtemp(prefix=__appname__+'_')
cmd = ' '.join([CLIT, '"'+pathtolit+'"', tdir]) cmd = ' '.join([CLIT, '"'+pathtolit+'"', tdir])
p = Popen(cmd, shell=True, stderr=PIPE) p = Popen(cmd, shell=True, stderr=PIPE, stdout=PIPE)
ret = p.wait() ret = p.wait()
logger.info(p.stdout.read())
if ret != 0: if ret != 0:
shutil.rmtree(tdir) shutil.rmtree(tdir)
err = p.stderr.read() err = p.stderr.read()
raise ConversionError, err raise ConversionError, err
return tdir return tdir
def main(args=sys.argv): def process_file(path, options, logger=None):
parser = option_parser() if logger is None:
options, args = parser.parse_args(args) level = logging.DEBUG if options.verbose else logging.INFO
if len(args) != 2: logger = logging.getLogger('lit2lrf')
parser.print_help() setup_cli_handlers(logger, level)
print lit = os.path.abspath(os.path.expanduser(path))
print 'No lit file specified' tdir = generate_html(lit, logger)
return 1
lit = os.path.abspath(os.path.expanduser(args[1]))
tdir = generate_html(lit)
try: try:
l = glob.glob(os.path.join(tdir, '*toc*.htm*')) l = glob.glob(os.path.join(tdir, '*toc*.htm*'))
if not l: if not l:
@ -61,7 +59,9 @@ def main(args=sys.argv):
if not l: if not l:
l = glob.glob(os.path.join(tdir, '*.htm*')) l = glob.glob(os.path.join(tdir, '*.htm*'))
if not l: if not l:
raise ConversionError, 'Conversion of lit to html failed. Cannot find html file.' l = glob.glob(os.path.join(tdir, '*.txt*')) # Some lit file apparently have .txt files in them
if not l:
raise ConversionError('Conversion of lit to html failed. Cannot find html file.')
maxsize, htmlfile = 0, None maxsize, htmlfile = 0, None
for c in l: for c in l:
sz = os.path.getsize(c) sz = os.path.getsize(c)
@ -71,13 +71,24 @@ def main(args=sys.argv):
htmlfile = l[0] htmlfile = l[0]
if not options.output: if not options.output:
ext = '.lrs' if options.lrs else '.lrf' ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext) options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
else: options.output = os.path.abspath(os.path.expanduser(options.output))
options.output = os.path.abspath(options.output) html_process_file(htmlfile, options, logger=logger)
process_file(htmlfile, options)
finally: finally:
shutil.rmtree(tdir) shutil.rmtree(tdir)
def main(args=sys.argv, logger=None):
    '''
    Command line entry point for lit2lrf.

    @param args: Argument list in the style of C{sys.argv}; exactly one
                 positional argument, the LIT file, is expected after the
                 program name.
    @param logger: Optional logger passed through to L{process_file}.
    @return: C{0} after the conversion was run, C{1} if no file was
             specified.
    '''
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print('')
        print('No lit file specified')
        return 1
    # process_file takes the path first and the options second
    process_file(args[1], options, logger)
    return 0
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())

View File

@ -254,19 +254,35 @@ def get_metadata(stream):
L{MetaInformation} object. L{MetaInformation} object.
""" """
lrf = LRFMetaFile(stream) lrf = LRFMetaFile(stream)
mi = MetaInformation(lrf.title.strip(), lrf.author.strip()) au = lrf.author.strip().split(',')
authors = []
for i in au:
authors.extend(i.split('&'))
mi = MetaInformation(lrf.title.strip(), authors)
mi.author = lrf.author.strip()
mi.comments = lrf.free_text.strip() mi.comments = lrf.free_text.strip()
mi.category = lrf.category.strip() mi.category = lrf.category.strip()+', '+lrf.classification.strip()
mi.classification = lrf.classification.strip()
mi.publisher = lrf.publisher.strip() mi.publisher = lrf.publisher.strip()
try:
mi.title_sort = lrf.title_reading.strip()
if not mi.title_sort:
mi.title_sort = None
except:
pass
try:
mi.author_sort = lrf.author_reading.strip()
if not mi.author_sort:
mi.author_sort = None
except:
pass
if not mi.title or 'unknown' in mi.title.lower(): if not mi.title or 'unknown' in mi.title.lower():
mi.title = None mi.title = None
if not mi.authors:
mi.authors = None
if not mi.author or 'unknown' in mi.author.lower(): if not mi.author or 'unknown' in mi.author.lower():
mi.author = None mi.author = None
if not mi.category or 'unknown' in mi.category.lower(): if not mi.category or 'unknown' in mi.category.lower():
mi.category = None mi.category = None
if not mi.classification or 'unknown' in mi.classification.lower():
mi.classification = None
if not mi.publisher or 'unknown' in mi.publisher.lower() or \ if not mi.publisher or 'unknown' in mi.publisher.lower() or \
'some publisher' in mi.publisher.lower(): 'some publisher' in mi.publisher.lower():
mi.publisher = None mi.publisher = None

View File

@ -15,19 +15,19 @@
from libprs500 import filename_to_utf8 from libprs500 import filename_to_utf8
'''''' ''''''
import sys, os, subprocess import sys, os, subprocess, logging
from libprs500 import isosx from libprs500 import isosx, setup_cli_handlers
from libprs500.ebooks import ConversionError from libprs500.ebooks import ConversionError
from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
PDFTOHTML = 'pdftohtml' PDFTOHTML = 'pdftohtml'
if isosx and hasattr(sys, 'frameworks_dir'): if isosx and hasattr(sys, 'frameworks_dir'):
PDFTOHTML = os.path.join(sys.frameworks_dir, PDFTOHTML) PDFTOHTML = os.path.join(sys.frameworks_dir, PDFTOHTML)
def generate_html(pathtopdf): def generate_html(pathtopdf, logger):
''' '''
Convert the pdf into html. Convert the pdf into html.
@return: A closed PersistentTemporaryFile. @return: A closed PersistentTemporaryFile.
@ -41,8 +41,10 @@ def generate_html(pathtopdf):
cwd = os.getcwd() cwd = os.getcwd()
try: try:
os.chdir(os.path.dirname(pf.name)) os.chdir(os.path.dirname(pf.name))
p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE) p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
ret = p.wait() ret = p.wait()
logger.info(p.stdout.read())
if ret != 0: if ret != 0:
err = p.stderr.read() err = p.stderr.read()
raise ConversionError, err raise ConversionError, err
@ -56,8 +58,25 @@ def option_parser():
'''%prog converts mybook.pdf to mybook.lrf\n\n''' '''%prog converts mybook.pdf to mybook.lrf\n\n'''
) )
def process_file(path, options, logger=None):
    '''
    Convert the PDF file at C{path} by first turning it into HTML with
    L{generate_html} and then running the HTML converter on the result.

    @param path: Path to the PDF file.
    @param options: Parsed command line options. C{options.output} is
                    derived from C{path} when empty and always made
                    absolute; C{options.title} defaults to the base name of
                    the output file.
    @param logger: Logger to report to. If C{None} a logger named
                   C{pdf2lrf} is created and set up for command line use.
    '''
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('pdf2lrf')
        setup_cli_handlers(logger, level)
    pdf = os.path.abspath(os.path.expanduser(path))
    htmlfile = generate_html(pdf, logger)
    if not options.output:
        ext = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
    else:
        # Expand ~ here as well, as is done for the input path
        options.output = os.path.abspath(os.path.expanduser(options.output))
    # NOTE(review): presumably tells the HTML converter that its input was
    # produced by pdftohtml; the flag is consumed outside this function.
    options.pdftohtml = True
    if not options.title:
        options.title = filename_to_utf8(os.path.splitext(os.path.basename(options.output))[0])
    html_process_file(htmlfile.name, options, logger)
def main(args=sys.argv):
def main(args=sys.argv, logger=None):
parser = option_parser() parser = option_parser()
options, args = parser.parse_args(args) options, args = parser.parse_args(args)
if len(args) != 2: if len(args) != 2:
@ -65,17 +84,7 @@ def main(args=sys.argv):
print print
print 'No pdf file specified' print 'No pdf file specified'
return 1 return 1
pdf = os.path.abspath(os.path.expanduser(args[1])) process_file(args[1], options, logger)
htmlfile = generate_html(pdf)
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext)
else:
options.output = os.path.abspath(options.output)
options.pdftohtml = True
if not options.title:
options.title = filename_to_utf8(os.path.splitext(os.path.basename(options.output))[0])
process_file(htmlfile.name, options)
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -12,13 +12,13 @@
## You should have received a copy of the GNU General Public License along ## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, sys, tempfile, subprocess, shutil import os, sys, tempfile, subprocess, shutil, logging
from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.metadata.meta import get_metadata from libprs500.ebooks.metadata.meta import get_metadata
from libprs500.ebooks.lrf.html.convert_from import process_file from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
from libprs500.ebooks import ConversionError from libprs500.ebooks import ConversionError
from libprs500 import isosx from libprs500 import isosx, setup_cli_handlers
UNRTF = 'unrtf' UNRTF = 'unrtf'
if isosx and hasattr(sys, 'frameworks_dir'): if isosx and hasattr(sys, 'frameworks_dir'):
@ -30,50 +30,47 @@ def option_parser():
'''%prog converts mybook.rtf to mybook.lrf''' '''%prog converts mybook.rtf to mybook.lrf'''
) )
def generate_html(rtfpath): def generate_html(rtfpath, logger):
tdir = tempfile.mkdtemp(prefix='rtf2lrf_') tdir = tempfile.mkdtemp(prefix='rtf2lrf_')
cwd = os.path.abspath(os.getcwd()) cwd = os.path.abspath(os.getcwd())
os.chdir(tdir) os.chdir(tdir)
try: try:
print 'Converting to HTML...', logger.info('Converting to HTML...')
sys.stdout.flush() sys.stdout.flush()
handle, path = tempfile.mkstemp(dir=tdir, suffix='.html') handle, path = tempfile.mkstemp(dir=tdir, suffix='.html')
file = os.fdopen(handle, 'wb') file = os.fdopen(handle, 'wb')
cmd = ' '.join([UNRTF, '"'+rtfpath+'"']) cmd = ' '.join([UNRTF, '"'+rtfpath+'"'])
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
file.write(p.stdout.read()) file.write(p.stdout.read())
ret = p.wait() ret = p.wait()
if ret != 0: if ret != 0:
if isosx and ret == -11: #unrtf segfaults on OSX but seems to convert most of the file. if isosx and ret == -11: #unrtf segfaults on OSX but seems to convert most of the file.
file.write('</body>\n</html>') file.write('</body>\n</html>')
else: else:
logger.critical(p.stderr.read())
raise ConversionError, 'unrtf failed with error code: %d'%(ret,) raise ConversionError, 'unrtf failed with error code: %d'%(ret,)
print 'done'
file.close() file.close()
return path return path
finally: finally:
os.chdir(cwd) os.chdir(cwd)
def main(args=sys.argv): def process_file(path, options, logger=None):
parser = option_parser() if logger is None:
options, args = parser.parse_args(args) level = logging.DEBUG if options.verbose else logging.INFO
if len(args) != 2: logger = logging.getLogger('pdf2lrf')
parser.print_help() setup_cli_handlers(logger, level)
print rtf = os.path.abspath(os.path.expanduser(path))
print 'No rtf file specified'
return 1
rtf = os.path.abspath(os.path.expanduser(args[1]))
f = open(rtf, 'rb') f = open(rtf, 'rb')
mi = get_metadata(f, 'rtf') mi = get_metadata(f, 'rtf')
f.close() f.close()
html = generate_html(rtf) html = generate_html(rtf, logger)
tdir = os.path.dirname(html) tdir = os.path.dirname(html)
try: try:
if not options.output: if not options.output:
ext = '.lrs' if options.lrs else '.lrf' ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext) options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
else: options.output = os.path.abspath(os.path.expanduser(options.output))
options.output = os.path.abspath(options.output)
if (not options.title or options.title == 'Unknown') and mi.title: if (not options.title or options.title == 'Unknown') and mi.title:
sys.argv.append('-t') sys.argv.append('-t')
sys.argv.append('"'+mi.title+'"') sys.argv.append('"'+mi.title+'"')
@ -86,9 +83,22 @@ def main(args=sys.argv):
if (not options.freetext or options.freetext == 'Unknown') and mi.comments: if (not options.freetext or options.freetext == 'Unknown') and mi.comments:
sys.argv.append('--comment') sys.argv.append('--comment')
sys.argv.append('"'+mi.comments+'"') sys.argv.append('"'+mi.comments+'"')
process_file(html, options) html_process_file(html, options, logger)
finally: finally:
shutil.rmtree(tdir) shutil.rmtree(tdir)
def main(args=sys.argv, logger=None):
    '''
    Command line entry point for rtf2lrf.

    @param args: Argument list in the style of C{sys.argv}; exactly one
                 positional argument, the RTF file, is expected after the
                 program name.
    @param logger: Optional logger passed through to L{process_file}.
    @return: C{0} after the conversion was run, C{1} if no file was
             specified.
    '''
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) == 2:
        process_file(args[1], options, logger)
        return 0
    # Wrong number of arguments: show usage followed by the reason
    parser.print_help()
    print('')
    print('No rtf file specified')
    return 1
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())

View File

@ -15,14 +15,14 @@
""" """
Convert .txt files to .lrf Convert .txt files to .lrf
""" """
import os, sys, codecs import os, sys, codecs, logging
from libprs500 import iswindows
from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.convert_from import process_file from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
from libprs500.ebooks.markdown import markdown from libprs500.ebooks.markdown import markdown
from libprs500 import setup_cli_handlers
def option_parser(): def option_parser():
parser = lrf_option_parser('''Usage: %prog [options] mybook.txt\n\n''' parser = lrf_option_parser('''Usage: %prog [options] mybook.txt\n\n'''
@ -65,7 +65,24 @@ def generate_html(txtfile, encoding):
codecs.open(p.name, 'wb', enc).write(html) codecs.open(p.name, 'wb', enc).write(html)
return p return p
def process_file(path, options, logger=None):
    '''
    Convert the text file at C{path} by first rendering it to a temporary
    HTML file and then passing that file to the HTML converter.

    @param path: Path to the input text file; C{~} is expanded.
    @param options: Parsed command line options. This function sets
                    C{options.encoding} (to None, if missing),
                    C{options.force_page_break} and C{options.output}.
    @param logger: Logger to use. When None, a logger named C{txt2lrf} is
                   created and configured for command line use.
    '''
    if logger is None:
        if options.verbose:
            level = logging.DEBUG
        else:
            level = logging.INFO
        logger = logging.getLogger('txt2lrf')
        setup_cli_handlers(logger, level)

    source = os.path.abspath(os.path.expanduser(path))
    # Callers other than this module's own option parser may hand in an
    # options object that has no encoding attribute.
    if not hasattr(options, 'encoding'):
        options.encoding = None
    html = generate_html(source, options.encoding)

    options.force_page_break = 'h2'
    target = options.output
    if not target:
        # Default output: input base name with the matching extension,
        # placed in the current working directory.
        stem = os.path.basename(os.path.splitext(path)[0])
        if options.lrs:
            suffix = '.lrs'
        else:
            suffix = '.lrf'
        target = os.path.abspath(stem + suffix)
    options.output = os.path.abspath(os.path.expanduser(target))

    html_process_file(html.name, options, logger)
def main(args=sys.argv, logger=None):
parser = option_parser() parser = option_parser()
options, args = parser.parse_args(args) options, args = parser.parse_args(args)
if len(args) != 2: if len(args) != 2:
@ -73,16 +90,8 @@ def main(args=sys.argv):
print print
print 'No txt file specified' print 'No txt file specified'
return 1 return 1
txt = os.path.abspath(os.path.expanduser(args[1])) process_file(args[1], options, logger)
htmlfile = generate_html(txt, options.encoding) return 0
options.force_page_break = 'h2'
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext)
else:
options.output = os.path.abspath(options.output)
process_file(htmlfile.name, options)
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())

View File

@ -40,12 +40,20 @@ def get_parser(extension):
class MetaInformation(object): class MetaInformation(object):
'''Convenient encapsulation of book metadata''' '''Convenient encapsulation of book metadata'''
def __init__(self, title, author): def __init__(self, title, authors):
'''
@param title: title or "Unknown"
@param authors: List of strings or []
'''
self.title = title self.title = title
self.author = author self.author = authors # Needed for backward compatibility
#: List of strings or []
self.authors = authors
#: Sort text for author
self.author_sort = None
self.title_sort = None
self.comments = None self.comments = None
self.category = None self.category = None
self.classification = None
self.publisher = None self.publisher = None
self.series = None self.series = None
self.series_index = None self.series_index = None

View File

@ -0,0 +1,155 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Read/Write metadata from Open Packaging Format (.opf) files.'''
import sys
from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
class OPFReader(MetaInformation):
    '''
    Read-only view of the metadata stored in an Open Packaging Format (.opf)
    document.

    The document is parsed once in the constructor; every metadata field is
    exposed as a property that is looked up in the parsed tree on access.
    '''

    def __init__(self, stream):
        '''
        @param stream: File-like object with a C{read} method. If it has a
                       C{seek} method it is rewound first; if it has a
                       C{name} attribute that is used as the fallback title.
        '''
        # MetaInformation.__init__ is deliberately not called: it assigns
        # to attributes such as title and authors, which are read-only
        # properties on this class.
        self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown'
        if hasattr(stream, 'seek'):
            stream.seek(0)
        self.soup = BeautifulStoneSoup(stream.read())
        self.series = self.series_index = self.rating = None

    @staticmethod
    def _attr(tag, name):
        '''Return attribute C{name} of C{tag}, falling back to C{opf:name}.'''
        value = tag.get(name)
        if not value:
            value = tag.get('opf:' + name)
        return value

    def _metadata_tags(self, name):
        '''
        Return all C{name} elements inside C{<package><metadata>}, or an
        empty list when the document has no such section.
        '''
        package = self.soup.package
        metadata = package.metadata if package is not None else None
        if metadata is None:
            return []
        return metadata.findAll(name)

    def _author_tag(self):
        '''Return the first C{dc:creator} whose role is C{aut}, or None.'''
        for elem in self._metadata_tags('dc:creator'):
            if self._attr(elem, 'role') == 'aut':
                return elem
        return None

    def _first_string(self, name):
        '''Return the string of the first C{name} element in the document, or None.'''
        tag = self.soup.find(name)
        if tag is not None:
            return tag.string
        return None

    @property
    def title(self):
        '''Title from C{dc:title}, or the fallback title when absent or empty.'''
        titles = self._metadata_tags('dc:title')
        if titles and titles[0].string:
            return titles[0].string
        return self.default_title

    @property
    def authors(self):
        '''
        List of author names taken from the first C{dc:creator} with role
        C{aut}. Names separated by C{,} or C{&} are split and stripped.
        Returns [] when there is no author.
        '''
        elem = self._author_tag()
        if elem is None or not elem.string:
            return []
        # NOTE(review): depending on how the parser is configured the XML
        # entity may arrive unconverted; normalise it so that splitting on
        # '&' does not leave 'amp;' fragments behind.
        raw = elem.string.replace('&amp;', '&')
        names = []
        for chunk in raw.split(','):
            names.extend(chunk.split('&'))
        return [name.strip() for name in names if name.strip()]

    @property
    def author_sort(self):
        '''The C{file-as} value of the author element, or None.'''
        elem = self._author_tag()
        if elem is None:
            return None
        file_as = self._attr(elem, 'file-as')
        return file_as if file_as else None

    @property
    def title_sort(self):
        '''Always None: no title sort value is read from the document.'''
        return None

    @property
    def comments(self):
        '''Text of the first C{dc:description} element, or None.'''
        return self._first_string('dc:description')

    @property
    def category(self):
        '''Text of the first C{dc:type} element, or None.'''
        return self._first_string('dc:type')

    @property
    def publisher(self):
        '''Text of the first C{dc:publisher} element, or None.'''
        return self._first_string('dc:publisher')

    @property
    def isbn(self):
        '''Text of the first C{dc:identifier} whose scheme is ISBN, or None.'''
        for item in self._metadata_tags('dc:identifier'):
            scheme = self._attr(item, 'scheme')
            # Identifiers without any scheme attribute are legal; skip them
            # instead of calling lower() on None.
            if scheme and scheme.lower() == 'isbn':
                return item.string
        return None

    @property
    def cover(self):
        '''
        The C{href} of the first C{<guide>} reference whose type marks it
        as a cover, or None.
        '''
        package = self.soup.package
        guide = package.find('guide') if package is not None else None
        if guide is None:
            return None
        for reference in guide.findAll('reference'):
            kind = reference.get('type')
            if kind and kind.lower() in ('cover', 'other.ms-coverimage-standard'):
                return reference.get('href')
        return None

    def possible_cover_prefixes(self):
        '''
        Return the text of every C{dc:identifier} (any scheme) with hyphens
        removed. Identifiers without text are skipped.
        '''
        prefixes = []
        for item in self._metadata_tags('dc:identifier'):
            if item.string:
                prefixes.append(item.string.replace('-', ''))
        return prefixes
def main(args=sys.argv):
    '''
    Command line entry point. Currently a stub: C{args} is ignored and 0
    is always returned.
    '''
    return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -86,6 +86,11 @@ def get_metadata(stream):
if category_match: if category_match:
category = category_match.group(1).strip() category = category_match.group(1).strip()
mi = MetaInformation(title, author) mi = MetaInformation(title, author)
if author:
au = author.split(',')
mi.authors = []
for i in au:
mi.authors.extend(i.split('&'))
mi.comments = comment mi.comments = comment
mi.category = category mi.category = category
return mi return mi

View File

@ -68,8 +68,6 @@ class LibraryDatabase(object):
mi.title = title mi.title = title
if mi.category: if mi.category:
tags.append(mi.category) tags.append(mi.category)
if mi.classification:
tags.append(mi.classification)
if tags: if tags:
tags = ', '.join(tags) tags = ', '.join(tags)
else: else:

View File

@ -23,6 +23,7 @@ class Dialog(QObject):
self.dialog = QDialog(window) self.dialog = QDialog(window)
self.accept = self.dialog.accept self.accept = self.dialog.accept
self.reject = self.dialog.reject self.reject = self.dialog.reject
self._close_event = self.dialog.closeEvent
self.dialog.closeEvent = self.close_event self.dialog.closeEvent = self.close_event
self.window = window self.window = window
self.isVisible = self.dialog.isVisible self.isVisible = self.dialog.isVisible

View File

@ -265,7 +265,7 @@ class Main(QObject, Ui_MainWindow):
formats.append(format) formats.append(format)
metadata.append(mi) metadata.append(mi)
names.append(os.path.basename(book)) names.append(os.path.basename(book))
infos.append({'title':mi.title, 'authors':mi.author, infos.append({'title':mi.title, 'authors':', '.join(mi.authors),
'cover':self.default_thumbnail, 'tags':[]}) 'cover':self.default_thumbnail, 'tags':[]})
if not to_device: if not to_device:

View File

@ -81,7 +81,9 @@ def setup_completion():
f.write(opts_and_exts('lit2lrf', htmlop, ['lit'])) f.write(opts_and_exts('lit2lrf', htmlop, ['lit']))
f.write(opts_and_exts('rtf2lrf', htmlop, ['rtf'])) f.write(opts_and_exts('rtf2lrf', htmlop, ['rtf']))
f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf'])) f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf']))
f.write(opts_and_exts('lrf-meta', metaop, ['lrf'])) f.write(opts_and_exts('any2lrf', htmlop,
['htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', 'txt', 'lit', 'rtf', 'pdf']))
f.write(opts_and_exts('lrf-meta', metaop, ['lrf']))
f.write(''' f.write('''
_prs500_ls() _prs500_ls()
{ {