Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-31 14:33:54 -04:00

Implemented any2lrf

This commit is contained in:
parent 709dd81a08
commit 9c82e833ac
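
A minimal usage sketch (editorial illustration, not part of the diff below): the commit wires up an any2lrf console script that picks a converter by file extension and can look inside ZIP/RAR archives. Its main() can also be called directly; the file name here is hypothetical.

    # Illustrative only -- equivalent to running `any2lrf mybook.zip` from a shell.
    from libprs500.ebooks.lrf.any.convert_from import main

    # The archive is extracted, the largest LIT/RTF/PDF/TXT/HTML file inside is
    # located, and it is converted to an .lrf file in the current directory.
    ret = main(['any2lrf', 'mybook.zip'])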
1  setup.py

@@ -31,6 +31,7 @@ entry_points = {
                'web2disk = libprs500.web.fetch.simple:main',\
                'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',\
                'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',\
                'any2lrf = libprs500.ebooks.lrf.any.convert_from:main',\
                'libprs500-beta = libprs500.gui2.main:main',\
            ],
        'gui_scripts' : [ APPNAME+' = libprs500.gui.main:main']
@@ -64,14 +64,13 @@ def filename_to_utf8(name):
    return name.decode(codec, 'replace').encode('utf8')

def extract(path, dir):
    import os
    ext = os.path.splitext(path)[1][1:].lower()
    extractor = None
    if ext == 'zip':
        from libprs500.libunzip import extract
        extractor = extract
    elif ext == 'rar':
        from libprs500.libunrar import extract
        from libprs500.libunrar import extract # In case the dll is not found
        extractor = extract
    if not extractor:
        raise Exception('Unknown archive type')
@@ -131,7 +131,7 @@ class Device(object):
        keys C{title}, C{authors}, C{cover}, C{tags}. The value of the C{cover}
        element can be None or a three element tuple (width, height, data)
        where data is the image data in JPEG format as a string. C{tags} must be
        a possibly empty list of strings.
        a possibly empty list of strings. C{authors} must be a string.
        @param booklists: A tuple containing the result of calls to
        (L{books}(oncard=False), L{books}(oncard=True)).
        '''
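
For illustration only (not part of the commit), one entry of the metadata list described by the docstring above might look like:

    # Hypothetical values; the keys are the ones the docstring names.
    book = {
        'title':   'Some Book',
        'authors': 'Jane Doe',     # a string, per the updated docstring
        'cover':   None,           # or (width, height, jpeg_data_as_str)
        'tags':    ['fiction'],    # possibly empty list of strings
    }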
14  src/libprs500/ebooks/lrf/any/__init__.py  (new file)

@@ -0,0 +1,14 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
136  src/libprs500/ebooks/lrf/any/convert_from.py  (new file)

@@ -0,0 +1,136 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Convert any ebook file into a LRF file.'''

import sys, os, logging, shutil, tempfile, glob

from libprs500.ebooks.lrf import option_parser
from libprs500 import __appname__, setup_cli_handlers, extract
from libprs500.ebooks.lrf.lit.convert_from import process_file as lit2lrf
from libprs500.ebooks.lrf.pdf.convert_from import process_file as pdf2lrf
from libprs500.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf
from libprs500.ebooks.lrf.txt.convert_from import process_file as txt2lrf
from libprs500.ebooks.lrf.html.convert_from import process_file as html2lrf

def largest_file(files):
    maxsize, file = 0, None
    for f in files:
        size = os.stat(f).st_size
        if size > maxsize:
            maxsize = size
            file = f
    return file

def find_htmlfile(dir):
    for pair in (('*toc*.htm*', '*toc*.xhtm*'), ('*.htm*', '*.xhtm*')):
        files = glob.glob(os.path.join(dir, pair[0]))
        files += glob.glob(os.path.join(dir, pair[1]))
        file = largest_file(files)
        if file:
            return file


def handle_archive(path):
    tdir = tempfile.mkdtemp(prefix=__appname__+'_')
    extract(path, tdir)
    files = []
    cdir = tdir
    temp = os.listdir(tdir)
    file = None
    if len(temp) == 1 and os.path.isdir(os.path.join(tdir, temp[0])):
        cdir = os.path.join(tdir, temp[0])
    for ext in ('lit', 'rtf', 'pdf', 'txt'):
        pat = os.path.join(cdir, '*.'+ext)
        files.extend(glob.glob(pat))
    file = largest_file(files)
    if file:
        return tdir, file
    file = find_htmlfile(cdir)
    return tdir, file

def process_file(path, options, logger=None):
    path = os.path.abspath(os.path.expanduser(path))
    tdir = None
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('any2lrf')
        setup_cli_handlers(logger, level)
    if not os.access(path, os.R_OK):
        logger.critical('Cannot read from %s', path)
        return 1
    ext = os.path.splitext(path)[1]
    if not ext or ext == '.':
        logger.critical('Unknown file type: %s', path)
        return 1
    ext = ext[1:].lower()
    cwd = os.getcwd()
    if not options.output:
        fmt = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.splitext(os.path.basename(path))[0] + fmt
    options.output = os.path.abspath(os.path.expanduser(options.output))
    if ext in ['zip', 'rar']:
        newpath = None
        try:
            tdir, newpath = handle_archive(path)
        except:
            logger.exception(' ')
        if not newpath:
            logger.critical('Could not find ebook in archive')
            return 1
        path = newpath
        logger.info('Found ebook in archive: %s', path)
    try:
        ext = os.path.splitext(path)[1][1:].lower()
        convertor = None
        if 'htm' in ext:
            convertor = html2lrf
        elif 'lit' == ext:
            convertor = lit2lrf
        elif 'pdf' == ext:
            convertor = pdf2lrf
        elif 'rtf' == ext:
            convertor = rtf2lrf
        elif 'txt' == ext:
            convertor = txt2lrf
        convertor(path, options, logger)
    finally:
        os.chdir(cwd)
        if tdir and os.path.exists(tdir):
            shutil.rmtree(tdir)


def main(args=sys.argv, logger=None):
    parser = option_parser('''\
any2lrf myfile

Convert any ebook format into LRF. Supported formats are:
LIT, RTF, TXT, HTML and PDF. any2lrf will also process a RAR or
ZIP archive.
''')
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print
        print 'No file to convert specified.'
        return 1

    process_file(args[1], options, logger)
    return 0

if __name__ == '__main__':
    sys.exit(main())
@@ -20,20 +20,18 @@ Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
"""
import os, re, sys, shutil, copy, glob, logging
import os, re, sys, copy, glob, logging
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
from tempfile import mkdtemp
from operator import itemgetter
from math import ceil, floor
try:
    from PIL import Image as PILImage
except ImportError:
    import Image as PILImage

from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \
        Comment, Tag, NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
        NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
        TextBlock, ImageBlock, JumpButton, CharButton, \
        Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
@@ -43,8 +41,9 @@ from libprs500.ebooks.lrf import Book
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import extract, filename_to_utf8, setup_cli_handlers
from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__
from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.metadata.opf import OPFReader

class Span(_Span):
    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
@@ -643,7 +642,7 @@ class HTMLConverter(object):
            except Exception:
                self.logger.warning('Unable to process %s', path)
                if self.verbose:
                    self.logger.exception('')
                    self.logger.exception(' ')
                continue
            finally:
                os.chdir(cwd)
@@ -1291,15 +1290,13 @@ def process_file(path, options, logger=None):
        logger = logging.getLogger('html2lrf')
        setup_cli_handlers(logger, level)
    cwd = os.getcwd()
    dirpath = None
    default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
    dirpath = os.path.dirname(path)
    try:
        dirpath, path = get_path(path)
        cpath, tpath = '', ''
        try_opf(path, options, logger)
        if options.cover:
            dp = dirpath if dirpath else os.path.dirname(path)
            cpath = os.path.join(dp, os.path.basename(options.cover))
        if options.cover:
            cpath = os.path.join(dirpath, os.path.basename(options.cover))
            if not os.path.exists(cpath):
                cpath = os.path.abspath(os.path.expanduser(options.cover))
            options.cover = cpath
@@ -1309,7 +1306,7 @@ def process_file(path, options, logger=None):
            cim = im.resize((options.profile.screen_width,
                             options.profile.screen_height),
                            PILImage.BICUBIC).convert('RGB')
            cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
            cf = PersistentTemporaryFile(prefix=__appname__+"_", suffix=".jpg")
            cf.close()
            cim.save(cf.name)
            cpath = cf.name
@@ -1376,70 +1373,57 @@ def process_file(path, options, logger=None):
        return oname
    finally:
        os.chdir(cwd)
        if dirpath:
            shutil.rmtree(dirpath, True)

def try_opf(path, options, logger):
    try:
        opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
    except IndexError:
        return
    soup = BeautifulStoneSoup(open(opf).read())
    opf = OPFReader(open(opf, 'rb'))
    try:
        title = soup.package.metadata.find('dc:title')
        title = opf.title
        if title and not options.title:
            options.title = title.string
        creators = soup.package.metadata.findAll('dc:creator')
            options.title = title
        if options.author == 'Unknown':
            for author in creators:
                role = author.get('role')
                if not role:
                    role = author.get('opf:role')
                if role == 'aut':
                    options.author = author.string
                    fa = author.get('file-as')
                    if fa:
                        options.author_sort = fa
            if opf.authors:
                options.author = ', '.join(opf.authors)
            if opf.author_sort:
                options.author_sort = opf.author_sort
        if options.publisher == 'Unknown':
            publisher = soup.package.metadata.find('dc:publisher')
            publisher = opf.publisher
            if publisher:
                options.publisher = publisher.string
        if not options.category.strip():
            category = soup.package.metadata.find('dc:type')
                options.publisher = publisher
        if not options.category:
            category = opf.category
            if category:
                options.category = category.string
        isbn = []
        for item in soup.package.metadata.findAll('dc:identifier'):
            scheme = item.get('scheme')
            if not scheme:
                scheme = item.get('opf:scheme')
            isbn.append((scheme, item.string))
        if not options.cover:
            for item in isbn:
                src = item[1].replace('-', '')
                matches = glob.glob(os.path.join(os.path.dirname(path), src+'.*'))
                for match in matches:
                    test = os.path.splitext(match)[1].lower()
                    if test in ['.jpeg', '.jpg', '.gif', '.png']:
                        options.cover = match
                        break


                options.category = category
        if not options.cover:
            # Search for cover image in opf as created by convertlit
            ref = soup.package.find('reference', {'type':'other.ms-coverimage-standard'})
            if ref:
                try:
                    options.cover = os.path.join(os.path.dirname(path), ref.get('href'))
                    if not os.access(options.cover, os.R_OK):
                        options.cover = None
                except:
                    logger.exception('Could not load cover')
            cover = opf.cover
            if cover:
                cover = os.path.join(os.path.dirname(path), cover)
                if os.access(cover, os.R_OK):
                    try:
                        PILImage.open(cover)
                        options.cover = cover
                    except:
                        pass
        if not options.cover:
            for prefix in opf.possible_cover_prefixes():
                if options.cover:
                    break
                for suffix in ['.jpg', '.jpeg', '.gif', '.png', '.bmp']:
                    cpath = os.path.join(os.path.dirname(path), prefix+suffix)
                    try:
                        PILImage.open(cpath)
                        options.cover = cpath
                        break
                    except:
                        continue
    except Exception:
        logger.exception('Failed to process opf file')

def option_parser():
    return lrf_option_parser('''Usage: %prog [options] mybook.[html|rar|zip]\n\n'''
    return lrf_option_parser('''Usage: %prog [options] mybook.html\n\n'''
            '''%prog converts mybook.html to mybook.lrf''')

def main(args=sys.argv):
@@ -1461,66 +1445,6 @@ def main(args=sys.argv):
    process_file(src, options)
    return 0

def console_query(dirpath, candidate, docs):
    if len(docs) == 1:
        return 0
    try:
        import readline
    except ImportError:
        pass
    i = 0
    for doc in docs:
        prefix = '>' if i == candidate else ''
        print prefix+str(i)+'.\t', doc[0]
        i += 1
    print
    while True:
        try:
            choice = raw_input('Choose file to convert (0-'+str(i-1) + \
                               '). Current choice is ['+ str(candidate) + ']:')
            if not choice:
                return candidate
            choice = int(choice)
            if choice < 0 or choice >= i:
                continue
            candidate = choice
        except EOFError, KeyboardInterrupt:
            sys.exit()
        except:
            continue
        break
    return candidate


def get_path(path, query=console_query):
    path = os.path.abspath(os.path.expanduser(path))
    ext = os.path.splitext(path)[1][1:].lower()
    if ext in ['htm', 'html', 'xhtml', 'php']:
        return None, path
    dirpath = mkdtemp('','html2lrf')
    extract(path, dirpath)
    candidate, docs = None, []
    for root, dirs, files in os.walk(dirpath):
        for name in files:
            ext = os.path.splitext(name)[1][1:].lower()
            if ext not in ['html', 'xhtml', 'htm', 'xhtm']:
                continue
            docs.append((name, root, os.stat(os.path.join(root, name)).st_size))
            if 'toc' in name.lower():
                candidate = name
    docs.sort(key=itemgetter(2))
    if candidate:
        for i in range(len(docs)):
            if docs[i][0] == candidate:
                candidate = i
                break
    else:
        candidate = len(docs) - 1
    if len(docs) == 0:
        raise ConversionError('No suitable files found in archive')
    if len(docs) > 0:
        candidate = query(dirpath, candidate, docs)
    return dirpath, os.path.join(docs[candidate][1], docs[candidate][0])


if __name__ == '__main__':
@@ -12,13 +12,13 @@
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, sys, shutil, glob
import os, sys, shutil, glob, logging
from tempfile import mkdtemp
from subprocess import Popen, PIPE
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500 import isosx, __appname__
from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
from libprs500 import isosx, __appname__, setup_cli_handlers
CLIT = 'clit'
if isosx and hasattr(sys, 'frameworks_dir'):
    CLIT = os.path.join(sys.frameworks_dir, CLIT)
@@ -29,29 +29,27 @@ def option_parser():
        '''%prog converts mybook.lit to mybook.lrf'''
        )

def generate_html(pathtolit):
def generate_html(pathtolit, logger):
    if not os.access(pathtolit, os.R_OK):
        raise ConversionError, 'Cannot read from ' + pathtolit
    tdir = mkdtemp(prefix=__appname__+'_')
    cmd = ' '.join([CLIT, '"'+pathtolit+'"', tdir])
    p = Popen(cmd, shell=True, stderr=PIPE)
    p = Popen(cmd, shell=True, stderr=PIPE, stdout=PIPE)
    ret = p.wait()
    logger.info(p.stdout.read())
    if ret != 0:
        shutil.rmtree(tdir)
        err = p.stderr.read()
        raise ConversionError, err
    return tdir

def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print
        print 'No lit file specified'
        return 1
    lit = os.path.abspath(os.path.expanduser(args[1]))
    tdir = generate_html(lit)
def process_file(path, options, logger=None):
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('lit2lrf')
        setup_cli_handlers(logger, level)
    lit = os.path.abspath(os.path.expanduser(path))
    tdir = generate_html(lit, logger)
    try:
        l = glob.glob(os.path.join(tdir, '*toc*.htm*'))
        if not l:
@@ -61,7 +59,9 @@ def main(args=sys.argv):
        if not l:
            l = glob.glob(os.path.join(tdir, '*.htm*'))
            if not l:
                raise ConversionError, 'Conversion of lit to html failed. Cannot find html file.'
                l = glob.glob(os.path.join(tdir, '*.txt*')) # Some lit files apparently have .txt files in them
                if not l:
                    raise ConversionError('Conversion of lit to html failed. Cannot find html file.')
        maxsize, htmlfile = 0, None
        for c in l:
            sz = os.path.getsize(c)
@@ -71,13 +71,24 @@ def main(args=sys.argv):
            htmlfile = l[0]
        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext)
        else:
            options.output = os.path.abspath(options.output)
        process_file(htmlfile, options)
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))
        html_process_file(htmlfile, options, logger=logger)
    finally:
        shutil.rmtree(tdir)


def main(args=sys.argv, logger=None):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print
        print 'No lit file specified'
        return 1
    process_file(args[1], options, logger)
    return 0


if __name__ == '__main__':
    sys.exit(main())
@@ -254,19 +254,35 @@ def get_metadata(stream):
    L{MetaInformation} object.
    """
    lrf = LRFMetaFile(stream)
    mi = MetaInformation(lrf.title.strip(), lrf.author.strip())
    au = lrf.author.strip().split(',')
    authors = []
    for i in au:
        authors.extend(i.split('&'))
    mi = MetaInformation(lrf.title.strip(), authors)
    mi.author = lrf.author.strip()
    mi.comments = lrf.free_text.strip()
    mi.category = lrf.category.strip()
    mi.classification = lrf.classification.strip()
    mi.category = lrf.category.strip()+', '+lrf.classification.strip()
    mi.publisher = lrf.publisher.strip()
    try:
        mi.title_sort = lrf.title_reading.strip()
        if not mi.title_sort:
            mi.title_sort = None
    except:
        pass
    try:
        mi.author_sort = lrf.author_reading.strip()
        if not mi.author_sort:
            mi.author_sort = None
    except:
        pass
    if not mi.title or 'unknown' in mi.title.lower():
        mi.title = None
    if not mi.authors:
        mi.authors = None
    if not mi.author or 'unknown' in mi.author.lower():
        mi.author = None
    if not mi.category or 'unknown' in mi.category.lower():
        mi.category = None
    if not mi.classification or 'unknown' in mi.classification.lower():
        mi.classification = None
    if not mi.publisher or 'unknown' in mi.publisher.lower() or \
            'some publisher' in mi.publisher.lower():
        mi.publisher = None
@@ -15,19 +15,19 @@
from libprs500 import filename_to_utf8
''''''

import sys, os, subprocess
from libprs500 import isosx
import sys, os, subprocess, logging
from libprs500 import isosx, setup_cli_handlers
from libprs500.ebooks import ConversionError
from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file

PDFTOHTML = 'pdftohtml'
if isosx and hasattr(sys, 'frameworks_dir'):
    PDFTOHTML = os.path.join(sys.frameworks_dir, PDFTOHTML)


def generate_html(pathtopdf):
def generate_html(pathtopdf, logger):
    '''
    Convert the pdf into html.
    @return: A closed PersistentTemporaryFile.
@@ -41,8 +41,10 @@ def generate_html(pathtopdf):
    cwd = os.getcwd()
    try:
        os.chdir(os.path.dirname(pf.name))
        p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
        p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE,
                             stdout=subprocess.PIPE)
        ret = p.wait()
        logger.info(p.stdout.read())
        if ret != 0:
            err = p.stderr.read()
            raise ConversionError, err
@@ -56,8 +58,25 @@ def option_parser():
        '''%prog converts mybook.pdf to mybook.lrf\n\n'''
        )

def process_file(path, options, logger=None):
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('pdf2lrf')
        setup_cli_handlers(logger, level)
    pdf = os.path.abspath(os.path.expanduser(path))
    htmlfile = generate_html(pdf, logger)
    if not options.output:
        ext = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
    else:
        options.output = os.path.abspath(options.output)
    options.pdftohtml = True
    if not options.title:
        options.title = filename_to_utf8(os.path.splitext(os.path.basename(options.output))[0])
    html_process_file(htmlfile.name, options, logger)

def main(args=sys.argv):

def main(args=sys.argv, logger=None):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
@@ -65,17 +84,7 @@ def main(args=sys.argv):
        print
        print 'No pdf file specified'
        return 1
    pdf = os.path.abspath(os.path.expanduser(args[1]))
    htmlfile = generate_html(pdf)
    if not options.output:
        ext = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext)
    else:
        options.output = os.path.abspath(options.output)
    options.pdftohtml = True
    if not options.title:
        options.title = filename_to_utf8(os.path.splitext(os.path.basename(options.output))[0])
    process_file(htmlfile.name, options)
    process_file(args[1], options, logger)
    return 0

if __name__ == '__main__':
@@ -12,13 +12,13 @@
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, sys, tempfile, subprocess, shutil
import os, sys, tempfile, subprocess, shutil, logging

from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.metadata.meta import get_metadata
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
from libprs500.ebooks import ConversionError
from libprs500 import isosx
from libprs500 import isosx, setup_cli_handlers

UNRTF = 'unrtf'
if isosx and hasattr(sys, 'frameworks_dir'):
@@ -30,50 +30,47 @@ def option_parser():
        '''%prog converts mybook.rtf to mybook.lrf'''
        )

def generate_html(rtfpath):
def generate_html(rtfpath, logger):
    tdir = tempfile.mkdtemp(prefix='rtf2lrf_')
    cwd = os.path.abspath(os.getcwd())
    os.chdir(tdir)
    try:
        print 'Converting to HTML...',
        logger.info('Converting to HTML...')
        sys.stdout.flush()
        handle, path = tempfile.mkstemp(dir=tdir, suffix='.html')
        file = os.fdopen(handle, 'wb')
        cmd = ' '.join([UNRTF, '"'+rtfpath+'"'])
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        file.write(p.stdout.read())
        ret = p.wait()
        if ret != 0:
            if isosx and ret == -11: #unrtf segfaults on OSX but seems to convert most of the file.
                file.write('</body>\n</html>')
            else:
                logger.critical(p.stderr.read())
                raise ConversionError, 'unrtf failed with error code: %d'%(ret,)
        print 'done'
        file.close()
        return path
    finally:
        os.chdir(cwd)

def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print
        print 'No rtf file specified'
        return 1
    rtf = os.path.abspath(os.path.expanduser(args[1]))
def process_file(path, options, logger=None):
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('rtf2lrf')
        setup_cli_handlers(logger, level)
    rtf = os.path.abspath(os.path.expanduser(path))
    f = open(rtf, 'rb')
    mi = get_metadata(f, 'rtf')
    f.close()
    html = generate_html(rtf)
    html = generate_html(rtf, logger)
    tdir = os.path.dirname(html)
    try:
        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext)
        else:
            options.output = os.path.abspath(options.output)
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))
        if (not options.title or options.title == 'Unknown') and mi.title:
            sys.argv.append('-t')
            sys.argv.append('"'+mi.title+'"')
@@ -86,9 +83,22 @@ def main(args=sys.argv):
        if (not options.freetext or options.freetext == 'Unknown') and mi.comments:
            sys.argv.append('--comment')
            sys.argv.append('"'+mi.comments+'"')
        process_file(html, options)
        html_process_file(html, options, logger)
    finally:
        shutil.rmtree(tdir)

def main(args=sys.argv, logger=None):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print
        print 'No rtf file specified'
        return 1
    process_file(args[1], options, logger)
    return 0


if __name__ == '__main__':
    sys.exit(main())
@@ -15,14 +15,14 @@
"""
Convert .txt files to .lrf
"""
import os, sys, codecs
import os, sys, codecs, logging

from libprs500 import iswindows
from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.html.convert_from import process_file as html_process_file
from libprs500.ebooks.markdown import markdown
from libprs500 import setup_cli_handlers

def option_parser():
    parser = lrf_option_parser('''Usage: %prog [options] mybook.txt\n\n'''
@@ -65,7 +65,24 @@ def generate_html(txtfile, encoding):
    codecs.open(p.name, 'wb', enc).write(html)
    return p

def main(args=sys.argv):
def process_file(path, options, logger=None):
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('txt2lrf')
        setup_cli_handlers(logger, level)
    txt = os.path.abspath(os.path.expanduser(path))
    if not hasattr(options, 'encoding'):
        options.encoding = None
    htmlfile = generate_html(txt, options.encoding)
    options.force_page_break = 'h2'
    if not options.output:
        ext = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
    options.output = os.path.abspath(os.path.expanduser(options.output))

    html_process_file(htmlfile.name, options, logger)

def main(args=sys.argv, logger=None):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
@@ -73,16 +90,8 @@ def main(args=sys.argv):
        print
        print 'No txt file specified'
        return 1
    txt = os.path.abspath(os.path.expanduser(args[1]))
    htmlfile = generate_html(txt, options.encoding)
    options.force_page_break = 'h2'
    if not options.output:
        ext = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.abspath(os.path.basename(os.path.splitext(args[1])[0]) + ext)
    else:
        options.output = os.path.abspath(options.output)

    process_file(htmlfile.name, options)
    process_file(args[1], options, logger)
    return 0

if __name__ == '__main__':
    sys.exit(main())
@@ -40,12 +40,20 @@ def get_parser(extension):
class MetaInformation(object):
    '''Convenient encapsulation of book metadata'''

    def __init__(self, title, author):
    def __init__(self, title, authors):
        '''
        @param title: title or "Unknown"
        @param authors: List of strings or []
        '''
        self.title = title
        self.author = author
        self.author = authors # Needed for backward compatibility
        #: List of strings or []
        self.authors = authors
        #: Sort text for author
        self.author_sort = None
        self.title_sort = None
        self.comments = None
        self.category = None
        self.classification = None
        self.publisher = None
        self.series = None
        self.series_index = None
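
Sketch of the changed constructor (illustrative, not part of the diff): the title stays a string, authors becomes a list, and the old author attribute is kept as an alias.

    # Illustrative only.
    mi = MetaInformation('Some Book', ['Jane Doe', 'John Roe'])
    assert mi.authors == ['Jane Doe', 'John Roe']
    assert mi.author == mi.authors    # retained for backward compatibility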
155  src/libprs500/ebooks/metadata/opf.py  (new file)

@@ -0,0 +1,155 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Read/Write metadata from Open Packaging Format (.opf) files.'''

import sys

from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup

class OPFReader(MetaInformation):

    def __init__(self, stream):
        self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown'
        if hasattr(stream, 'seek'):
            stream.seek(0)
        self.soup = BeautifulStoneSoup(stream.read())
        self.series = self.series_index = self.rating = None

    @apply
    def title():
        doc = '''title'''
        def fget(self):
            title = self.soup.package.metadata.find('dc:title')
            if title:
                return title.string
            return self.default_title
        return property(doc=doc, fget=fget)

    @apply
    def authors():
        doc = '''authors'''
        def fget(self):
            creators = self.soup.package.metadata.findAll('dc:creator')
            for elem in creators:
                role = elem.get('role')
                if not role:
                    role = elem.get('opf:role')
                if role == 'aut':
                    au = elem.string.split(',')
                    ans = []
                    for i in au:
                        ans.extend(i.split('&'))
                    return ans
            return None
        return property(doc=doc, fget=fget)

    @apply
    def author_sort():
        doc = '''author sort'''
        def fget(self):
            creators = self.soup.package.metadata.findAll('dc:creator')
            for elem in creators:
                role = elem.get('role')
                if not role:
                    role = elem.get('opf:role')
                if role == 'aut':
                    fa = elem.get('file-as')
                    return fa if fa else None
        return property(doc=doc, fget=fget)

    @apply
    def title_sort():
        doc = 'title sort'
        def fget(self):
            return None
        return property(doc=doc, fget=fget)

    @apply
    def comments():
        doc = 'comments'
        def fget(self):
            comments = self.soup.find('dc:description')
            if comments:
                return comments.string
            return None
        return property(doc=doc, fget=fget)

    @apply
    def category():
        doc = 'category'
        def fget(self):
            category = self.soup.find('dc:type')
            if category:
                return category.string
            return None
        return property(doc=doc, fget=fget)

    @apply
    def publisher():
        doc = 'publisher'
        def fget(self):
            publisher = self.soup.find('dc:publisher')
            if publisher:
                return publisher.string
            return None
        return property(doc=doc, fget=fget)

    @apply
    def isbn():
        doc = 'ISBN number'
        def fget(self):
            for item in self.soup.package.metadata.findAll('dc:identifier'):
                scheme = item.get('scheme')
                if not scheme:
                    scheme = item.get('opf:scheme')
                if scheme.lower() == 'isbn':
                    return item.string
            return None
        return property(doc=doc, fget=fget)

    @apply
    def cover():
        doc = 'cover'
        def fget(self):
            guide = self.soup.package.find('guide')
            if guide:
                references = guide.findAll('reference')
                for reference in references:
                    type = reference.get('type')
                    if not type:
                        continue
                    if type.lower() in ['cover', 'other.ms-coverimage-standard']:
                        return reference.get('href')
            return None
        return property(doc=doc, fget=fget)

    def possible_cover_prefixes(self):
        isbn, ans = [], []
        for item in self.soup.package.metadata.findAll('dc:identifier'):
            scheme = item.get('scheme')
            if not scheme:
                scheme = item.get('opf:scheme')
            isbn.append((scheme, item.string))
        for item in isbn:
            ans.append(item[1].replace('-', ''))
        return ans


def main(args=sys.argv):
    return 0

if __name__ == '__main__':
    sys.exit(main())
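
A minimal usage sketch for the new OPFReader (illustrative; assumes an OPF file named content.opf is on disk):

    # Illustrative only: read metadata from an OPF package document.
    from libprs500.ebooks.metadata.opf import OPFReader

    opf = OPFReader(open('content.opf', 'rb'))
    print opf.title, opf.authors, opf.publisher, opf.isbn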
@@ -86,6 +86,11 @@ def get_metadata(stream):
    if category_match:
        category = category_match.group(1).strip()
    mi = MetaInformation(title, author)
    if author:
        au = author.split(',')
        mi.authors = []
        for i in au:
            mi.authors.extend(i.split('&'))
    mi.comments = comment
    mi.category = category
    return mi
@@ -68,8 +68,6 @@ class LibraryDatabase(object):
            mi.title = title
        if mi.category:
            tags.append(mi.category)
        if mi.classification:
            tags.append(mi.classification)
        if tags:
            tags = ', '.join(tags)
        else:
@@ -23,6 +23,7 @@ class Dialog(QObject):
        self.dialog = QDialog(window)
        self.accept = self.dialog.accept
        self.reject = self.dialog.reject
        self._close_event = self.dialog.closeEvent
        self.dialog.closeEvent = self.close_event
        self.window = window
        self.isVisible = self.dialog.isVisible
@@ -265,7 +265,7 @@ class Main(QObject, Ui_MainWindow):
            formats.append(format)
            metadata.append(mi)
            names.append(os.path.basename(book))
            infos.append({'title':mi.title, 'authors':mi.author,
            infos.append({'title':mi.title, 'authors':', '.join(mi.authors),
                          'cover':self.default_thumbnail, 'tags':[]})

        if not to_device:
@@ -81,7 +81,9 @@ def setup_completion():
        f.write(opts_and_exts('lit2lrf', htmlop, ['lit']))
        f.write(opts_and_exts('rtf2lrf', htmlop, ['rtf']))
        f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf']))
        f.write(opts_and_exts('lrf-meta', metaop, ['lrf']))
        f.write(opts_and_exts('any2lrf', htmlop,
                ['htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', 'txt', 'lit', 'rtf', 'pdf']))
        f.write(opts_and_exts('lrf-meta', metaop, ['lrf']))
        f.write('''
_prs500_ls()
{