mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make plumber archive handling code re-useable
This commit is contained in:
parent
0483d5f22f
commit
1fe527e351
47
src/calibre/ebooks/conversion/archives.py
Normal file
47
src/calibre/ebooks/conversion/archives.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
from calibre import extract, filesystem_encoding, walk
|
||||||
|
|
||||||
|
# Input format names that are treated as archives. unarchive() removes these
# from the set of candidate e-book formats when looking inside an archive.
ARCHIVE_FMTS = ('zip', 'rar', 'oebzip')
|
||||||
|
|
||||||
|
|
||||||
|
def unarchive(path, tdir):
    '''
    Extract the archive at `path` into `tdir` and choose the file inside it
    that should be used as the conversion input.

    A file whose name ends with a known input format extension is preferred.
    The HTML family and the archive formats themselves are excluded from
    this first pass, and ``txt``/``rtf`` files smaller than 2048 bytes are
    skipped. If no such file is found, the choice is delegated to
    :func:`find_html_index`.

    :param path: Path to the archive file.
    :param tdir: Directory into which the archive is extracted.
    :return: A ``(file_path, format)`` tuple.
    :raises ValueError: Propagated from :func:`find_html_index` when the
        archive contains no usable file.
    '''
    extract(path, tdir)
    # walk() may yield bytes paths; normalise everything to str.
    files = [
        f if isinstance(f, str) else f.decode(filesystem_encoding)
        for f in walk(tdir)
    ]
    # Kept as a function-local import, as in the original code
    # (presumably to avoid an import cycle -- confirm before moving it).
    from calibre.customize.ui import available_input_formats
    fmts = set(available_input_formats())
    fmts -= {'htm', 'html', 'xhtm', 'xhtml'}
    fmts -= set(ARCHIVE_FMTS)

    # Iterate in sorted order: the iteration order of a set of strings can
    # differ between interpreter runs, which would make the chosen file
    # unpredictable when the archive contains several supported formats.
    for ext in sorted(fmts):
        suffix = '.' + ext
        for f in files:
            if not f.lower().endswith(suffix):
                continue
            # Tiny text files are skipped (presumably readme/licence
            # files rather than the actual book).
            if ext in ('txt', 'rtf') and os.stat(f).st_size < 2048:
                continue
            return f, ext
    return find_html_index(files)
|
||||||
|
|
||||||
|
|
||||||
|
def find_html_index(files):
    '''
    Pick the file from `files` that is most likely the root HTML document.

    Only paths ending in ``.htm``, ``.html``, ``.xhtm`` or ``.xhtml``
    (case-insensitive) are considered. A file whose base name is ``toc``
    is preferred, then one named ``index``; when several share the name,
    the smallest one wins. Otherwise the largest HTML file is returned.

    :param files: List of file paths that exist on disk.
    :return: A ``(file_path, extension)`` tuple, the extension lower-cased
        and without the leading dot.
    :raises ValueError: If `files` contains no HTML file.
    '''
    html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
    candidates = [path for path in files if html_pat.search(path)]
    if not candidates:
        raise ValueError(_('Could not find an e-book inside the archive'))

    # Order by file size, smallest first (the sort is stable).
    candidates = sorted(candidates, key=lambda path: os.stat(path).st_size)

    def with_ext(path):
        return path, os.path.splitext(path)[1].lower()[1:]

    for wanted in ('toc', 'index'):
        hit = next(
            (p for p in candidates
             if os.path.splitext(os.path.basename(p))[0].lower() == wanted),
            None,
        )
        if hit is not None:
            return with_ext(hit)
    # No conventional root name found: fall back to the largest file.
    return with_ext(candidates[-1])
|
@ -5,12 +5,11 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import pprint
|
import pprint
|
||||||
import re
|
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
from calibre import extract, filesystem_encoding, get_types_map, isbytestring, walk
|
from calibre import filesystem_encoding, get_types_map, isbytestring
|
||||||
from calibre.constants import __version__
|
from calibre.constants import __version__
|
||||||
from calibre.customize.conversion import DummyReporter, OptionRecommendation
|
from calibre.customize.conversion import DummyReporter, OptionRecommendation
|
||||||
from calibre.customize.ui import (
|
from calibre.customize.ui import (
|
||||||
@ -23,6 +22,7 @@ from calibre.customize.ui import (
|
|||||||
run_plugins_on_postprocess,
|
run_plugins_on_postprocess,
|
||||||
run_plugins_on_preprocess,
|
run_plugins_on_preprocess,
|
||||||
)
|
)
|
||||||
|
from calibre.ebooks.conversion.archives import ARCHIVE_FMTS, unarchive
|
||||||
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
|
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
|
||||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||||
from calibre.utils.date import parse_date
|
from calibre.utils.date import parse_date
|
||||||
@ -74,9 +74,6 @@ class CompositeProgressReporter:
|
|||||||
self.global_reporter(global_frac, msg)
|
self.global_reporter(global_frac, msg)
|
||||||
|
|
||||||
|
|
||||||
ARCHIVE_FMTS = ('zip', 'rar', 'oebzip')
|
|
||||||
|
|
||||||
|
|
||||||
class Plumber:
|
class Plumber:
|
||||||
'''
|
'''
|
||||||
The `Plumber` manages the conversion pipeline. An UI should call the methods
|
The `Plumber` manages the conversion pipeline. An UI should call the methods
|
||||||
@ -746,7 +743,7 @@ OptionRecommendation(name='search_replace',
|
|||||||
if input_fmt in ARCHIVE_FMTS:
|
if input_fmt in ARCHIVE_FMTS:
|
||||||
self.log('Processing archive...')
|
self.log('Processing archive...')
|
||||||
tdir = PersistentTemporaryDirectory('_pl_arc')
|
tdir = PersistentTemporaryDirectory('_pl_arc')
|
||||||
self.input, input_fmt = self.unarchive(self.input, tdir)
|
self.input, input_fmt = unarchive(self.input, tdir)
|
||||||
self.archive_input_tdir = tdir
|
self.archive_input_tdir = tdir
|
||||||
if os.access(self.input, os.R_OK):
|
if os.access(self.input, os.R_OK):
|
||||||
nfp = run_plugins_on_preprocess(self.input, input_fmt)
|
nfp = run_plugins_on_preprocess(self.input, input_fmt)
|
||||||
@ -811,43 +808,6 @@ OptionRecommendation(name='search_replace',
|
|||||||
self.merge_plugin_recommendations()
|
self.merge_plugin_recommendations()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def unarchive(self, path, tdir):
|
|
||||||
extract(path, tdir)
|
|
||||||
files = list(walk(tdir))
|
|
||||||
files = [f if isinstance(f, str) else f.decode(filesystem_encoding)
|
|
||||||
for f in files]
|
|
||||||
from calibre.customize.ui import available_input_formats
|
|
||||||
fmts = set(available_input_formats())
|
|
||||||
fmts -= {'htm', 'html', 'xhtm', 'xhtml'}
|
|
||||||
fmts -= set(ARCHIVE_FMTS)
|
|
||||||
|
|
||||||
for ext in fmts:
|
|
||||||
for f in files:
|
|
||||||
if f.lower().endswith('.'+ext):
|
|
||||||
if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048:
|
|
||||||
continue
|
|
||||||
return f, ext
|
|
||||||
return self.find_html_index(files)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def find_html_index(self, files):
|
|
||||||
'''
|
|
||||||
Given a list of files, find the most likely root HTML file in the
|
|
||||||
list.
|
|
||||||
'''
|
|
||||||
html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
|
|
||||||
html_files = [f for f in files if html_pat.search(f) is not None]
|
|
||||||
if not html_files:
|
|
||||||
raise ValueError(_('Could not find an e-book inside the archive'))
|
|
||||||
html_files = [(f, os.stat(f).st_size) for f in html_files]
|
|
||||||
html_files.sort(key=lambda x: x[1])
|
|
||||||
html_files = [f[0] for f in html_files]
|
|
||||||
for q in ('toc', 'index'):
|
|
||||||
for f in html_files:
|
|
||||||
if os.path.splitext(os.path.basename(f))[0].lower() == q:
|
|
||||||
return f, os.path.splitext(f)[1].lower()[1:]
|
|
||||||
return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:]
|
|
||||||
|
|
||||||
def get_all_options(self):
|
def get_all_options(self):
|
||||||
ans = {}
|
ans = {}
|
||||||
for group in (self.input_options, self.pipeline_options,
|
for group in (self.input_options, self.pipeline_options,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user