From 334ebbfe7d4979daf27f0974dff8c25d4879da5c Mon Sep 17 00:00:00 2001 From: Jonathan Jogenfors Date: Wed, 11 Feb 2026 09:37:26 +0100 Subject: [PATCH] feat: spawn external crawler --- server/Dockerfile | 6 + server/Dockerfile.dev | 6 + server/src/dtos/library.dto.ts | 5 +- server/src/repositories/storage.repository.ts | 123 +++++++++++------- server/src/services/library.service.spec.ts | 21 +-- server/src/services/library.service.ts | 35 +++-- 6 files changed, 120 insertions(+), 76 deletions(-) diff --git a/server/Dockerfile b/server/Dockerfile index a8a8b04713..a609bd8bf6 100644 --- a/server/Dockerfile +++ b/server/Dockerfile @@ -73,6 +73,12 @@ RUN --mount=type=cache,id=pnpm-plugins,target=/buildcache/pnpm-store \ FROM ghcr.io/immich-app/base-server-prod:202601131104@sha256:c649c5838b6348836d27db6d49cadbbc6157feae7a1a237180c3dec03577ba8f +RUN apt-get update && \ + apt-get install -y fd-find && \ + ln -s /usr/bin/fdfind /usr/local/bin/fd && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + WORKDIR /usr/src/app ENV NODE_ENV=production \ NVIDIA_DRIVER_CAPABILITIES=all \ diff --git a/server/Dockerfile.dev b/server/Dockerfile.dev index be752dd862..485fc8f995 100644 --- a/server/Dockerfile.dev +++ b/server/Dockerfile.dev @@ -18,6 +18,12 @@ WORKDIR /tmp/create-dep-cache RUN pnpm fetch && rm -rf /tmp/create-dep-cache && chmod -R o+rw /buildcache WORKDIR /usr/src/app +RUN apt-get update && \ + apt-get install -y fd-find && \ + ln -s /usr/bin/fdfind /usr/local/bin/fd && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + ENV PATH="${PATH}:/usr/src/app/server/bin:/usr/src/app/web/bin" \ IMMICH_ENV=development \ NVIDIA_DRIVER_CAPABILITIES=all \ diff --git a/server/src/dtos/library.dto.ts b/server/src/dtos/library.dto.ts index 3f71b8a0ed..af45fc9ec2 100644 --- a/server/src/dtos/library.dto.ts +++ b/server/src/dtos/library.dto.ts @@ -58,10 +58,7 @@ export interface CrawlOptionsDto { pathsToCrawl: string[]; includeHidden?: boolean; exclusionPatterns?: string[]; -} - -export interface WalkOptionsDto extends CrawlOptionsDto { - take: number; + take?: number; } export class ValidateLibraryDto { diff --git a/server/src/repositories/storage.repository.ts b/server/src/repositories/storage.repository.ts index 5a1a936e77..5b81d3594f 100644 --- a/server/src/repositories/storage.repository.ts +++ b/server/src/repositories/storage.repository.ts @@ -1,13 +1,13 @@ import { Injectable } from '@nestjs/common'; import archiver from 'archiver'; import chokidar, { ChokidarOptions } from 'chokidar'; -import { escapePath, glob, globStream } from 'fast-glob'; +import { spawn } from 'node:child_process'; import { constants, createReadStream, createWriteStream, existsSync, mkdirSync, ReadOptionsWithBuffer } from 'node:fs'; import fs from 'node:fs/promises'; import path from 'node:path'; import { PassThrough, Readable, Writable } from 'node:stream'; import { createGunzip, createGzip } from 'node:zlib'; -import { CrawlOptionsDto, WalkOptionsDto } from 'src/dtos/library.dto'; +import { CrawlOptionsDto } from 'src/dtos/library.dto'; import { LoggingRepository } from 'src/repositories/logging.repository'; import { mimeTypes } from 'src/utils/mime-types'; @@ -198,52 +198,87 @@ export class StorageRepository { }; } - crawl(crawlOptions: CrawlOptionsDto): Promise { + async crawl(crawlOptions: CrawlOptionsDto): Promise { const { pathsToCrawl, exclusionPatterns, includeHidden } = crawlOptions; if (pathsToCrawl.length === 0) { - return Promise.resolve([]); + return []; } - const globbedPaths = pathsToCrawl.map((path) => this.asGlob(path)); + return new Promise((resolve, reject) => { + const args: string[] = [ + '-t', + 'f', // File type: only files + '-a', // Absolute paths + '-i', // Case insensitive + '.', // Search pattern: match all files + ]; - return glob(globbedPaths, { - absolute: true, - caseSensitiveMatch: false, - onlyFiles: true, - dot: includeHidden, - ignore: exclusionPatterns, - }); - } - - async *walk(walkOptions: WalkOptionsDto): AsyncGenerator { - const { pathsToCrawl, exclusionPatterns, includeHidden } = walkOptions; - if (pathsToCrawl.length === 0) { - async function* emptyGenerator() {} - return emptyGenerator(); - } - - const globbedPaths = pathsToCrawl.map((path) => this.asGlob(path)); - - const stream = globStream(globbedPaths, { - absolute: true, - caseSensitiveMatch: false, - onlyFiles: true, - dot: includeHidden, - ignore: exclusionPatterns, - }); - - let batch: string[] = []; - for await (const value of stream) { - batch.push(value.toString()); - if (batch.length === walkOptions.take) { - yield batch; - batch = []; + if (includeHidden) { + args.push('-H'); } - } - if (batch.length > 0) { - yield batch; - } + for (const pattern of exclusionPatterns ?? []) { + args.push('-E', pattern); + } + + const extensions = mimeTypes.getSupportedFileExtensions(); + for (const ext of extensions) { + // fd expects extensions without the dot + args.push('-e', ext.replace(/^\./, '')); + } + + args.push(...pathsToCrawl); + + const fdfind = spawn('fdfind', args); + + const files: string[] = []; + let buffer = ''; + let stderr = ''; + + fdfind.stdout.on('data', (data) => { + buffer += data.toString(); + const lines = buffer.split('\n'); + // Keep the last partial line in the buffer + buffer = lines.pop() || ''; + + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed.length > 0) { + files.push(trimmed); + } + } + }); + + fdfind.stderr.on('data', (data) => { + stderr += data.toString(); + }); + + fdfind.on('close', (code) => { + // Process any remaining data in the buffer + if (buffer.length > 0) { + const trimmed = buffer.trim(); + if (trimmed.length > 0) { + files.push(trimmed); + } + } + + if (code === 0) { + resolve(files); + } else { + reject(new Error(`fdfind process exited with code ${code}: ${stderr}`)); + } + }); + + fdfind.on('error', (error) => { + if ((error as NodeJS.ErrnoException).code === 'ENOENT') { + reject( + new Error('fdfind command not found. Please install fd-find: https://github.com/sharkdp/fd#installation'), + ); + } else { + reject(new Error(`Failed to spawn fdfind: ${error.message}`)); + } + }); + }); } watch(paths: string[], options: ChokidarOptions, events: Partial) { @@ -257,10 +292,4 @@ export class StorageRepository { return () => watcher.close(); } - - private asGlob(pathToCrawl: string): string { - const escapedPath = escapePath(pathToCrawl).replaceAll('"', '["]').replaceAll("'", "[']").replaceAll('`', '[`]'); - const extensions = `*{${mimeTypes.getSupportedFileExtensions().join(',')}}`; - return `${escapedPath}/**/${extensions}`; - } } diff --git a/server/src/services/library.service.spec.ts b/server/src/services/library.service.spec.ts index dbff1ca467..52e01846bf 100644 --- a/server/src/services/library.service.spec.ts +++ b/server/src/services/library.service.spec.ts @@ -1,7 +1,6 @@ import { BadRequestException } from '@nestjs/common'; import { Stats } from 'node:fs'; import { defaults, SystemConfig } from 'src/config'; -import { JOBS_LIBRARY_PAGINATION_SIZE } from 'src/constants'; import { mapLibrary } from 'src/dtos/library.dto'; import { AssetType, CronJob, ImmichWorker, JobName, JobStatus } from 'src/enum'; import { LibraryService } from 'src/services/library.service'; @@ -14,10 +13,6 @@ import { factory, newUuid } from 'test/small.factory'; import { makeStream, newTestService, ServiceMocks } from 'test/utils'; import { vitest } from 'vitest'; -async function* mockWalk() { - yield await Promise.resolve(['/data/user1/photo.jpg']); -} - describe(LibraryService.name, () => { let sut: LibraryService; @@ -165,7 +160,7 @@ describe(LibraryService.name, () => { const library = factory.library({ importPaths: ['/foo', '/bar'] }); mocks.library.get.mockResolvedValue(library); - mocks.storage.walk.mockImplementation(mockWalk); + mocks.storage.crawl.mockResolvedValue(['/data/user1/photo.jpg']); mocks.storage.stat.mockResolvedValue({ isDirectory: () => true } as Stats); mocks.storage.checkFileExists.mockResolvedValue(true); mocks.asset.filterNewExternalAssetPaths.mockResolvedValue(['/data/user1/photo.jpg']); @@ -206,11 +201,10 @@ describe(LibraryService.name, () => { await sut.handleQueueSyncFiles({ id: library.id }); - expect(mocks.storage.walk).toHaveBeenCalledWith({ + expect(mocks.storage.crawl).toHaveBeenCalledWith({ pathsToCrawl: [library.importPaths[1]], exclusionPatterns: [], includeHidden: false, - take: JOBS_LIBRARY_PAGINATION_SIZE, }); }); }); @@ -220,7 +214,7 @@ describe(LibraryService.name, () => { const library = factory.library({ importPaths: ['/foo', '/bar'] }); mocks.library.get.mockResolvedValue(library); - mocks.storage.walk.mockImplementation(mockWalk); + mocks.storage.crawl.mockResolvedValue(['/data/user1/photo.jpg']); mocks.storage.stat.mockResolvedValue({ isDirectory: () => true } as Stats); mocks.storage.checkFileExists.mockResolvedValue(true); mocks.asset.filterNewExternalAssetPaths.mockResolvedValue(['/data/user1/photo.jpg']); @@ -262,11 +256,10 @@ describe(LibraryService.name, () => { await sut.handleQueueSyncFiles({ id: library.id }); - expect(mocks.storage.walk).toHaveBeenCalledWith({ + expect(mocks.storage.crawl).toHaveBeenCalledWith({ pathsToCrawl: [library.importPaths[1]], exclusionPatterns: [], includeHidden: false, - take: JOBS_LIBRARY_PAGINATION_SIZE, }); }); }); @@ -276,7 +269,7 @@ describe(LibraryService.name, () => { const library = factory.library(); mocks.library.get.mockResolvedValue(library); - mocks.storage.walk.mockImplementation(async function* generator() {}); + mocks.storage.crawl.mockResolvedValue([]); mocks.asset.getLibraryAssetCount.mockResolvedValue(1); mocks.asset.detectOfflineExternalAssets.mockResolvedValue({ numUpdatedRows: 1n }); @@ -294,7 +287,7 @@ describe(LibraryService.name, () => { const library = factory.library(); mocks.library.get.mockResolvedValue(library); - mocks.storage.walk.mockImplementation(async function* generator() {}); + mocks.storage.crawl.mockResolvedValue([]); mocks.asset.getLibraryAssetCount.mockResolvedValue(0); mocks.asset.detectOfflineExternalAssets.mockResolvedValue({ numUpdatedRows: 1n }); @@ -308,7 +301,7 @@ describe(LibraryService.name, () => { const library = factory.library({ importPaths: ['/foo', '/bar'] }); mocks.library.get.mockResolvedValue(library); - mocks.storage.walk.mockImplementation(async function* generator() {}); + mocks.storage.crawl.mockResolvedValue([]); mocks.library.streamAssetIds.mockReturnValue(makeStream([assetStub.external])); mocks.asset.getLibraryAssetCount.mockResolvedValue(1); mocks.asset.detectOfflineExternalAssets.mockResolvedValue({ numUpdatedRows: 0n }); diff --git a/server/src/services/library.service.ts b/server/src/services/library.service.ts index 841fa4743c..5c33dd8979 100644 --- a/server/src/services/library.service.ts +++ b/server/src/services/library.service.ts @@ -394,7 +394,16 @@ export class LibraryService extends BaseService { private async processEntity(filePath: string, ownerId: string, libraryId: string) { const assetPath = path.normalize(filePath); - const stat = await this.storageRepository.stat(assetPath); + + let stat: Stats; + try { + stat = await this.storageRepository.stat(assetPath); + } catch (error: any) { + if (error.code === 'ENOENT') { + this.logger.error(`File not found during import: ${assetPath} (original path: ${filePath})`); + } + throw error; + } return { ownerId, @@ -636,21 +645,25 @@ export class LibraryService extends BaseService { return JobStatus.Skipped; } - const pathsOnDisk = this.storageRepository.walk({ + this.logger.log(`Starting disk crawl of ${validImportPaths.length} import path(s) for library ${library.id}...`); + + const crawlStart = Date.now(); + + const pathsOnDisk = await this.storageRepository.crawl({ pathsToCrawl: validImportPaths, includeHidden: false, exclusionPatterns: library.exclusionPatterns, - take: JOBS_LIBRARY_PAGINATION_SIZE, }); let importCount = 0; - let crawlCount = 0; - this.logger.log(`Starting disk crawl of ${validImportPaths.length} import path(s) for library ${library.id}...`); + this.logger.log( + `Found ${pathsOnDisk.length} file(s) on disk in ${((Date.now() - crawlStart) / 1000).toFixed(2)}s, queuing for import...`, + ); - for await (const pathBatch of pathsOnDisk) { - crawlCount += pathBatch.length; - const paths = await this.assetRepository.filterNewExternalAssetPaths(library.id, pathBatch); + for (let i = 0; i < pathsOnDisk.length; i += JOBS_LIBRARY_PAGINATION_SIZE) { + const pathChunk = pathsOnDisk.slice(i, i + JOBS_LIBRARY_PAGINATION_SIZE); + const paths = await this.assetRepository.filterNewExternalAssetPaths(library.id, pathChunk); if (paths.length > 0) { importCount += paths.length; @@ -660,18 +673,18 @@ export class LibraryService extends BaseService { data: { libraryId: library.id, paths, - progressCounter: crawlCount, + progressCounter: i + pathChunk.length, }, }); } this.logger.log( - `Crawled ${crawlCount} file(s) so far: ${paths.length} of current batch of ${pathBatch.length} will be imported to library ${library.id}...`, + `Processed ${i + pathChunk.length} file(s): ${paths.length} of current batch of ${pathChunk.length} will be imported to library ${library.id}...`, ); } this.logger.log( - `Finished disk crawl, ${crawlCount} file(s) found on disk and queued ${importCount} file(s) for import into ${library.id}`, + `Finished disk crawl, ${pathsOnDisk.length} file(s) found on disk and queued ${importCount} file(s) for import into ${library.id}`, ); await this.libraryRepository.update(job.id, { refreshedAt: new Date() });