Compare commits

...

1 commit

Author: Jonathan Jogenfors
SHA1: 334ebbfe7d
Message: feat: spawn external crawler
Date: 2026-02-11 21:58:14 +01:00
6 changed files with 120 additions and 76 deletions

View File

@@ -73,6 +73,12 @@ RUN --mount=type=cache,id=pnpm-plugins,target=/buildcache/pnpm-store \
 FROM ghcr.io/immich-app/base-server-prod:202601131104@sha256:c649c5838b6348836d27db6d49cadbbc6157feae7a1a237180c3dec03577ba8f
+RUN apt-get update && \
+    apt-get install -y fd-find && \
+    ln -s /usr/bin/fdfind /usr/local/bin/fd && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 WORKDIR /usr/src/app
 ENV NODE_ENV=production \
     NVIDIA_DRIVER_CAPABILITIES=all \

View File

@@ -18,6 +18,12 @@ WORKDIR /tmp/create-dep-cache
 RUN pnpm fetch && rm -rf /tmp/create-dep-cache && chmod -R o+rw /buildcache
 WORKDIR /usr/src/app
+RUN apt-get update && \
+    apt-get install -y fd-find && \
+    ln -s /usr/bin/fdfind /usr/local/bin/fd && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 ENV PATH="${PATH}:/usr/src/app/server/bin:/usr/src/app/web/bin" \
     IMMICH_ENV=development \
     NVIDIA_DRIVER_CAPABILITIES=all \
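
Note: both images install Debian's fd-find package, whose binary ships as fdfind (the name fd is taken by another Debian package); hence the /usr/local/bin/fd symlink, while the server code below spawns 'fdfind' directly. A minimal sketch of a startup sanity check for the binary (hypothetical helper, not part of this diff):

import { spawnSync } from 'node:child_process';

// Fails fast if the fd binary the crawler depends on is missing from PATH.
// 'fdfind' is the Debian name; adjust if running outside these images.
export function assertFdAvailable(binary = 'fdfind'): void {
  const result = spawnSync(binary, ['--version'], { encoding: 'utf8' });
  if (result.error || result.status !== 0) {
    throw new Error(`${binary} not found on PATH; install the fd-find package`);
  }
}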

View File

@@ -58,10 +58,7 @@ export interface CrawlOptionsDto {
   pathsToCrawl: string[];
   includeHidden?: boolean;
   exclusionPatterns?: string[];
-}
-
-export interface WalkOptionsDto extends CrawlOptionsDto {
-  take: number;
+  take?: number;
 }

 export class ValidateLibraryDto {
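
With WalkOptionsDto gone (walk() is removed from StorageRepository below), the optional take folds into the remaining interface, which after this hunk (as reconstructed) reads:

export interface CrawlOptionsDto {
  pathsToCrawl: string[];
  includeHidden?: boolean;
  exclusionPatterns?: string[];
  take?: number;
}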

View File

@@ -1,13 +1,13 @@
 import { Injectable } from '@nestjs/common';
 import archiver from 'archiver';
 import chokidar, { ChokidarOptions } from 'chokidar';
-import { escapePath, glob, globStream } from 'fast-glob';
+import { spawn } from 'node:child_process';
 import { constants, createReadStream, createWriteStream, existsSync, mkdirSync, ReadOptionsWithBuffer } from 'node:fs';
 import fs from 'node:fs/promises';
 import path from 'node:path';
 import { PassThrough, Readable, Writable } from 'node:stream';
 import { createGunzip, createGzip } from 'node:zlib';
-import { CrawlOptionsDto, WalkOptionsDto } from 'src/dtos/library.dto';
+import { CrawlOptionsDto } from 'src/dtos/library.dto';
 import { LoggingRepository } from 'src/repositories/logging.repository';
 import { mimeTypes } from 'src/utils/mime-types';
@@ -198,52 +198,87 @@ export class StorageRepository {
     };
   }

-  crawl(crawlOptions: CrawlOptionsDto): Promise<string[]> {
+  async crawl(crawlOptions: CrawlOptionsDto): Promise<string[]> {
     const { pathsToCrawl, exclusionPatterns, includeHidden } = crawlOptions;
     if (pathsToCrawl.length === 0) {
-      return Promise.resolve([]);
+      return [];
     }

-    const globbedPaths = pathsToCrawl.map((path) => this.asGlob(path));
-
-    return glob(globbedPaths, {
-      absolute: true,
-      caseSensitiveMatch: false,
-      onlyFiles: true,
-      dot: includeHidden,
-      ignore: exclusionPatterns,
-    });
-  }
-
-  async *walk(walkOptions: WalkOptionsDto): AsyncGenerator<string[]> {
-    const { pathsToCrawl, exclusionPatterns, includeHidden } = walkOptions;
-    if (pathsToCrawl.length === 0) {
-      async function* emptyGenerator() {}
-      return emptyGenerator();
-    }
-
-    const globbedPaths = pathsToCrawl.map((path) => this.asGlob(path));
-    const stream = globStream(globbedPaths, {
-      absolute: true,
-      caseSensitiveMatch: false,
-      onlyFiles: true,
-      dot: includeHidden,
-      ignore: exclusionPatterns,
-    });
-
-    let batch: string[] = [];
-    for await (const value of stream) {
-      batch.push(value.toString());
-      if (batch.length === walkOptions.take) {
-        yield batch;
-        batch = [];
-      }
-    }
-
-    if (batch.length > 0) {
-      yield batch;
-    }
+    return new Promise((resolve, reject) => {
+      const args: string[] = [
+        '-t',
+        'f', // File type: only files
+        '-a', // Absolute paths
+        '-i', // Case insensitive
+        '.', // Search pattern: match all files
+      ];
+
+      if (includeHidden) {
+        args.push('-H');
+      }
+
+      for (const pattern of exclusionPatterns ?? []) {
+        args.push('-E', pattern);
+      }
+
+      const extensions = mimeTypes.getSupportedFileExtensions();
+      for (const ext of extensions) {
+        // fd expects extensions without the dot
+        args.push('-e', ext.replace(/^\./, ''));
+      }
+
+      args.push(...pathsToCrawl);
+
+      const fdfind = spawn('fdfind', args);
+      const files: string[] = [];
+      let buffer = '';
+      let stderr = '';
+
+      fdfind.stdout.on('data', (data) => {
+        buffer += data.toString();
+        const lines = buffer.split('\n');
+        // Keep the last partial line in the buffer
+        buffer = lines.pop() || '';
+        for (const line of lines) {
+          const trimmed = line.trim();
+          if (trimmed.length > 0) {
+            files.push(trimmed);
+          }
+        }
+      });
+
+      fdfind.stderr.on('data', (data) => {
+        stderr += data.toString();
+      });
+
+      fdfind.on('close', (code) => {
+        // Process any remaining data in the buffer
+        if (buffer.length > 0) {
+          const trimmed = buffer.trim();
+          if (trimmed.length > 0) {
+            files.push(trimmed);
+          }
+        }
+
+        if (code === 0) {
+          resolve(files);
+        } else {
+          reject(new Error(`fdfind process exited with code ${code}: ${stderr}`));
+        }
+      });
+
+      fdfind.on('error', (error) => {
+        if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
+          reject(
+            new Error('fdfind command not found. Please install fd-find: https://github.com/sharkdp/fd#installation'),
+          );
+        } else {
+          reject(new Error(`Failed to spawn fdfind: ${error.message}`));
+        }
+      });
+    });
   }

   watch(paths: string[], options: ChokidarOptions, events: Partial<WatchEvents>) {
@@ -257,10 +292,4 @@ export class StorageRepository {
     return () => watcher.close();
   }
-
-  private asGlob(pathToCrawl: string): string {
-    const escapedPath = escapePath(pathToCrawl).replaceAll('"', '["]').replaceAll("'", "[']").replaceAll('`', '[`]');
-    const extensions = `*{${mimeTypes.getSupportedFileExtensions().join(',')}}`;
-    return `${escapedPath}/**/${extensions}`;
-  }
 }
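
Note on the crawl() rewrite above: fd writes one path per line, but stdout 'data' chunks rarely end on a line boundary, so the handler keeps the trailing partial line in a buffer and flushes it on 'close'. A self-contained sketch of that line-buffering technique (generic helper with assumed names, not part of this diff):

import { spawn } from 'node:child_process';

// Line-buffered collection of a child process's stdout: the same technique
// crawl() uses above. Chunk boundaries rarely align with newlines, so the
// trailing partial line is carried over into the next chunk.
function collectLines(command: string, args: string[]): Promise<string[]> {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args);
    const lines: string[] = [];
    let buffer = '';

    child.stdout.on('data', (chunk) => {
      buffer += chunk.toString();
      const parts = buffer.split('\n');
      buffer = parts.pop() ?? ''; // keep the last (possibly partial) line
      for (const part of parts) {
        const trimmed = part.trim();
        if (trimmed.length > 0) {
          lines.push(trimmed);
        }
      }
    });

    child.on('close', (code) => {
      const rest = buffer.trim(); // flush any final unterminated line
      if (rest.length > 0) {
        lines.push(rest);
      }
      if (code === 0) {
        resolve(lines);
      } else {
        reject(new Error(`${command} exited with code ${code}`));
      }
    });

    child.on('error', reject);
  });
}

fd also supports -0/--print0 for NUL-separated output, which would be more robust if paths could ever contain newlines.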

View File

@@ -1,7 +1,6 @@
 import { BadRequestException } from '@nestjs/common';
 import { Stats } from 'node:fs';
 import { defaults, SystemConfig } from 'src/config';
-import { JOBS_LIBRARY_PAGINATION_SIZE } from 'src/constants';
 import { mapLibrary } from 'src/dtos/library.dto';
 import { AssetType, CronJob, ImmichWorker, JobName, JobStatus } from 'src/enum';
 import { LibraryService } from 'src/services/library.service';
@@ -14,10 +13,6 @@ import { factory, newUuid } from 'test/small.factory';
 import { makeStream, newTestService, ServiceMocks } from 'test/utils';
 import { vitest } from 'vitest';

-async function* mockWalk() {
-  yield await Promise.resolve(['/data/user1/photo.jpg']);
-}
-
 describe(LibraryService.name, () => {
   let sut: LibraryService;
@@ -165,7 +160,7 @@ describe(LibraryService.name, () => {
       const library = factory.library({ importPaths: ['/foo', '/bar'] });
       mocks.library.get.mockResolvedValue(library);
-      mocks.storage.walk.mockImplementation(mockWalk);
+      mocks.storage.crawl.mockResolvedValue(['/data/user1/photo.jpg']);
       mocks.storage.stat.mockResolvedValue({ isDirectory: () => true } as Stats);
       mocks.storage.checkFileExists.mockResolvedValue(true);
       mocks.asset.filterNewExternalAssetPaths.mockResolvedValue(['/data/user1/photo.jpg']);
@@ -206,11 +201,10 @@ describe(LibraryService.name, () => {
       await sut.handleQueueSyncFiles({ id: library.id });

-      expect(mocks.storage.walk).toHaveBeenCalledWith({
+      expect(mocks.storage.crawl).toHaveBeenCalledWith({
         pathsToCrawl: [library.importPaths[1]],
         exclusionPatterns: [],
         includeHidden: false,
-        take: JOBS_LIBRARY_PAGINATION_SIZE,
       });
     });
   });
@@ -220,7 +214,7 @@ describe(LibraryService.name, () => {
       const library = factory.library({ importPaths: ['/foo', '/bar'] });
       mocks.library.get.mockResolvedValue(library);
-      mocks.storage.walk.mockImplementation(mockWalk);
+      mocks.storage.crawl.mockResolvedValue(['/data/user1/photo.jpg']);
       mocks.storage.stat.mockResolvedValue({ isDirectory: () => true } as Stats);
       mocks.storage.checkFileExists.mockResolvedValue(true);
       mocks.asset.filterNewExternalAssetPaths.mockResolvedValue(['/data/user1/photo.jpg']);
@@ -262,11 +256,10 @@ describe(LibraryService.name, () => {
       await sut.handleQueueSyncFiles({ id: library.id });

-      expect(mocks.storage.walk).toHaveBeenCalledWith({
+      expect(mocks.storage.crawl).toHaveBeenCalledWith({
         pathsToCrawl: [library.importPaths[1]],
         exclusionPatterns: [],
         includeHidden: false,
-        take: JOBS_LIBRARY_PAGINATION_SIZE,
       });
     });
   });
@@ -276,7 +269,7 @@ describe(LibraryService.name, () => {
       const library = factory.library();
       mocks.library.get.mockResolvedValue(library);
-      mocks.storage.walk.mockImplementation(async function* generator() {});
+      mocks.storage.crawl.mockResolvedValue([]);
       mocks.asset.getLibraryAssetCount.mockResolvedValue(1);
       mocks.asset.detectOfflineExternalAssets.mockResolvedValue({ numUpdatedRows: 1n });
@@ -294,7 +287,7 @@ describe(LibraryService.name, () => {
       const library = factory.library();
       mocks.library.get.mockResolvedValue(library);
-      mocks.storage.walk.mockImplementation(async function* generator() {});
+      mocks.storage.crawl.mockResolvedValue([]);
       mocks.asset.getLibraryAssetCount.mockResolvedValue(0);
       mocks.asset.detectOfflineExternalAssets.mockResolvedValue({ numUpdatedRows: 1n });
@@ -308,7 +301,7 @@ describe(LibraryService.name, () => {
       const library = factory.library({ importPaths: ['/foo', '/bar'] });
       mocks.library.get.mockResolvedValue(library);
-      mocks.storage.walk.mockImplementation(async function* generator() {});
+      mocks.storage.crawl.mockResolvedValue([]);
       mocks.library.streamAssetIds.mockReturnValue(makeStream([assetStub.external]));
       mocks.asset.getLibraryAssetCount.mockResolvedValue(1);
       mocks.asset.detectOfflineExternalAssets.mockResolvedValue({ numUpdatedRows: 0n });
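
Since crawl() now resolves to a flat string[] instead of yielding batches like walk() did, the spec swaps generator mocks for plain resolved values. Side by side (vitest, mock names as in the spec):

// Before: walk() was an async generator, so its mock had to be one too.
mocks.storage.walk.mockImplementation(async function* () {
  yield ['/data/user1/photo.jpg'];
});

// After: crawl() returns Promise<string[]>, so a resolved value suffices.
mocks.storage.crawl.mockResolvedValue(['/data/user1/photo.jpg']);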

View File

@@ -394,7 +394,16 @@ export class LibraryService extends BaseService {
   private async processEntity(filePath: string, ownerId: string, libraryId: string) {
     const assetPath = path.normalize(filePath);

-    const stat = await this.storageRepository.stat(assetPath);
+    let stat: Stats;
+    try {
+      stat = await this.storageRepository.stat(assetPath);
+    } catch (error: any) {
+      if (error.code === 'ENOENT') {
+        this.logger.error(`File not found during import: ${assetPath} (original path: ${filePath})`);
+      }
+      throw error;
+    }

     return {
       ownerId,
@@ -636,21 +645,25 @@ export class LibraryService extends BaseService {
       return JobStatus.Skipped;
     }

-    const pathsOnDisk = this.storageRepository.walk({
+    this.logger.log(`Starting disk crawl of ${validImportPaths.length} import path(s) for library ${library.id}...`);
+    const crawlStart = Date.now();
+    const pathsOnDisk = await this.storageRepository.crawl({
       pathsToCrawl: validImportPaths,
       includeHidden: false,
       exclusionPatterns: library.exclusionPatterns,
-      take: JOBS_LIBRARY_PAGINATION_SIZE,
     });

     let importCount = 0;
-    let crawlCount = 0;

-    this.logger.log(`Starting disk crawl of ${validImportPaths.length} import path(s) for library ${library.id}...`);
+    this.logger.log(
+      `Found ${pathsOnDisk.length} file(s) on disk in ${((Date.now() - crawlStart) / 1000).toFixed(2)}s, queuing for import...`,
+    );

-    for await (const pathBatch of pathsOnDisk) {
-      crawlCount += pathBatch.length;
-      const paths = await this.assetRepository.filterNewExternalAssetPaths(library.id, pathBatch);
+    for (let i = 0; i < pathsOnDisk.length; i += JOBS_LIBRARY_PAGINATION_SIZE) {
+      const pathChunk = pathsOnDisk.slice(i, i + JOBS_LIBRARY_PAGINATION_SIZE);
+      const paths = await this.assetRepository.filterNewExternalAssetPaths(library.id, pathChunk);

       if (paths.length > 0) {
         importCount += paths.length;
@@ -660,18 +673,18 @@ export class LibraryService extends BaseService {
           data: {
             libraryId: library.id,
             paths,
-            progressCounter: crawlCount,
+            progressCounter: i + pathChunk.length,
           },
         });
       }

       this.logger.log(
-        `Crawled ${crawlCount} file(s) so far: ${paths.length} of current batch of ${pathBatch.length} will be imported to library ${library.id}...`,
+        `Processed ${i + pathChunk.length} file(s): ${paths.length} of current batch of ${pathChunk.length} will be imported to library ${library.id}...`,
       );
     }

     this.logger.log(
-      `Finished disk crawl, ${crawlCount} file(s) found on disk and queued ${importCount} file(s) for import into ${library.id}`,
+      `Finished disk crawl, ${pathsOnDisk.length} file(s) found on disk and queued ${importCount} file(s) for import into ${library.id}`,
     );

     await this.libraryRepository.update(job.id, { refreshedAt: new Date() });
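
With the streaming walk() gone, pagination moves to the caller: the full path list is held in memory and sliced into JOBS_LIBRARY_PAGINATION_SIZE chunks, trading the old bounded-memory batching for a simpler loop and an up-front file count for logging. The chunking pattern in isolation (generic helper, not part of the diff):

// Slice-based chunking as used by handleQueueSyncFiles above.
function* chunk<T>(items: T[], size: number): Generator<T[]> {
  for (let i = 0; i < items.length; i += size) {
    yield items.slice(i, i + size);
  }
}

// Usage, mirroring the service code:
// for (const pathChunk of chunk(pathsOnDisk, JOBS_LIBRARY_PAGINATION_SIZE)) { ... }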