Compare commits

...

6 Commits

Author SHA1 Message Date
Jonathan Jogenfors
f7c0e0712f Merge branch 'main' of https://github.com/immich-app/immich into feat/crawl-wrapper 2026-02-14 23:45:03 +01:00
Jonathan Jogenfors
3f93169301 Merge branch 'main' of https://github.com/immich-app/immich into feat/crawl-wrapper 2026-02-14 22:38:07 +01:00
Jonathan Jogenfors
8937fe0133 feat: crawl using ignore 2026-02-13 22:51:40 +01:00
Jonathan Jogenfors
0a055d0fc7 Merge branch 'feat/fd-glob' of https://github.com/immich-app/immich into feat/crawl-wrapper 2026-02-11 21:58:54 +01:00
Jonathan Jogenfors
334ebbfe7d feat: spawn external crawler 2026-02-11 21:58:14 +01:00
Jonathan Jogenfors
57dd127162 feat: spawn external crawler 2026-02-11 12:41:31 +01:00
9 changed files with 138 additions and 157 deletions

9
pnpm-lock.yaml generated
View File

@@ -343,6 +343,9 @@ importers:
'@extism/extism':
specifier: 2.0.0-rc13
version: 2.0.0-rc13
'@immich/walkrs':
specifier: 0.0.0
version: 0.0.0
'@nestjs/bullmq':
specifier: ^11.0.1
version: 11.0.4(@nestjs/common@11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2))(@nestjs/core@11.1.13)(bullmq@5.67.3)
@@ -3023,6 +3026,10 @@ packages:
peerDependencies:
svelte: ^5.0.0
'@immich/walkrs@0.0.0':
resolution: {integrity: sha512-wx2J/qhZjzaHmu9QXkmh8ECfrJog7KrD7p50xva9bqiYMtr37tbOAMDeXTx2mhDDzpPV8B2YlR0wq5O28tojuw==}
engines: {pnpm: '>=10.0.0'}
'@inquirer/ansi@1.0.2':
resolution: {integrity: sha512-S8qNSZiYzFd0wAcyG5AXCvUHC5Sr7xpZ9wZ2py9XR88jUz8wooStVx5M6dRzczbBWjic9NP7+rY0Xi7qqK/aMQ==}
engines: {node: '>=18'}
@@ -14977,6 +14984,8 @@ snapshots:
transitivePeerDependencies:
- '@sveltejs/kit'
'@immich/walkrs@0.0.0': {}
'@inquirer/ansi@1.0.2': {}
'@inquirer/checkbox@4.3.2(@types/node@24.10.13)':

View File

@@ -14,6 +14,9 @@ COPY ./package* ./pnpm* .pnpmfile.cjs /tmp/create-dep-cache/
COPY ./web/package* ./web/pnpm* /tmp/create-dep-cache/web/
COPY ./server/package* ./server/pnpm* /tmp/create-dep-cache/server/
COPY ./open-api/typescript-sdk/package* ./open-api/typescript-sdk/pnpm* /tmp/create-dep-cache/open-api/typescript-sdk/
COPY --from=walkrs ./package*.json /tmp/walkrs/
COPY --from=walkrs ./Cargo.toml /tmp/walkrs/
COPY --from=walkrs ./src /tmp/walkrs/src/
WORKDIR /tmp/create-dep-cache
RUN pnpm fetch && rm -rf /tmp/create-dep-cache && chmod -R o+rw /buildcache
WORKDIR /usr/src/app

View File

@@ -35,6 +35,7 @@
},
"dependencies": {
"@extism/extism": "2.0.0-rc13",
"@immich/walkrs": "0.0.0",
"@nestjs/bullmq": "^11.0.1",
"@nestjs/common": "^11.0.4",
"@nestjs/core": "^11.0.4",

View File

@@ -54,16 +54,12 @@ export class UpdateLibraryDto {
exclusionPatterns?: string[];
}
export interface CrawlOptionsDto {
pathsToCrawl: string[];
/**
 * Options accepted by `StorageRepository.walk` when scanning the file system for supported files.
 */
export interface WalkOptionsDto {
// Root paths to walk. `walk` returns an empty list when this is empty, and runs each entry
// through `path.resolve` before handing it to the walker.
pathsToWalk: string[];
// Forwarded to the walker as its `includeHidden` flag; `walk` treats an omitted value as `false`.
includeHidden?: boolean;
// Patterns forwarded unchanged to the walker so that matching files or folders are left out.
exclusionPatterns?: string[];
}
export interface WalkOptionsDto extends CrawlOptionsDto {
take: number;
}
export class ValidateLibraryDto {
@ApiPropertyOptional({ description: 'Import paths to validate (max 128)' })
@Optional()

View File

@@ -1,13 +1,13 @@
import { walk } from '@immich/walkrs';
import { Injectable } from '@nestjs/common';
import archiver from 'archiver';
import chokidar, { ChokidarOptions } from 'chokidar';
import { escapePath, glob, globStream } from 'fast-glob';
import { constants, createReadStream, createWriteStream, existsSync, mkdirSync, ReadOptionsWithBuffer } from 'node:fs';
import fs from 'node:fs/promises';
import path from 'node:path';
import { PassThrough, Readable, Writable } from 'node:stream';
import { createGunzip, createGzip } from 'node:zlib';
import { CrawlOptionsDto, WalkOptionsDto } from 'src/dtos/library.dto';
import { WalkOptionsDto } from 'src/dtos/library.dto';
import { LoggingRepository } from 'src/repositories/logging.repository';
import { mimeTypes } from 'src/utils/mime-types';
@@ -198,54 +198,22 @@ export class StorageRepository {
};
}
crawl(crawlOptions: CrawlOptionsDto): Promise<string[]> {
const { pathsToCrawl, exclusionPatterns, includeHidden } = crawlOptions;
if (pathsToCrawl.length === 0) {
return Promise.resolve([]);
async walk(walkOptions: WalkOptionsDto): Promise<string[]> {
const { pathsToWalk, exclusionPatterns, includeHidden } = walkOptions;
if (pathsToWalk.length === 0) {
return [];
}
const globbedPaths = pathsToCrawl.map((path) => this.asGlob(path));
const extensions = mimeTypes.getSupportedFileExtensions().map((ext) => ext.toLowerCase());
return glob(globbedPaths, {
absolute: true,
caseSensitiveMatch: false,
onlyFiles: true,
dot: includeHidden,
ignore: exclusionPatterns,
return await walk({
paths: pathsToWalk.map((p) => path.resolve(p)),
includeHidden: includeHidden ?? false,
exclusionPatterns,
extensions,
});
}
async *walk(walkOptions: WalkOptionsDto): AsyncGenerator<string[]> {
const { pathsToCrawl, exclusionPatterns, includeHidden } = walkOptions;
if (pathsToCrawl.length === 0) {
async function* emptyGenerator() {}
return emptyGenerator();
}
const globbedPaths = pathsToCrawl.map((path) => this.asGlob(path));
const stream = globStream(globbedPaths, {
absolute: true,
caseSensitiveMatch: false,
onlyFiles: true,
dot: includeHidden,
ignore: exclusionPatterns,
});
let batch: string[] = [];
for await (const value of stream) {
batch.push(value.toString());
if (batch.length === walkOptions.take) {
yield batch;
batch = [];
}
}
if (batch.length > 0) {
yield batch;
}
}
watch(paths: string[], options: ChokidarOptions, events: Partial<WatchEvents>) {
const watcher = chokidar.watch(paths, options);
@@ -257,10 +225,4 @@ export class StorageRepository {
return () => watcher.close();
}
private asGlob(pathToCrawl: string): string {
const escapedPath = escapePath(pathToCrawl).replaceAll('"', '["]').replaceAll("'", "[']").replaceAll('`', '[`]');
const extensions = `*{${mimeTypes.getSupportedFileExtensions().join(',')}}`;
return `${escapedPath}/**/${extensions}`;
}
}

View File

@@ -1,7 +1,6 @@
import { BadRequestException } from '@nestjs/common';
import { Stats } from 'node:fs';
import { defaults, SystemConfig } from 'src/config';
import { JOBS_LIBRARY_PAGINATION_SIZE } from 'src/constants';
import { mapLibrary } from 'src/dtos/library.dto';
import { AssetType, CronJob, ImmichWorker, JobName, JobStatus } from 'src/enum';
import { LibraryService } from 'src/services/library.service';
@@ -14,10 +13,6 @@ import { factory, newDate, newUuid } from 'test/small.factory';
import { makeStream, newTestService, ServiceMocks } from 'test/utils';
import { vitest } from 'vitest';
async function* mockWalk() {
yield await Promise.resolve(['/data/user1/photo.jpg']);
}
describe(LibraryService.name, () => {
let sut: LibraryService;
@@ -165,7 +160,7 @@ describe(LibraryService.name, () => {
const library = factory.library({ importPaths: ['/foo', '/bar'] });
mocks.library.get.mockResolvedValue(library);
mocks.storage.walk.mockImplementation(mockWalk);
mocks.storage.walk.mockResolvedValue(['/data/user1/photo.jpg']);
mocks.storage.stat.mockResolvedValue({ isDirectory: () => true } as Stats);
mocks.storage.checkFileExists.mockResolvedValue(true);
mocks.asset.filterNewExternalAssetPaths.mockResolvedValue(['/data/user1/photo.jpg']);
@@ -201,16 +196,16 @@ describe(LibraryService.name, () => {
});
mocks.storage.checkFileExists.mockResolvedValue(true);
mocks.storage.walk.mockResolvedValue(['/data/user1/photo.jpg']);
mocks.library.get.mockResolvedValue(library);
mocks.asset.filterNewExternalAssetPaths.mockResolvedValue(['/data/user1/photo.jpg']);
await sut.handleQueueSyncFiles({ id: library.id });
expect(mocks.storage.walk).toHaveBeenCalledWith({
pathsToCrawl: [library.importPaths[1]],
pathsToWalk: [library.importPaths[1]],
exclusionPatterns: [],
includeHidden: false,
take: JOBS_LIBRARY_PAGINATION_SIZE,
});
});
});
@@ -220,7 +215,7 @@ describe(LibraryService.name, () => {
const library = factory.library({ importPaths: ['/foo', '/bar'] });
mocks.library.get.mockResolvedValue(library);
mocks.storage.walk.mockImplementation(mockWalk);
mocks.storage.walk.mockResolvedValue(['/data/user1/photo.jpg']);
mocks.storage.stat.mockResolvedValue({ isDirectory: () => true } as Stats);
mocks.storage.checkFileExists.mockResolvedValue(true);
mocks.asset.filterNewExternalAssetPaths.mockResolvedValue(['/data/user1/photo.jpg']);
@@ -242,33 +237,6 @@ describe(LibraryService.name, () => {
await expect(sut.handleQueueSyncFiles({ id: library.id })).resolves.toBe(JobStatus.Skipped);
});
it('should ignore import paths that do not exist', async () => {
const library = factory.library({ importPaths: ['/foo', '/bar'] });
mocks.storage.stat.mockImplementation((path): Promise<Stats> => {
if (path === library.importPaths[0]) {
const error = { code: 'ENOENT' } as any;
throw error;
}
return Promise.resolve({
isDirectory: () => true,
} as Stats);
});
mocks.storage.checkFileExists.mockResolvedValue(true);
mocks.library.get.mockResolvedValue(library);
await sut.handleQueueSyncFiles({ id: library.id });
expect(mocks.storage.walk).toHaveBeenCalledWith({
pathsToCrawl: [library.importPaths[1]],
exclusionPatterns: [],
includeHidden: false,
take: JOBS_LIBRARY_PAGINATION_SIZE,
});
});
});
describe('handleQueueSyncAssets', () => {
@@ -276,7 +244,7 @@ describe(LibraryService.name, () => {
const library = factory.library();
mocks.library.get.mockResolvedValue(library);
mocks.storage.walk.mockImplementation(async function* generator() {});
mocks.storage.walk.mockResolvedValue([]);
mocks.asset.getLibraryAssetCount.mockResolvedValue(1);
mocks.asset.detectOfflineExternalAssets.mockResolvedValue({ numUpdatedRows: 1n });
@@ -294,7 +262,7 @@ describe(LibraryService.name, () => {
const library = factory.library();
mocks.library.get.mockResolvedValue(library);
mocks.storage.walk.mockImplementation(async function* generator() {});
mocks.storage.walk.mockResolvedValue([]);
mocks.asset.getLibraryAssetCount.mockResolvedValue(0);
mocks.asset.detectOfflineExternalAssets.mockResolvedValue({ numUpdatedRows: 1n });
@@ -309,7 +277,7 @@ describe(LibraryService.name, () => {
const asset = AssetFactory.create({ libraryId: library.id, isExternal: true });
mocks.library.get.mockResolvedValue(library);
mocks.storage.walk.mockImplementation(async function* generator() {});
mocks.storage.walk.mockResolvedValue([]);
mocks.library.streamAssetIds.mockReturnValue(makeStream([asset]));
mocks.asset.getLibraryAssetCount.mockResolvedValue(1);
mocks.asset.detectOfflineExternalAssets.mockResolvedValue({ numUpdatedRows: 0n });

View File

@@ -394,7 +394,16 @@ export class LibraryService extends BaseService {
private async processEntity(filePath: string, ownerId: string, libraryId: string) {
const assetPath = path.normalize(filePath);
const stat = await this.storageRepository.stat(assetPath);
let stat: Stats;
try {
stat = await this.storageRepository.stat(assetPath);
} catch (error: any) {
if (error.code === 'ENOENT') {
this.logger.error(`File not found during import: ${assetPath} (original path: ${filePath})`);
}
throw error;
}
return {
ownerId,
@@ -636,21 +645,25 @@ export class LibraryService extends BaseService {
return JobStatus.Skipped;
}
const pathsOnDisk = this.storageRepository.walk({
pathsToCrawl: validImportPaths,
includeHidden: false,
exclusionPatterns: library.exclusionPatterns,
take: JOBS_LIBRARY_PAGINATION_SIZE,
});
let importCount = 0;
let crawlCount = 0;
this.logger.log(`Starting disk crawl of ${validImportPaths.length} import path(s) for library ${library.id}...`);
for await (const pathBatch of pathsOnDisk) {
crawlCount += pathBatch.length;
const paths = await this.assetRepository.filterNewExternalAssetPaths(library.id, pathBatch);
const crawlStart = Date.now();
const pathsOnDisk = await this.storageRepository.walk({
pathsToWalk: validImportPaths,
includeHidden: false,
exclusionPatterns: library.exclusionPatterns,
});
this.logger.log(
`Found ${pathsOnDisk.length} file(s) on disk in ${((Date.now() - crawlStart) / 1000).toFixed(2)}s, queuing for import...`,
);
let importCount = 0;
for (let i = 0; i < pathsOnDisk.length; i += JOBS_LIBRARY_PAGINATION_SIZE) {
const pathChunk = pathsOnDisk.slice(i, i + JOBS_LIBRARY_PAGINATION_SIZE);
const paths = await this.assetRepository.filterNewExternalAssetPaths(library.id, pathChunk);
if (paths.length > 0) {
importCount += paths.length;
@@ -660,18 +673,18 @@ export class LibraryService extends BaseService {
data: {
libraryId: library.id,
paths,
progressCounter: crawlCount,
progressCounter: i + pathChunk.length,
},
});
}
this.logger.log(
`Crawled ${crawlCount} file(s) so far: ${paths.length} of current batch of ${pathBatch.length} will be imported to library ${library.id}...`,
`Processed ${i + pathChunk.length} file(s): ${paths.length} of current batch of ${pathChunk.length} will be imported to library ${library.id}...`,
);
}
this.logger.log(
`Finished disk crawl, ${crawlCount} file(s) found on disk and queued ${importCount} file(s) for import into ${library.id}`,
`Finished disk crawl, ${pathsOnDisk.length} file(s) found on disk and queued ${importCount} file(s) for import into ${library.id}`,
);
await this.libraryRepository.update(job.id, { refreshedAt: new Date() });

View File

@@ -1,29 +1,45 @@
import mockfs from 'mock-fs';
import { CrawlOptionsDto } from 'src/dtos/library.dto';
import { Kysely } from 'kysely';
import fs from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
import { WalkOptionsDto } from 'src/dtos/library.dto';
import { LoggingRepository } from 'src/repositories/logging.repository';
import { StorageRepository } from 'src/repositories/storage.repository';
import { automock } from 'test/utils';
import { DB } from 'src/schema';
import { BaseService } from 'src/services/base.service';
import { newMediumService } from 'test/medium.factory';
import { getKyselyDB } from 'test/utils';
let defaultDatabase: Kysely<DB>;
interface Test {
test: string;
options: CrawlOptionsDto;
options: WalkOptionsDto;
files: Record<string, boolean>;
}
const cwd = process.cwd();
/**
 * Creates an empty file under `basePath` for every entry in `files`.
 *
 * A single leading `/` on an entry is dropped so that absolute-looking fixture paths are
 * placed inside `basePath`. Missing parent directories are created first. All files are
 * written concurrently and the returned promise settles once every write has finished.
 *
 * @param basePath - directory under which the files are created
 * @param files - file paths, relative to `basePath` (a leading `/` is tolerated)
 */
const createTestFiles = async (basePath: string, files: string[]) => {
  const writeEmptyFile = async (relativeFile: string) => {
    const target = path.join(basePath, relativeFile.replace(/^\//, ''));
    const parentDirectory = path.dirname(target);
    // `recursive: true` makes this a no-op when the directory already exists.
    await fs.mkdir(parentDirectory, { recursive: true });
    await fs.writeFile(target, '');
  };

  await Promise.all(files.map((file) => writeEmptyFile(file)));
};
const tests: Test[] = [
{
test: 'should return empty when crawling an empty path list',
options: {
pathsToCrawl: [],
pathsToWalk: [],
},
files: {},
},
{
test: 'should crawl a single path',
options: {
pathsToCrawl: ['/photos/'],
pathsToWalk: ['/photos/'],
},
files: {
'/photos/image.jpg': true,
@@ -32,7 +48,7 @@ const tests: Test[] = [
{
test: 'should exclude by file extension',
options: {
pathsToCrawl: ['/photos/'],
pathsToWalk: ['/photos/'],
exclusionPatterns: ['**/*.tif'],
},
files: {
@@ -43,7 +59,7 @@ const tests: Test[] = [
{
test: 'should exclude by file extension without case sensitivity',
options: {
pathsToCrawl: ['/photos/'],
pathsToWalk: ['/photos/'],
exclusionPatterns: ['**/*.TIF'],
},
files: {
@@ -54,7 +70,7 @@ const tests: Test[] = [
{
test: 'should exclude by folder',
options: {
pathsToCrawl: ['/photos/'],
pathsToWalk: ['/photos/'],
exclusionPatterns: ['**/raw/**'],
},
files: {
@@ -68,7 +84,7 @@ const tests: Test[] = [
{
test: 'should crawl multiple paths',
options: {
pathsToCrawl: ['/photos/', '/images/', '/albums/'],
pathsToWalk: ['/photos/', '/images/', '/albums/'],
},
files: {
'/photos/image1.jpg': true,
@@ -79,7 +95,7 @@ const tests: Test[] = [
{
test: 'should crawl a single path without trailing slash',
options: {
pathsToCrawl: ['/photos'],
pathsToWalk: ['/photos'],
},
files: {
'/photos/image.jpg': true,
@@ -88,7 +104,7 @@ const tests: Test[] = [
{
test: 'should crawl a single path',
options: {
pathsToCrawl: ['/photos/'],
pathsToWalk: ['/photos/'],
},
files: {
'/photos/image.jpg': true,
@@ -100,7 +116,7 @@ const tests: Test[] = [
{
test: 'should filter file extensions',
options: {
pathsToCrawl: ['/photos/'],
pathsToWalk: ['/photos/'],
},
files: {
'/photos/image.jpg': true,
@@ -111,7 +127,7 @@ const tests: Test[] = [
{
test: 'should include photo and video extensions',
options: {
pathsToCrawl: ['/photos/', '/videos/'],
pathsToWalk: ['/photos/', '/videos/'],
},
files: {
'/photos/image.jpg': true,
@@ -133,7 +149,7 @@ const tests: Test[] = [
{
test: 'should check file extensions without case sensitivity',
options: {
pathsToCrawl: ['/photos/'],
pathsToWalk: ['/photos/'],
},
files: {
'/photos/image.jpg': true,
@@ -150,28 +166,17 @@ const tests: Test[] = [
{
test: 'should normalize the path',
options: {
pathsToCrawl: ['/photos/1/../2'],
pathsToWalk: ['/photos/1/../2'],
},
files: {
'/photos/1/image.jpg': false,
'/photos/2/image.jpg': true,
},
},
{
test: 'should return absolute paths',
options: {
pathsToCrawl: ['photos'],
},
files: {
[`${cwd}/photos/1.jpg`]: true,
[`${cwd}/photos/2.jpg`]: true,
[`/photos/3.jpg`]: false,
},
},
{
test: 'should support special characters in paths',
options: {
pathsToCrawl: ['/photos (new)'],
pathsToWalk: ['/photos (new)'],
},
files: {
['/photos (new)/1.jpg']: true,
@@ -179,29 +184,54 @@ const tests: Test[] = [
},
];
/**
 * Builds the system under test for the storage repository spec.
 *
 * Creates a medium-test service context around `BaseService`, passing `LoggingRepository`
 * in the `mock` list and nothing in the `real` list, then resolves the `StorageRepository`
 * instance from that context.
 *
 * @param db - database handle to use; falls back to the shared `defaultDatabase` when omitted
 * @returns an object whose `sut` property is the resolved `StorageRepository`
 */
const setup = (db?: Kysely<DB>) => {
  const database = db || defaultDatabase;
  const service = newMediumService(BaseService, {
    database,
    real: [],
    mock: [LoggingRepository],
  });
  const sut = service.ctx.get(StorageRepository);

  return { sut };
};
beforeAll(async () => {
defaultDatabase = await getKyselyDB();
});
describe(StorageRepository.name, () => {
let sut: StorageRepository;
beforeEach(() => {
// eslint-disable-next-line no-sparse-arrays
sut = new StorageRepository(automock(LoggingRepository, { args: [, { getEnv: () => ({}) }], strict: false }));
});
afterEach(() => {
mockfs.restore();
({ sut } = setup());
});
describe('crawl', () => {
for (const { test, options, files } of tests) {
it(test, async () => {
mockfs(Object.fromEntries(Object.keys(files).map((file) => [file, ''])));
describe(test, () => {
const fileList = Object.keys(files);
let tempDir: string;
const actual = await sut.crawl(options);
const expected = Object.entries(files)
.filter((entry) => entry[1])
.map(([file]) => file);
beforeEach(async () => {
tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'immich-storage-test-'));
await createTestFiles(tempDir, fileList);
});
expect(actual.toSorted()).toEqual(expected.toSorted());
afterEach(async () => {
await fs.rm(tempDir, { recursive: true, force: true });
});
it('returns expected files', async () => {
const adjustedOptions = {
...options,
pathsToWalk: options.pathsToWalk.map((p) => path.join(tempDir, p.replace(/^\//, ''))),
};
const actual = await sut.walk(adjustedOptions);
const expected = Object.entries(files)
.filter((entry) => entry[1])
.map(([file]) => path.join(tempDir, file.replace(/^\//, '')));
expect(actual.toSorted()).toEqual(expected.toSorted());
});
});
}
});

View File

@@ -68,8 +68,7 @@ export const newStorageRepositoryMock = (): Mocked<RepositoryInterface<StorageRe
readdir: vitest.fn(),
realpath: vitest.fn().mockImplementation((filepath: string) => Promise.resolve(filepath)),
stat: vitest.fn(),
crawl: vitest.fn(),
walk: vitest.fn().mockImplementation(async function* () {}),
walk: vitest.fn(),
rename: vitest.fn(),
copyFile: vitest.fn(),
utimes: vitest.fn(),