feat: vectorchord (#18042)

* wip

auto-detect available extensions

auto-recovery, fix reindexing check

use original image for ml

* set probes

* update image for sql checker

update images for gha

* cascade

* fix new instance

* accurate dummy vector

* simplify dummy

* preexisiting pg docs

* handle different db name

* maybe fix sql generation

* revert refreshfaces sql change

* redundant switch

* outdated message

* update docker compose files

* Update docs/docs/administration/postgres-standalone.md

Co-authored-by: Daniel Dietzler <36593685+danieldietzler@users.noreply.github.com>

* tighten range

* avoid always printing "vector reindexing complete"

* remove nesting

* use new images

* add vchord to unit tests

* debug e2e image

* mention 1.107.2 in startup error

* support new vchord versions

---------

Co-authored-by: Daniel Dietzler <36593685+danieldietzler@users.noreply.github.com>
This commit is contained in:
Mert
2025-05-20 09:36:43 -04:00
committed by GitHub
parent fe71894308
commit 0d773af6c3
35 changed files with 572 additions and 444 deletions

View File

@@ -5,9 +5,9 @@ import { randomUUID } from 'node:crypto';
import { DB, Exif } from 'src/db';
import { DummyValue, GenerateSql } from 'src/decorators';
import { MapAsset } from 'src/dtos/asset-response.dto';
import { AssetStatus, AssetType, AssetVisibility } from 'src/enum';
import { ConfigRepository } from 'src/repositories/config.repository';
import { anyUuid, asUuid, searchAssetBuilder, vectorIndexQuery } from 'src/utils/database';
import { AssetStatus, AssetType, AssetVisibility, VectorIndex } from 'src/enum';
import { probes } from 'src/repositories/database.repository';
import { anyUuid, asUuid, searchAssetBuilder } from 'src/utils/database';
import { paginationHelper } from 'src/utils/pagination';
import { isValidInteger } from 'src/validation';
@@ -168,10 +168,7 @@ export interface GetCameraMakesOptions {
@Injectable()
export class SearchRepository {
constructor(
@InjectKysely() private db: Kysely<DB>,
private configRepository: ConfigRepository,
) {}
constructor(@InjectKysely() private db: Kysely<DB>) {}
@GenerateSql({
params: [
@@ -236,19 +233,21 @@ export class SearchRepository {
},
],
})
async searchSmart(pagination: SearchPaginationOptions, options: SmartSearchOptions) {
searchSmart(pagination: SearchPaginationOptions, options: SmartSearchOptions) {
if (!isValidInteger(pagination.size, { min: 1, max: 1000 })) {
throw new Error(`Invalid value for 'size': ${pagination.size}`);
}
const items = await searchAssetBuilder(this.db, options)
.innerJoin('smart_search', 'assets.id', 'smart_search.assetId')
.orderBy(sql`smart_search.embedding <=> ${options.embedding}`)
.limit(pagination.size + 1)
.offset((pagination.page - 1) * pagination.size)
.execute();
return paginationHelper(items, pagination.size);
return this.db.transaction().execute(async (trx) => {
await sql`set local vchordrq.probes = ${sql.lit(probes[VectorIndex.CLIP])}`.execute(trx);
const items = await searchAssetBuilder(trx, options)
.innerJoin('smart_search', 'assets.id', 'smart_search.assetId')
.orderBy(sql`smart_search.embedding <=> ${options.embedding}`)
.limit(pagination.size + 1)
.offset((pagination.page - 1) * pagination.size)
.execute();
return paginationHelper(items, pagination.size);
});
}
@GenerateSql({
@@ -263,29 +262,32 @@ export class SearchRepository {
],
})
searchDuplicates({ assetId, embedding, maxDistance, type, userIds }: AssetDuplicateSearch) {
return this.db
.with('cte', (qb) =>
qb
.selectFrom('assets')
.select([
'assets.id as assetId',
'assets.duplicateId',
sql<number>`smart_search.embedding <=> ${embedding}`.as('distance'),
])
.innerJoin('smart_search', 'assets.id', 'smart_search.assetId')
.where('assets.ownerId', '=', anyUuid(userIds))
.where('assets.deletedAt', 'is', null)
.where('assets.visibility', '!=', AssetVisibility.HIDDEN)
.where('assets.type', '=', type)
.where('assets.id', '!=', asUuid(assetId))
.where('assets.stackId', 'is', null)
.orderBy(sql`smart_search.embedding <=> ${embedding}`)
.limit(64),
)
.selectFrom('cte')
.selectAll()
.where('cte.distance', '<=', maxDistance as number)
.execute();
return this.db.transaction().execute(async (trx) => {
await sql`set local vchordrq.probes = ${sql.lit(probes[VectorIndex.CLIP])}`.execute(trx);
return await trx
.with('cte', (qb) =>
qb
.selectFrom('assets')
.select([
'assets.id as assetId',
'assets.duplicateId',
sql<number>`smart_search.embedding <=> ${embedding}`.as('distance'),
])
.innerJoin('smart_search', 'assets.id', 'smart_search.assetId')
.where('assets.ownerId', '=', anyUuid(userIds))
.where('assets.deletedAt', 'is', null)
.where('assets.visibility', '!=', AssetVisibility.HIDDEN)
.where('assets.type', '=', type)
.where('assets.id', '!=', asUuid(assetId))
.where('assets.stackId', 'is', null)
.orderBy('distance')
.limit(64),
)
.selectFrom('cte')
.selectAll()
.where('cte.distance', '<=', maxDistance as number)
.execute();
});
}
@GenerateSql({
@@ -303,31 +305,36 @@ export class SearchRepository {
throw new Error(`Invalid value for 'numResults': ${numResults}`);
}
return this.db
.with('cte', (qb) =>
qb
.selectFrom('asset_faces')
.select([
'asset_faces.id',
'asset_faces.personId',
sql<number>`face_search.embedding <=> ${embedding}`.as('distance'),
])
.innerJoin('assets', 'assets.id', 'asset_faces.assetId')
.innerJoin('face_search', 'face_search.faceId', 'asset_faces.id')
.leftJoin('person', 'person.id', 'asset_faces.personId')
.where('assets.ownerId', '=', anyUuid(userIds))
.where('assets.deletedAt', 'is', null)
.$if(!!hasPerson, (qb) => qb.where('asset_faces.personId', 'is not', null))
.$if(!!minBirthDate, (qb) =>
qb.where((eb) => eb.or([eb('person.birthDate', 'is', null), eb('person.birthDate', '<=', minBirthDate!)])),
)
.orderBy(sql`face_search.embedding <=> ${embedding}`)
.limit(numResults),
)
.selectFrom('cte')
.selectAll()
.where('cte.distance', '<=', maxDistance)
.execute();
return this.db.transaction().execute(async (trx) => {
await sql`set local vchordrq.probes = ${sql.lit(probes[VectorIndex.FACE])}`.execute(trx);
return await trx
.with('cte', (qb) =>
qb
.selectFrom('asset_faces')
.select([
'asset_faces.id',
'asset_faces.personId',
sql<number>`face_search.embedding <=> ${embedding}`.as('distance'),
])
.innerJoin('assets', 'assets.id', 'asset_faces.assetId')
.innerJoin('face_search', 'face_search.faceId', 'asset_faces.id')
.leftJoin('person', 'person.id', 'asset_faces.personId')
.where('assets.ownerId', '=', anyUuid(userIds))
.where('assets.deletedAt', 'is', null)
.$if(!!hasPerson, (qb) => qb.where('asset_faces.personId', 'is not', null))
.$if(!!minBirthDate, (qb) =>
qb.where((eb) =>
eb.or([eb('person.birthDate', 'is', null), eb('person.birthDate', '<=', minBirthDate!)]),
),
)
.orderBy('distance')
.limit(numResults),
)
.selectFrom('cte')
.selectAll()
.where('cte.distance', '<=', maxDistance)
.execute();
});
}
@GenerateSql({ params: [DummyValue.STRING] })
@@ -416,56 +423,6 @@ export class SearchRepository {
.execute();
}
async getDimensionSize(): Promise<number> {
const { rows } = await sql<{ dimsize: number }>`
select atttypmod as dimsize
from pg_attribute f
join pg_class c ON c.oid = f.attrelid
where c.relkind = 'r'::char
and f.attnum > 0
and c.relname = 'smart_search'
and f.attname = 'embedding'
`.execute(this.db);
const dimSize = rows[0]['dimsize'];
if (!isValidInteger(dimSize, { min: 1, max: 2 ** 16 })) {
throw new Error(`Could not retrieve CLIP dimension size`);
}
return dimSize;
}
async setDimensionSize(dimSize: number): Promise<void> {
if (!isValidInteger(dimSize, { min: 1, max: 2 ** 16 })) {
throw new Error(`Invalid CLIP dimension size: ${dimSize}`);
}
// this is done in two transactions to handle concurrent writes
await this.db.transaction().execute(async (trx) => {
await sql`delete from ${sql.table('smart_search')}`.execute(trx);
await trx.schema.alterTable('smart_search').dropConstraint('dim_size_constraint').ifExists().execute();
await sql`alter table ${sql.table('smart_search')} add constraint dim_size_constraint check (array_length(embedding::real[], 1) = ${sql.lit(dimSize)})`.execute(
trx,
);
});
const vectorExtension = this.configRepository.getEnv().database.vectorExtension;
await this.db.transaction().execute(async (trx) => {
await sql`drop index if exists clip_index`.execute(trx);
await trx.schema
.alterTable('smart_search')
.alterColumn('embedding', (col) => col.setDataType(sql.raw(`vector(${dimSize})`)))
.execute();
await sql.raw(vectorIndexQuery({ vectorExtension, table: 'smart_search', indexName: 'clip_index' })).execute(trx);
await trx.schema.alterTable('smart_search').dropConstraint('dim_size_constraint').ifExists().execute();
});
await sql`vacuum analyze ${sql.table('smart_search')}`.execute(this.db);
}
async deleteAllSearchEmbeddings(): Promise<void> {
await sql`truncate ${sql.table('smart_search')}`.execute(this.db);
}
async getCountries(userIds: string[]): Promise<string[]> {
const res = await this.getExifField('country', userIds).execute();
return res.map((row) => row.country!);