feat(server): near-duplicate detection (#8228)

* duplicate detection job, entity, config

* queueing

* job panel, update api

* use embedding in db instead of fetching

* disable concurrency

* only queue visible assets

* handle multiple duplicateIds

* update concurrent queue check

* add provider

* add web placeholder, server endpoint, migration, various fixes

* update sql

* select embedding by default

* rename variable

* simplify

* remove separate entity, handle re-running with different threshold, set default back to 0.02

* fix tests

* add tests

* add index to entity

* formatting

* update asset mock

* fix `upsertJobStatus` signature

* update sql

* formatting

* default to 0.03

* optimize clustering

* use asset's `duplicateId` if present

* update sql

* update tests

* expose admin setting

* refactor

* formatting

* skip if ml is disabled

* debug trash e2e

* remove from web

* remove from sidebar

* test if ml is disabled

* update sql

* separate duplicate detection from clip in config, disable by default for now

* fix doc

* lower minimum `maxDistance`

* update api

* Add and Use Duplicate Detection Feature Flag (#9364)

* Add Duplicate Detection Flag

* Use Duplicate Detection Flag

* Attempt Fixes for Failing Checks

* lower minimum `maxDistance`

* fix tests

---------

Co-authored-by: mertalev <101130780+mertalev@users.noreply.github.com>

* chore: fixes and additions after rebase

* chore: update api (remove new Role enum)

* fix: left join smart search so getAll works without machine learning

* test: trash e2e go back to checking length of assets is zero

* chore: regen api after rebase

* test: fix tests after rebase

* redundant join

---------

Co-authored-by: Nicholas Flamy <30300649+NicholasFlamy@users.noreply.github.com>
Co-authored-by: Zack Pollard <zackpollard@ymail.com>
Co-authored-by: Zack Pollard <zack@futo.org>
This commit is contained in:
Mert
2024-05-16 13:08:37 -04:00
committed by GitHub
parent 673e97e71d
commit 64636c0618
61 changed files with 1254 additions and 61 deletions

View File

@@ -10,6 +10,8 @@ import { SmartSearchEntity } from 'src/entities/smart-search.entity';
import { DatabaseExtension } from 'src/interfaces/database.interface';
import { ILoggerRepository } from 'src/interfaces/logger.interface';
import {
AssetDuplicateResult,
AssetDuplicateSearch,
AssetSearchOptions,
FaceEmbeddingSearch,
FaceSearchResult,
@@ -145,6 +147,44 @@ export class SearchRepository implements ISearchRepository {
return results;
}
@GenerateSql({
params: [
{
embedding: Array.from({ length: 512 }, Math.random),
maxDistance: 0.6,
userIds: [DummyValue.UUID],
},
],
})
searchDuplicates({
assetId,
embedding,
maxDistance,
userIds,
}: AssetDuplicateSearch): Promise<AssetDuplicateResult[]> {
const cte = this.assetRepository.createQueryBuilder('asset');
cte
.select('search.assetId', 'assetId')
.addSelect('asset.duplicateId', 'duplicateId')
.addSelect(`search.embedding <=> :embedding`, 'distance')
.innerJoin('asset.smartSearch', 'search')
.where('asset.ownerId IN (:...userIds )')
.andWhere('asset.id != :assetId')
.andWhere('asset.isVisible = :isVisible')
.orderBy('search.embedding <=> :embedding')
.limit(64)
.setParameters({ assetId, embedding: asVector(embedding), isVisible: true, userIds });
const builder = this.assetRepository.manager
.createQueryBuilder()
.addCommonTableExpression(cte, 'cte')
.from('cte', 'res')
.select('res.*')
.where('res.distance <= :maxDistance', { maxDistance });
return builder.getRawMany() as any as Promise<AssetDuplicateResult[]>;
}
@GenerateSql({
params: [
{