refactor: duplicate queries (#19136)

This commit is contained in:
Jason Rasmussen
2025-06-12 14:23:02 -04:00
committed by GitHub
parent 144cc8ab6d
commit 5cd186d3d4
14 changed files with 280 additions and 270 deletions

View File

@@ -5,7 +5,6 @@ import { InjectKysely } from 'nestjs-kysely';
import { Stack } from 'src/database';
import { AssetFiles, AssetJobStatus, Assets, DB, Exif } from 'src/db';
import { Chunked, ChunkedArray, DummyValue, GenerateSql } from 'src/decorators';
import { MapAsset } from 'src/dtos/asset-response.dto';
import { AssetFileType, AssetOrder, AssetStatus, AssetType, AssetVisibility } from 'src/enum';
import {
anyUuid,
@@ -29,13 +28,13 @@ import { globToSqlPattern } from 'src/utils/misc';
export type AssetStats = Record<AssetType, number>;
export interface AssetStatsOptions {
interface AssetStatsOptions {
isFavorite?: boolean;
isTrashed?: boolean;
visibility?: AssetVisibility;
}
export interface LivePhotoSearchOptions {
interface LivePhotoSearchOptions {
ownerId: string;
libraryId?: string | null;
livePhotoCID: string;
@@ -43,16 +42,12 @@ export interface LivePhotoSearchOptions {
type: AssetType;
}
export enum WithProperty {
SIDECAR = 'sidecar',
}
export enum TimeBucketSize {
DAY = 'DAY',
MONTH = 'MONTH',
}
export interface AssetBuilderOptions {
interface AssetBuilderOptions {
isFavorite?: boolean;
isTrashed?: boolean;
isDuplicate?: boolean;
@@ -81,43 +76,31 @@ export interface MonthDay {
month: number;
}
export interface AssetExploreFieldOptions {
interface AssetExploreFieldOptions {
maxFields: number;
minAssetsPerField: number;
}
export interface AssetFullSyncOptions {
interface AssetFullSyncOptions {
ownerId: string;
lastId?: string;
updatedUntil: Date;
limit: number;
}
export interface AssetDeltaSyncOptions {
interface AssetDeltaSyncOptions {
userIds: string[];
updatedAfter: Date;
limit: number;
}
export interface AssetUpdateDuplicateOptions {
targetDuplicateId: string | null;
assetIds: string[];
duplicateIds: string[];
}
export interface UpsertFileOptions {
assetId: string;
type: AssetFileType;
path: string;
}
export interface AssetGetByChecksumOptions {
interface AssetGetByChecksumOptions {
ownerId: string;
checksum: Buffer;
libraryId?: string;
}
export interface GetByIdsRelations {
interface GetByIdsRelations {
exifInfo?: boolean;
faces?: { person?: boolean; withDeleted?: boolean };
files?: boolean;
@@ -128,16 +111,6 @@ export interface GetByIdsRelations {
tags?: boolean;
}
export interface DuplicateGroup {
duplicateId: string;
assets: MapAsset[];
}
export interface DayOfYearAssets {
yearsAgo: number;
assets: MapAsset[];
}
@Injectable()
export class AssetRepository {
constructor(@InjectKysely() private db: Kysely<DB>) {}
@@ -418,19 +391,6 @@ export class AssetRepository {
await this.db.updateTable('assets').set(options).where('libraryId', '=', asUuid(libraryId)).execute();
}
@GenerateSql({
params: [{ targetDuplicateId: DummyValue.UUID, duplicateIds: [DummyValue.UUID], assetIds: [DummyValue.UUID] }],
})
async updateDuplicates(options: AssetUpdateDuplicateOptions): Promise<void> {
await this.db
.updateTable('assets')
.set({ duplicateId: options.targetDuplicateId })
.where((eb) =>
eb.or([eb('duplicateId', '=', anyUuid(options.duplicateIds)), eb('id', '=', anyUuid(options.assetIds))]),
)
.execute();
}
async update(asset: Updateable<Assets> & { id: string }) {
const value = omitBy(asset, isUndefined);
delete value.id;
@@ -696,58 +656,6 @@ export class AssetRepository {
return query.executeTakeFirstOrThrow();
}
@GenerateSql({ params: [DummyValue.UUID] })
getDuplicates(userId: string) {
return (
this.db
.with('duplicates', (qb) =>
qb
.selectFrom('assets')
.$call(withDefaultVisibility)
.leftJoinLateral(
(qb) =>
qb
.selectFrom('exif')
.selectAll('assets')
.select((eb) => eb.table('exif').as('exifInfo'))
.whereRef('exif.assetId', '=', 'assets.id')
.as('asset'),
(join) => join.onTrue(),
)
.select('assets.duplicateId')
.select((eb) =>
eb.fn.jsonAgg('asset').orderBy('assets.localDateTime', 'asc').$castTo<MapAsset[]>().as('assets'),
)
.where('assets.ownerId', '=', asUuid(userId))
.where('assets.duplicateId', 'is not', null)
.$narrowType<{ duplicateId: NotNull }>()
.where('assets.deletedAt', 'is', null)
.where('assets.stackId', 'is', null)
.groupBy('assets.duplicateId'),
)
.with('unique', (qb) =>
qb
.selectFrom('duplicates')
.select('duplicateId')
.where((eb) => eb(eb.fn('json_array_length', ['assets']), '=', 1)),
)
.with('removed_unique', (qb) =>
qb
.updateTable('assets')
.set({ duplicateId: null })
.from('unique')
.whereRef('assets.duplicateId', '=', 'unique.duplicateId'),
)
.selectFrom('duplicates')
.selectAll()
// TODO: compare with filtering by json_array_length > 1
.where(({ not, exists }) =>
not(exists((eb) => eb.selectFrom('unique').whereRef('unique.duplicateId', '=', 'duplicates.duplicateId'))),
)
.execute()
);
}
@GenerateSql({ params: [DummyValue.UUID, { minAssetsPerField: 5, maxFields: 12 }] })
async getAssetIdByCity(ownerId: string, { minAssetsPerField, maxFields }: AssetExploreFieldOptions) {
const items = await this.db

View File

@@ -0,0 +1,133 @@
import { Injectable } from '@nestjs/common';
import { Kysely, NotNull, sql } from 'kysely';
import { InjectKysely } from 'nestjs-kysely';
import { DB } from 'src/db';
import { DummyValue, GenerateSql } from 'src/decorators';
import { MapAsset } from 'src/dtos/asset-response.dto';
import { AssetType, VectorIndex } from 'src/enum';
import { probes } from 'src/repositories/database.repository';
import { anyUuid, asUuid, withDefaultVisibility } from 'src/utils/database';
interface DuplicateSearch {
assetId: string;
embedding: string;
maxDistance: number;
type: AssetType;
userIds: string[];
}
interface DuplicateMergeOptions {
targetId: string | null;
assetIds: string[];
sourceIds: string[];
}
@Injectable()
export class DuplicateRepository {
constructor(@InjectKysely() private db: Kysely<DB>) {}
@GenerateSql({ params: [DummyValue.UUID] })
getAll(userId: string) {
return (
this.db
.with('duplicates', (qb) =>
qb
.selectFrom('assets')
.$call(withDefaultVisibility)
.leftJoinLateral(
(qb) =>
qb
.selectFrom('exif')
.selectAll('assets')
.select((eb) => eb.table('exif').as('exifInfo'))
.whereRef('exif.assetId', '=', 'assets.id')
.as('asset'),
(join) => join.onTrue(),
)
.select('assets.duplicateId')
.select((eb) =>
eb.fn.jsonAgg('asset').orderBy('assets.localDateTime', 'asc').$castTo<MapAsset[]>().as('assets'),
)
.where('assets.ownerId', '=', asUuid(userId))
.where('assets.duplicateId', 'is not', null)
.$narrowType<{ duplicateId: NotNull }>()
.where('assets.deletedAt', 'is', null)
.where('assets.stackId', 'is', null)
.groupBy('assets.duplicateId'),
)
.with('unique', (qb) =>
qb
.selectFrom('duplicates')
.select('duplicateId')
.where((eb) => eb(eb.fn('json_array_length', ['assets']), '=', 1)),
)
.with('removed_unique', (qb) =>
qb
.updateTable('assets')
.set({ duplicateId: null })
.from('unique')
.whereRef('assets.duplicateId', '=', 'unique.duplicateId'),
)
.selectFrom('duplicates')
.selectAll()
// TODO: compare with filtering by json_array_length > 1
.where(({ not, exists }) =>
not(exists((eb) => eb.selectFrom('unique').whereRef('unique.duplicateId', '=', 'duplicates.duplicateId'))),
)
.execute()
);
}
@GenerateSql({
params: [
{
assetId: DummyValue.UUID,
embedding: DummyValue.VECTOR,
maxDistance: 0.6,
type: AssetType.IMAGE,
userIds: [DummyValue.UUID],
},
],
})
search({ assetId, embedding, maxDistance, type, userIds }: DuplicateSearch) {
return this.db.transaction().execute(async (trx) => {
await sql`set local vchordrq.probes = ${sql.lit(probes[VectorIndex.CLIP])}`.execute(trx);
return await trx
.with('cte', (qb) =>
qb
.selectFrom('assets')
.$call(withDefaultVisibility)
.select([
'assets.id as assetId',
'assets.duplicateId',
sql<number>`smart_search.embedding <=> ${embedding}`.as('distance'),
])
.innerJoin('smart_search', 'assets.id', 'smart_search.assetId')
.where('assets.ownerId', '=', anyUuid(userIds))
.where('assets.deletedAt', 'is', null)
.where('assets.type', '=', type)
.where('assets.id', '!=', asUuid(assetId))
.where('assets.stackId', 'is', null)
.orderBy('distance')
.limit(64),
)
.selectFrom('cte')
.selectAll()
.where('cte.distance', '<=', maxDistance as number)
.execute();
});
}
@GenerateSql({
params: [{ targetDuplicateId: DummyValue.UUID, duplicateIds: [DummyValue.UUID], assetIds: [DummyValue.UUID] }],
})
async merge(options: DuplicateMergeOptions): Promise<void> {
await this.db
.updateTable('assets')
.set({ duplicateId: options.targetId })
.where((eb) =>
eb.or([eb('duplicateId', '=', anyUuid(options.sourceIds)), eb('id', '=', anyUuid(options.assetIds))]),
)
.execute();
}
}

View File

@@ -11,6 +11,7 @@ import { CronRepository } from 'src/repositories/cron.repository';
import { CryptoRepository } from 'src/repositories/crypto.repository';
import { DatabaseRepository } from 'src/repositories/database.repository';
import { DownloadRepository } from 'src/repositories/download.repository';
import { DuplicateRepository } from 'src/repositories/duplicate.repository';
import { EmailRepository } from 'src/repositories/email.repository';
import { EventRepository } from 'src/repositories/event.repository';
import { JobRepository } from 'src/repositories/job.repository';
@@ -56,6 +57,7 @@ export const repositories = [
CryptoRepository,
DatabaseRepository,
DownloadRepository,
DuplicateRepository,
EmailRepository,
EventRepository,
JobRepository,

View File

@@ -7,7 +7,7 @@ import { DummyValue, GenerateSql } from 'src/decorators';
import { MapAsset } from 'src/dtos/asset-response.dto';
import { AssetStatus, AssetType, AssetVisibility, VectorIndex } from 'src/enum';
import { probes } from 'src/repositories/database.repository';
import { anyUuid, asUuid, searchAssetBuilder, withDefaultVisibility } from 'src/utils/database';
import { anyUuid, searchAssetBuilder } from 'src/utils/database';
import { paginationHelper } from 'src/utils/pagination';
import { isValidInteger } from 'src/validation';
@@ -135,14 +135,6 @@ export interface FaceEmbeddingSearch extends SearchEmbeddingOptions {
minBirthDate?: Date | null;
}
export interface AssetDuplicateSearch {
assetId: string;
embedding: string;
maxDistance: number;
type: AssetType;
userIds: string[];
}
export interface FaceSearchResult {
distance: number;
id: string;
@@ -275,46 +267,6 @@ export class SearchRepository {
});
}
@GenerateSql({
params: [
{
assetId: DummyValue.UUID,
embedding: DummyValue.VECTOR,
maxDistance: 0.6,
type: AssetType.IMAGE,
userIds: [DummyValue.UUID],
},
],
})
searchDuplicates({ assetId, embedding, maxDistance, type, userIds }: AssetDuplicateSearch) {
return this.db.transaction().execute(async (trx) => {
await sql`set local vchordrq.probes = ${sql.lit(probes[VectorIndex.CLIP])}`.execute(trx);
return await trx
.with('cte', (qb) =>
qb
.selectFrom('assets')
.$call(withDefaultVisibility)
.select([
'assets.id as assetId',
'assets.duplicateId',
sql<number>`smart_search.embedding <=> ${embedding}`.as('distance'),
])
.innerJoin('smart_search', 'assets.id', 'smart_search.assetId')
.where('assets.ownerId', '=', anyUuid(userIds))
.where('assets.deletedAt', 'is', null)
.where('assets.type', '=', type)
.where('assets.id', '!=', asUuid(assetId))
.where('assets.stackId', 'is', null)
.orderBy('distance')
.limit(64),
)
.selectFrom('cte')
.selectAll()
.where('cte.distance', '<=', maxDistance as number)
.execute();
});
}
@GenerateSql({
params: [
{