fix(server): cjk migration (#24320)

* join string

* use pagination instead
pull/24330/head
Mert 2025-12-01 16:41:19 -05:00 committed by GitHub
parent 95c29a8aea
commit 7c19b0591f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 21 additions and 14 deletions

View File

@@ -3,21 +3,28 @@ import { tokenizeForSearch } from 'src/utils/database';
export async function up(db: Kysely<any>): Promise<void> { export async function up(db: Kysely<any>): Promise<void> {
await sql`truncate ${sql.table('ocr_search')}`.execute(db); await sql`truncate ${sql.table('ocr_search')}`.execute(db);
const batch = [];
for await (const { assetId, text } of db let lastAssetId: string | undefined;
while (true) {
const rows = await db
.selectFrom('asset_ocr') .selectFrom('asset_ocr')
.select(['assetId', sql<string>`string_agg(text, ' ')`.as('text')]) .select(['assetId', sql<string>`string_agg(text, ' ')`.as('text')])
.$if(lastAssetId !== undefined, (qb) => qb.where('assetId', '>', lastAssetId))
.groupBy('assetId') .groupBy('assetId')
.stream()) { .orderBy('assetId')
batch.push({ assetId, text: tokenizeForSearch(text) }); .limit(5000)
if (batch.length >= 5000) { .execute();
await db.insertInto('ocr_search').values(batch).execute();
batch.length = 0; if (rows.length === 0) {
} break;
} }
if (batch.length > 0) { await db
await db.insertInto('ocr_search').values(batch).execute(); .insertInto('ocr_search')
.values(rows.map(({ assetId, text }) => ({ assetId, text: tokenizeForSearch(text).join(' ') })))
.execute();
lastAssetId = rows.at(-1)!.assetId;
} }
} }