Skip to content

Commit 133f2a8

Browse files
authored
Send lk.segment_id attribute in transcription events so non-final transcriptions can be deduplicated client-side (#580)
1 parent 20f5ef4 commit 133f2a8

File tree

4 files changed

+57
-2
lines changed

4 files changed

+57
-2
lines changed

.changeset/heavy-cherries-yell.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@livekit/agents': patch
3+
---
4+
5+
add `lk.segment_id` to all generated transcription events

agents/src/constants.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
export const ATTRIBUTE_TRANSCRIPTION_TRACK_ID = 'lk.transcribed_track_id';
55
export const ATTRIBUTE_TRANSCRIPTION_FINAL = 'lk.transcription_final';
66
export const TOPIC_TRANSCRIPTION = 'lk.transcription';
7+
export const ATTRIBUTE_SEGMENT_ID = 'lk.segment_id';
78
export const TOPIC_CHAT = 'lk.chat';

agents/src/multimodal/multimodal_agent.ts

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ import {
1818
TrackPublishOptions,
1919
TrackSource,
2020
} from '@livekit/rtc-node';
21+
import { randomUUID } from 'node:crypto';
2122
import { EventEmitter } from 'node:events';
2223
import { AudioByteStream } from '../audio.js';
2324
import {
25+
ATTRIBUTE_SEGMENT_ID,
2426
ATTRIBUTE_TRANSCRIPTION_FINAL,
2527
ATTRIBUTE_TRANSCRIPTION_TRACK_ID,
2628
TOPIC_TRANSCRIPTION,
@@ -72,6 +74,7 @@ export class MultimodalAgent extends EventEmitter {
7274

7375
#textResponseRetries = 0;
7476
#maxTextResponseRetries: number;
77+
#transcriptionId?: string;
7578

7679
constructor({
7780
model,
@@ -257,13 +260,20 @@ export class MultimodalAgent extends EventEmitter {
257260

258261
const synchronizer = new TextAudioSynchronizer(defaultTextSyncOptions);
259262
synchronizer.on('textUpdated', async (text) => {
263+
if (!this.#transcriptionId) {
264+
this.#transcriptionId = randomUUID();
265+
}
260266
await this.#publishTranscription(
261267
this.room!.localParticipant!.identity!,
262268
this.#getLocalTrackSid()!,
263269
text.text,
264270
text.final,
265271
text.id,
272+
this.#transcriptionId,
266273
);
274+
if (text.final) {
275+
this.#transcriptionId = undefined;
276+
}
267277
});
268278

269279
const handle = this.#agentPlayout?.play(
@@ -312,7 +322,17 @@ export class MultimodalAgent extends EventEmitter {
312322
const participantIdentity = this.linkedParticipant?.identity;
313323
const trackSid = this.subscribedTrack?.sid;
314324
if (participantIdentity && trackSid) {
315-
await this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
325+
if (!this.#transcriptionId) {
326+
this.#transcriptionId = randomUUID();
327+
}
328+
await this.#publishTranscription(
329+
participantIdentity,
330+
trackSid,
331+
'…',
332+
false,
333+
ev.itemId,
334+
this.#transcriptionId,
335+
);
316336
} else {
317337
this.#logger.error('Participant or track not set');
318338
}
@@ -325,13 +345,18 @@ export class MultimodalAgent extends EventEmitter {
325345
const participantIdentity = this.linkedParticipant?.identity;
326346
const trackSid = this.subscribedTrack?.sid;
327347
if (participantIdentity && trackSid) {
348+
if (!this.#transcriptionId) {
349+
this.#transcriptionId = randomUUID();
350+
}
328351
await this.#publishTranscription(
329352
participantIdentity,
330353
trackSid,
331354
transcription,
332355
true,
333356
ev.itemId,
357+
this.#transcriptionId,
334358
);
359+
this.#transcriptionId = undefined;
335360
} else {
336361
this.#logger.error('Participant or track not set');
337362
}
@@ -360,7 +385,17 @@ export class MultimodalAgent extends EventEmitter {
360385
const participantIdentity = this.linkedParticipant?.identity;
361386
const trackSid = this.subscribedTrack?.sid;
362387
if (participantIdentity && trackSid) {
363-
await this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
388+
if (!this.#transcriptionId) {
389+
this.#transcriptionId = randomUUID();
390+
}
391+
await this.#publishTranscription(
392+
participantIdentity,
393+
trackSid,
394+
'…',
395+
false,
396+
ev.itemId,
397+
this.#transcriptionId,
398+
);
364399
}
365400
});
366401

@@ -492,6 +527,7 @@ export class MultimodalAgent extends EventEmitter {
492527
text: string,
493528
isFinal: boolean,
494529
id: string,
530+
segmentId: string,
495531
): Promise<void> {
496532
this.#logger.debug(
497533
`Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`,
@@ -522,6 +558,7 @@ export class MultimodalAgent extends EventEmitter {
522558
attributes: {
523559
[ATTRIBUTE_TRANSCRIPTION_TRACK_ID]: trackSid,
524560
[ATTRIBUTE_TRANSCRIPTION_FINAL]: isFinal.toString(),
561+
[ATTRIBUTE_SEGMENT_ID]: segmentId,
525562
},
526563
});
527564
await stream.write(text);

agents/src/pipeline/pipeline_agent.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
1818
import { randomUUID } from 'node:crypto';
1919
import EventEmitter from 'node:events';
2020
import {
21+
ATTRIBUTE_SEGMENT_ID,
2122
ATTRIBUTE_TRANSCRIPTION_FINAL,
2223
ATTRIBUTE_TRANSCRIPTION_TRACK_ID,
2324
TOPIC_TRANSCRIPTION,
@@ -537,6 +538,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
537538
this.#transcribedInterimText,
538539
false,
539540
this.#transcriptionId,
541+
this.#transcriptionId,
540542
);
541543
});
542544
this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, async (event) => {
@@ -556,6 +558,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
556558
this.transcribedText,
557559
true,
558560
this.#transcriptionId,
561+
this.#transcriptionId,
559562
);
560563

561564
this.#transcriptionId = undefined;
@@ -895,6 +898,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
895898
text: string,
896899
isFinal: boolean,
897900
id: string,
901+
segmentId: string,
898902
) {
899903
this.#room!.localParticipant!.publishTranscription({
900904
participantIdentity: participantIdentity,
@@ -916,6 +920,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
916920
attributes: {
917921
[ATTRIBUTE_TRANSCRIPTION_TRACK_ID]: trackSid,
918922
[ATTRIBUTE_TRANSCRIPTION_FINAL]: isFinal.toString(),
923+
[ATTRIBUTE_SEGMENT_ID]: segmentId,
919924
},
920925
});
921926
await stream.write(text);
@@ -930,13 +935,20 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
930935
// TODO: where possible we would want to use deltas instead of full text segments, esp for LLM streams over the streamText API
931936
synchronizer.on('textUpdated', async (text) => {
932937
this.#agentTranscribedText = text.text;
938+
if (!this.#transcriptionId) {
939+
this.#transcriptionId = randomUUID();
940+
}
933941
await this.#publishTranscription(
934942
this.#room!.localParticipant!.identity!,
935943
this.#agentPublication?.sid ?? '',
936944
text.text,
937945
text.final,
938946
text.id,
947+
this.#transcriptionId,
939948
);
949+
if (text.final) {
950+
this.#transcriptionId = undefined;
951+
}
940952
});
941953

942954
if (!this.#agentOutput) {

0 commit comments

Comments
 (0)