diff --git a/client.go b/client.go index fec1a00..339c2fd 100644 --- a/client.go +++ b/client.go @@ -2,6 +2,7 @@ package openairt import ( "context" + "fmt" "net/http" "net/url" ) @@ -59,6 +60,7 @@ func (c *Client) getHeaders() http.Header { type connectOption struct { model string + intent string dialer WebSocketDialer logger Logger } @@ -72,6 +74,13 @@ func WithModel(model string) ConnectOption { } } +// WithIntent sets the transcription intent instead of a model. +func WithIntent() ConnectOption { + return func(opts *connectOption) { + opts.intent = "transcription" + } +} + // WithDialer sets the dialer for the connection. func WithDialer(dialer WebSocketDialer) ConnectOption { return func(opts *connectOption) { @@ -103,7 +112,14 @@ func (c *Client) Connect(ctx context.Context, opts ...ConnectOption) (*Conn, err headers := c.getHeaders() // get url by model - url := c.getURL(connectOpts.model) + var url string + if connectOpts.intent == "" { + url = c.getURL(connectOpts.model) + } else if c.config.APIType != APITypeOpenAI { + return nil, fmt.Errorf("azure API type with intent set not implemented") + } else { + url = c.config.BaseURL + "?" +
+ "intent=" + connectOpts.intent + } // dial conn, err := connectOpts.dialer.Dial(ctx, url, headers) diff --git a/client_event.go b/client_event.go index 9c9c165..e637f3b 100644 --- a/client_event.go +++ b/client_event.go @@ -6,15 +6,16 @@ import "encoding/json" type ClientEventType string const ( - ClientEventTypeSessionUpdate ClientEventType = "session.update" - ClientEventTypeInputAudioBufferAppend ClientEventType = "input_audio_buffer.append" - ClientEventTypeInputAudioBufferCommit ClientEventType = "input_audio_buffer.commit" - ClientEventTypeInputAudioBufferClear ClientEventType = "input_audio_buffer.clear" - ClientEventTypeConversationItemCreate ClientEventType = "conversation.item.create" - ClientEventTypeConversationItemTruncate ClientEventType = "conversation.item.truncate" - ClientEventTypeConversationItemDelete ClientEventType = "conversation.item.delete" - ClientEventTypeResponseCreate ClientEventType = "response.create" - ClientEventTypeResponseCancel ClientEventType = "response.cancel" + ClientEventTypeSessionUpdate ClientEventType = "session.update" + ClientEventTypeTranscriptionSessionUpdate ClientEventType = "transcription_session.update" + ClientEventTypeInputAudioBufferAppend ClientEventType = "input_audio_buffer.append" + ClientEventTypeInputAudioBufferCommit ClientEventType = "input_audio_buffer.commit" + ClientEventTypeInputAudioBufferClear ClientEventType = "input_audio_buffer.clear" + ClientEventTypeConversationItemCreate ClientEventType = "conversation.item.create" + ClientEventTypeConversationItemTruncate ClientEventType = "conversation.item.truncate" + ClientEventTypeConversationItemDelete ClientEventType = "conversation.item.delete" + ClientEventTypeResponseCreate ClientEventType = "response.create" + ClientEventTypeResponseCancel ClientEventType = "response.cancel" ) // ClientEvent is the interface for client event. 
@@ -78,6 +79,57 @@ func (m SessionUpdateEvent) MarshalJSON() ([]byte, error) { return json.Marshal(v) } +type NoiseReductionType string + +const ( + NearFieldNoiseReduction NoiseReductionType = "near_field" + FarFieldNoiseReduction NoiseReductionType = "far_field" +) + +type InputAudioNoiseReduction struct { + // Type of noise reduction. near_field is for close-talking microphones such as headphones, far_field is for far-field microphones such as laptop or conference room microphones. + Type NoiseReductionType `json:"type"` +} + +type ClientTranscriptionSession struct { + Include []string `json:"include,omitempty"` + // The set of modalities the model can respond with. To disable audio, set this to ["text"]. + Modalities []Modality `json:"modalities,omitempty"` + // The format of input audio. Options are "pcm16", "g711_ulaw", or "g711_alaw". + InputAudioFormat AudioFormat `json:"input_audio_format,omitempty"` + // Configuration for input audio noise reduction. This can be set to null to turn off. Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio. + InputAudioNoiseReduction *InputAudioNoiseReduction `json:"input_audio_noise_reduction,omitempty"` + // Configuration for input audio transcription. Can be set to `nil` to turn off. + InputAudioTranscription *InputAudioTranscription `json:"input_audio_transcription,omitempty"` + // Configuration for turn detection. Can be set to `nil` to turn off. + TurnDetection *ClientTurnDetection `json:"turn_detection"` +} + +// TranscriptionSessionUpdateEvent is the event for transcription session update. +// Send this event to update the transcription session’s default configuration. 
+// See https://platform.openai.com/docs/api-reference/realtime-client-events/transcription_session/update +type TranscriptionSessionUpdateEvent struct { + EventBase + // Session configuration to update. + Session ClientTranscriptionSession `json:"session"` +} + +func (m TranscriptionSessionUpdateEvent) ClientEventType() ClientEventType { + return ClientEventTypeTranscriptionSessionUpdate +} + +func (m TranscriptionSessionUpdateEvent) MarshalJSON() ([]byte, error) { + type sessionUpdateEvent TranscriptionSessionUpdateEvent + v := struct { + *sessionUpdateEvent + Type ClientEventType `json:"type"` + }{ + sessionUpdateEvent: (*sessionUpdateEvent)(&m), + Type: m.ClientEventType(), + } + return json.Marshal(v) +} + // InputAudioBufferAppendEvent is the event for input audio buffer append. // Send this event to append audio bytes to the input audio buffer. // See https://platform.openai.com/docs/api-reference/realtime-client-events/input_audio_buffer/append diff --git a/server_event.go b/server_event.go index fd824c3..5976a9e 100644 --- a/server_event.go +++ b/server_event.go @@ -11,6 +11,7 @@ const ( ServerEventTypeError ServerEventType = "error" ServerEventTypeSessionCreated ServerEventType = "session.created" ServerEventTypeSessionUpdated ServerEventType = "session.updated" + ServerEventTypeTranscriptionSessionUpdated ServerEventType = "transcription_session.updated" ServerEventTypeConversationCreated ServerEventType = "conversation.created" ServerEventTypeInputAudioBufferCommitted ServerEventType = "input_audio_buffer.committed" ServerEventTypeInputAudioBufferCleared ServerEventType = "input_audio_buffer.cleared" @@ -82,6 +83,15 @@ type SessionUpdatedEvent struct { Session ServerSession `json:"session"` } +// TranscriptionSessionUpdatedEvent is the event for transcription session updated. +// Returned when a transcription session is updated. 
+// See https://platform.openai.com/docs/api-reference/realtime-server-events/transcription_session/updated +type TranscriptionSessionUpdatedEvent struct { + ServerEventBase + // The updated session resource. + Session ServerSession `json:"session"` +} + // ConversationCreatedEvent is the event for conversation created. // Returned when a conversation is created. Emitted right after session creation. // See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/created @@ -375,6 +385,7 @@ type ServerEventInterface interface { ErrorEvent | SessionCreatedEvent | SessionUpdatedEvent | + TranscriptionSessionUpdatedEvent | ConversationCreatedEvent | InputAudioBufferCommittedEvent | InputAudioBufferClearedEvent | @@ -427,6 +438,8 @@ func UnmarshalServerEvent(data []byte) (ServerEvent, error) { //nolint:funlen,cy case ServerEventTypeSessionCreated: return unmarshalServerEvent[SessionCreatedEvent](data) case ServerEventTypeSessionUpdated: return unmarshalServerEvent[SessionUpdatedEvent](data) + case ServerEventTypeTranscriptionSessionUpdated: + return unmarshalServerEvent[TranscriptionSessionUpdatedEvent](data) case ServerEventTypeConversationCreated: return unmarshalServerEvent[ConversationCreatedEvent](data) case ServerEventTypeInputAudioBufferCommitted: diff --git a/types.go b/types.go index ee7ca44..f951641 100644 --- a/types.go +++ b/types.go @@ -123,7 +123,11 @@ const ( type InputAudioTranscription struct { // The model used for transcription. - Model string `json:"model"` + Model string `json:"model"` + // The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency. + Language string `json:"language,omitempty"` + // An optional text to guide the model's style or continue a previous audio segment. For whisper-1, the prompt is a list of keywords. For gpt-4o-transcribe models, the prompt is a free text string, for example "expect words related to technology". + Prompt string `json:"prompt,omitempty"` } type Tool struct {