Skip to content

feat: Transcription only mode #19

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion client.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import (
"context"
"fmt"
"net/http"
"net/url"
)
Expand Down Expand Up @@ -59,6 +60,7 @@

// connectOption collects the settings that ConnectOption functions apply before the connection is dialed.
type connectOption struct {
// model is passed to getURL to build the connection URL when no intent is set.
model string
// intent, when non-empty, is sent as the "intent" query parameter instead of a model; WithIntent sets it to "transcription".
intent string
// dialer opens the connection via Dial(ctx, url, headers).
dialer WebSocketDialer
// logger is the Logger configured for the connection.
logger Logger
}
Expand All @@ -72,6 +74,13 @@
}
}

// WithIntent sets the transcription intent instead of a model.

Check failure on line 77 in client.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.19)

Comment should end in a period (godot)

Check failure on line 77 in client.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.20)

Comment should end in a period (godot)

Check failure on line 77 in client.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.22)

Comment should end in a period (godot)

Check failure on line 77 in client.go

View workflow job for this annotation

GitHub Actions / Code coverage

Comment should end in a period (godot)
// WithIntent returns a ConnectOption that sets the connection intent to
// "transcription". When an intent is set, the connection URL carries it as
// the "intent" query parameter rather than being built from a model.
func WithIntent() ConnectOption {
	setIntent := func(o *connectOption) {
		o.intent = "transcription"
	}
	return setIntent
}

// WithDialer sets the dialer for the connection.
func WithDialer(dialer WebSocketDialer) ConnectOption {
return func(opts *connectOption) {
Expand Down Expand Up @@ -103,7 +112,14 @@
headers := c.getHeaders()

// get url by model
url := c.getURL(connectOpts.model)
var url string
if connectOpts.intent == "" {

Check failure on line 116 in client.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.19)

ifElseChain: rewrite if-else to switch statement (gocritic)

Check failure on line 116 in client.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.20)

ifElseChain: rewrite if-else to switch statement (gocritic)

Check failure on line 116 in client.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.22)

ifElseChain: rewrite if-else to switch statement (gocritic)

Check failure on line 116 in client.go

View workflow job for this annotation

GitHub Actions / Code coverage

ifElseChain: rewrite if-else to switch statement (gocritic)
url = c.getURL(connectOpts.model)
} else if c.config.APIType != APITypeOpenAI {
return nil, fmt.Errorf("Azure API type with intent set not implemented");

Check failure on line 119 in client.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.19)

File is not properly formatted (goimports)

Check failure on line 119 in client.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.20)

File is not properly formatted (goimports)

Check failure on line 119 in client.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.22)

File is not properly formatted (goimports)

Check failure on line 119 in client.go

View workflow job for this annotation

GitHub Actions / Code coverage

File is not properly formatted (goimports)
} else {
url = c.config.BaseURL + "?" + "intent=" + connectOpts.intent
}

// dial
conn, err := connectOpts.dialer.Dial(ctx, url, headers)
Expand Down
70 changes: 61 additions & 9 deletions client_event.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@ import "encoding/json"
type ClientEventType string

// Wire names of the events a client can send. TranscriptionSessionUpdateEvent.MarshalJSON,
// for example, writes its value into the JSON "type" field.
const (
ClientEventTypeSessionUpdate ClientEventType = "session.update"
ClientEventTypeInputAudioBufferAppend ClientEventType = "input_audio_buffer.append"
ClientEventTypeInputAudioBufferCommit ClientEventType = "input_audio_buffer.commit"
ClientEventTypeInputAudioBufferClear ClientEventType = "input_audio_buffer.clear"
ClientEventTypeConversationItemCreate ClientEventType = "conversation.item.create"
ClientEventTypeConversationItemTruncate ClientEventType = "conversation.item.truncate"
ClientEventTypeConversationItemDelete ClientEventType = "conversation.item.delete"
ClientEventTypeResponseCreate ClientEventType = "response.create"
ClientEventTypeResponseCancel ClientEventType = "response.cancel"
ClientEventTypeSessionUpdate ClientEventType = "session.update"
ClientEventTypeTranscriptionSessionUpdate ClientEventType = "transcription_session.update"
ClientEventTypeInputAudioBufferAppend ClientEventType = "input_audio_buffer.append"
ClientEventTypeInputAudioBufferCommit ClientEventType = "input_audio_buffer.commit"
ClientEventTypeInputAudioBufferClear ClientEventType = "input_audio_buffer.clear"
ClientEventTypeConversationItemCreate ClientEventType = "conversation.item.create"
ClientEventTypeConversationItemTruncate ClientEventType = "conversation.item.truncate"
ClientEventTypeConversationItemDelete ClientEventType = "conversation.item.delete"
ClientEventTypeResponseCreate ClientEventType = "response.create"
ClientEventTypeResponseCancel ClientEventType = "response.cancel"
)

// ClientEvent is the interface for client event.
Expand Down Expand Up @@ -78,6 +79,57 @@ func (m SessionUpdateEvent) MarshalJSON() ([]byte, error) {
return json.Marshal(v)
}

// NoiseReductionType selects the kind of noise reduction applied to input audio.
type NoiseReductionType string

const (
// NearFieldNoiseReduction is for close-talking microphones such as headphones.
NearFieldNoiseReduction NoiseReductionType = "near_field"
// FarFieldNoiseReduction is for far-field microphones such as laptop or conference room microphones.
FarFieldNoiseReduction NoiseReductionType = "far_field"
)

// InputAudioNoiseReduction configures noise reduction for input audio.
type InputAudioNoiseReduction struct {
// Type of noise reduction. near_field is for close-talking microphones such as headphones, far_field is for far-field microphones such as laptop or conference room microphones.
Type NoiseReductionType `json:"type"`
}

// ClientTranscriptionSession is the session configuration carried by TranscriptionSessionUpdateEvent.
type ClientTranscriptionSession struct {
// Optional list of extra items to include; omitted from the JSON when empty.
// NOTE(review): the accepted values are not visible here — confirm against the API reference.
Include []string `json:"include,omitempty"`
// The set of modalities the model can respond with. To disable audio, set this to ["text"].
Modalities []Modality `json:"modalities,omitempty"`
// The format of input audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
InputAudioFormat AudioFormat `json:"input_audio_format,omitempty"`
// Configuration for input audio noise reduction. This can be set to null to turn off. Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio.
// NOTE(review): because of omitempty, a nil pointer omits the field instead of sending null — confirm this is the intended way to "turn off".
InputAudioNoiseReduction *InputAudioNoiseReduction `json:"input_audio_noise_reduction,omitempty"`
// Configuration for input audio transcription. Can be set to `nil` to turn off.
// NOTE(review): with omitempty, nil omits the field rather than sending null — confirm.
InputAudioTranscription *InputAudioTranscription `json:"input_audio_transcription,omitempty"`
// Configuration for turn detection. Can be set to `nil` to turn off.
// There is no omitempty here, so a nil pointer is serialized as JSON null.
TurnDetection *ClientTurnDetection `json:"turn_detection"`
}

// TranscriptionSessionUpdateEvent is the event for transcription session update.
// Send this event to update the transcription session's configuration.
// See https://platform.openai.com/docs/api-reference/realtime-client-events/transcription_session/update
type TranscriptionSessionUpdateEvent struct {
EventBase
// Transcription session configuration to update.
Session ClientTranscriptionSession `json:"session"`
}

// ClientEventType returns ClientEventTypeTranscriptionSessionUpdate, the event type of TranscriptionSessionUpdateEvent.
func (m TranscriptionSessionUpdateEvent) ClientEventType() ClientEventType {
return ClientEventTypeTranscriptionSessionUpdate
}

// MarshalJSON encodes the event's own fields followed by a "type" field
// holding the value returned by ClientEventType.
func (m TranscriptionSessionUpdateEvent) MarshalJSON() ([]byte, error) {
	// plain has the same fields as the event but not its MarshalJSON method,
	// so json.Marshal does not call back into this function.
	type plain TranscriptionSessionUpdateEvent

	// envelope promotes the event's fields and appends the type discriminator.
	type envelope struct {
		*plain
		Type ClientEventType `json:"type"`
	}

	return json.Marshal(envelope{
		plain: (*plain)(&m),
		Type:  m.ClientEventType(),
	})
}

// InputAudioBufferAppendEvent is the event for input audio buffer append.
// Send this event to append audio bytes to the input audio buffer.
// See https://platform.openai.com/docs/api-reference/realtime-client-events/input_audio_buffer/append
Expand Down
13 changes: 13 additions & 0 deletions server_event.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
ServerEventTypeError ServerEventType = "error"
ServerEventTypeSessionCreated ServerEventType = "session.created"
ServerEventTypeSessionUpdated ServerEventType = "session.updated"
ServerEventTypeTranscriptionSessionUpdated ServerEventType = "transcription_session.updated"
ServerEventTypeConversationCreated ServerEventType = "conversation.created"
ServerEventTypeInputAudioBufferCommitted ServerEventType = "input_audio_buffer.committed"
ServerEventTypeInputAudioBufferCleared ServerEventType = "input_audio_buffer.cleared"
Expand Down Expand Up @@ -82,6 +83,15 @@
Session ServerSession `json:"session"`
}

// TranscriptionSessionUpdatedEvent is the event for transcription session updated.
// It is decoded from server events whose type is "transcription_session.updated".
// See https://platform.openai.com/docs/api-reference/realtime-server-events/transcription_session/updated
type TranscriptionSessionUpdatedEvent struct {
ServerEventBase
// The updated session resource.
// NOTE(review): this reuses ServerSession — confirm the transcription session payload matches that shape.
Session ServerSession `json:"session"`
}

// ConversationCreatedEvent is the event for conversation created.
// Returned when a conversation is created. Emitted right after session creation.
// See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/created
Expand Down Expand Up @@ -375,6 +385,7 @@
ErrorEvent |
SessionCreatedEvent |
SessionUpdatedEvent |
TranscriptionSessionUpdatedEvent |
ConversationCreatedEvent |
InputAudioBufferCommittedEvent |
InputAudioBufferClearedEvent |
Expand Down Expand Up @@ -412,7 +423,7 @@
}

// UnmarshalServerEvent unmarshals the server event from the given JSON data.
func UnmarshalServerEvent(data []byte) (ServerEvent, error) { //nolint:funlen,cyclop // TODO: optimize

Check failure on line 426 in server_event.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.19)

cyclomatic complexity 31 of func `UnmarshalServerEvent` is high (> 30) (gocyclo)

Check failure on line 426 in server_event.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.20)

cyclomatic complexity 31 of func `UnmarshalServerEvent` is high (> 30) (gocyclo)

Check failure on line 426 in server_event.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.22)

cyclomatic complexity 31 of func `UnmarshalServerEvent` is high (> 30) (gocyclo)

Check failure on line 426 in server_event.go

View workflow job for this annotation

GitHub Actions / Code coverage

cyclomatic complexity 31 of func `UnmarshalServerEvent` is high (> 30) (gocyclo)
var eventType struct {
Type ServerEventType `json:"type"`
}
Expand All @@ -427,6 +438,8 @@
return unmarshalServerEvent[SessionCreatedEvent](data)
case ServerEventTypeSessionUpdated:
return unmarshalServerEvent[SessionUpdatedEvent](data)
case ServerEventTypeTranscriptionSessionUpdated:
return unmarshalServerEvent[TranscriptionSessionUpdatedEvent](data)
case ServerEventTypeConversationCreated:
return unmarshalServerEvent[ConversationCreatedEvent](data)
case ServerEventTypeInputAudioBufferCommitted:
Expand Down
6 changes: 5 additions & 1 deletion types.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,11 @@

type InputAudioTranscription struct {
// The model used for transcription.
Model string `json:"model"`
Model string `json:"model"`

Check failure on line 126 in types.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.19)

File is not properly formatted (goimports)

Check failure on line 126 in types.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.20)

File is not properly formatted (goimports)

Check failure on line 126 in types.go

View workflow job for this annotation

GitHub Actions / Sanity check (1.22)

File is not properly formatted (goimports)

Check failure on line 126 in types.go

View workflow job for this annotation

GitHub Actions / Code coverage

File is not properly formatted (goimports)
// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.
Language string `json:"language,omitempty"`
// An optional text to guide the model's style or continue a previous audio segment. For whisper-1, the prompt is a list of keywords. For gpt-4o-transcribe models, the prompt is a free text string, for example "expect words related to technology".
Prompt string `json:"prompt,omitempty"`
}

type Tool struct {
Expand Down
Loading