Skip to main content

Streaming STT - WebSocket

This guide shows how to implement the WebSocket mode for streaming STT.

Example

Examples use a local audio file as the source. For microphones or other devices, replace file reads with device input code.

Authentication token

Obtain a token via the Authentication guide.

Parameters

See Common DecoderConfig/Parameters.

Response

{
seq: integer, // utterance id
start_at: integer, // ms from stream start
duration: integer, // ms (0 if final=false)
final: boolean,
alternatives: [
{
text: string,
confidence: float,
words?: [
{
text: string,
start_at: integer,
duration: integer,
confidence: float
}
]
}
]
}

Sample code

package main

import (
"encoding/json"
"flag"
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"os/signal"
"path/filepath"
"strconv"
"strings"
"time"

"github.com/gorilla/websocket"
"github.com/xfrr/goffmpeg/transcoder"
)

// Connection settings and audio format shared by the whole example.
const (
	// ServerHost is the API host used for the streaming endpoint.
	ServerHost = "openapi.vito.ai"

	// SAMPLE_RATE is the sample rate, in Hz, of the audio sent to the server.
	SAMPLE_RATE int = 8000
	// BYTES_PER_SAMPLE is the size of one audio sample in bytes.
	BYTES_PER_SAMPLE int = 2
)

// API credentials, read from the environment at program start.
var (
	ClientId     = os.Getenv("RTZR_CLIENT_ID")
	ClientSecret = os.Getenv("RTZR_CLIENT_SECRET")
)

// FileStreamer wraps an open audio file so it can stand in for a live audio
// source.
//
// This example simulates streaming input by reading an audio file. In real
// use, a real-time audio stream such as microphone input should be supplied
// instead.
type FileStreamer struct {
// file is the audio file that is read and streamed.
file *os.File
}

// Read reads up to len(p) bytes (capped at 1 MiB per call) from the underlying
// file and then sleeps for as long as that much audio takes to play back at
// SAMPLE_RATE * BYTES_PER_SAMPLE bytes per second, so the file is delivered at
// roughly real-time speed.
//
// The pause is derived from the number of bytes actually read rather than from
// the size of p, so a short final read or an EOF does not stall the caller for
// a full buffer's worth of audio.
//
// It returns the byte count and error reported by the underlying file read.
func (fs *FileStreamer) Read(p []byte) (int, error) {
	const maxChunk = 1024 * 1024
	if len(p) > maxChunk {
		p = p[:maxChunk]
	}

	n, err := fs.file.Read(p)
	if n > 0 {
		// Scale to a Duration before dividing so sub-millisecond precision is
		// kept and low byte rates cannot cause a division by zero.
		bytesPerSecond := SAMPLE_RATE * BYTES_PER_SAMPLE
		time.Sleep(time.Duration(n) * time.Second / time.Duration(bytesPerSecond))
	}
	return n, err
}

// Close closes the underlying file and then deletes it from disk.
// The error from closing the file is returned; any error from the removal is
// discarded.
func (fs *FileStreamer) Close() error {
	path := fs.file.Name()
	closeErr := fs.file.Close()
	os.Remove(path)
	return closeErr
}

// OpenAudioFile transcodes audioFile into a single-channel WAV file at
// SAMPLE_RATE Hz in the OS temporary directory and returns a FileStreamer that
// reads the transcoded file.
//
// The temporary file is named after audioFile's base name with its extension
// (if any) replaced by "_<SAMPLE_RATE>.wav". Closing the returned reader also
// removes the temporary file.
//
// An error is returned if the transcoder cannot be initialized, if transcoding
// fails, or if the transcoded file cannot be opened.
func OpenAudioFile(audioFile string) (io.ReadCloser, error) {
	fileName := filepath.Base(audioFile)
	// TrimSuffix+Ext handles names without an extension; slicing at
	// strings.LastIndex would panic on index -1 for such names.
	baseName := strings.TrimSuffix(fileName, filepath.Ext(fileName))
	audioFileName8K := filepath.Join(os.TempDir(), fmt.Sprintf("%s_%d.%s", baseName, SAMPLE_RATE, "wav"))

	trans := new(transcoder.Transcoder)
	if err := trans.Initialize(audioFile, audioFileName8K); err != nil {
		return nil, fmt.Errorf("initialize transcoder failed: %w", err)
	}

	trans.MediaFile().SetAudioRate(SAMPLE_RATE)
	trans.MediaFile().SetAudioChannels(1)
	trans.MediaFile().SetSkipVideo(true)
	trans.MediaFile().SetAudioFilter("aresample=resampler=soxr")

	if err := <-trans.Run(false); err != nil {
		// Best effort: do not leave a partial output file behind.
		os.Remove(audioFileName8K)
		return nil, fmt.Errorf("transcode audio file failed: %w", err)
	}

	file, err := os.Open(audioFileName8K)
	if err != nil {
		// Best effort cleanup; the open error is the one worth reporting.
		os.Remove(audioFileName8K)
		return nil, fmt.Errorf("open audio file failed: %w", err)
	}

	return &FileStreamer{file: file}, nil
}

func main() {
flag.Parse()
log.SetFlags(0)

interrupt := make(chan os.Signal, 1)
signal.Notify(interrupt, os.Interrupt)

u := url.URL{Scheme: "wss", Host: ServerHost, Path: "/v1/transcribe:streaming"}

query := u.Query()
query.Set("sample_rate", strconv.Itoa(SAMPLE_RATE))
query.Set("encoding", "LINEAR16")
query.Set("use_itn", "true")
query.Set("use_disfluency_filter", "true")
query.Set("use_profanity_filter", "false")
u.RawQuery = query.Encode()

log.Printf("connecting to %s", u.String())

audioFile := flag.Arg(0)
streamingFile, err := OpenAudioFile(audioFile)
if err != nil {
log.Fatal(err)
}
defer streamingFile.Close()

data := map[string][]string{
"client_id": []string{ClientId},
"client_secret": []string{ClientSecret},
}
resp, _ := http.PostForm("https://openapi.vito.ai/v1/authenticate", data)
if resp.StatusCode != 200 {
panic("Failed to authenticate")
}

bytes, _ := io.ReadAll(resp.Body)
var result struct {
Token string `json:"access_token"`
}
json.Unmarshal(bytes, &result)
requestHeader := http.Header{
"Authorization": []string{fmt.Sprintf("Bearer %s", result.Token)},
}
c, res, err := websocket.DefaultDialer.Dial(u.String(), requestHeader)
if err != nil {
log.Printf("status: %s, body: %v", res.Status, res.Body)
log.Fatal("dial:", err)
}
defer c.Close()

start := time.Now()
go func() {
buf := make([]byte, 1024)
for {
n, err := streamingFile.Read(buf)
if err == io.EOF {
// Nothing else to pipe, close the stream.
log.Println("send EOS")
if err := c.WriteMessage(websocket.TextMessage, []byte("EOS")); err != nil {
log.Fatalf("Could not close stream: %v", err)
}
return
}
if err != nil {
log.Printf("Could not read from %s: %v", audioFile, err)
continue
}
err = c.WriteMessage(websocket.BinaryMessage, buf[:n])
if err != nil {
log.Println("write:", err)
}

select {
case <-interrupt:
log.Println("interrupt")
if err := c.WriteMessage(websocket.TextMessage, []byte("EOS")); err != nil {
log.Fatalf("Could not close: %v", err)
}
return
default:

}
}
}()

start2 := time.Now()
for {
_, message, err := c.ReadMessage()
if err != nil {
log.Printf("elapsed[%v]\n", time.Since(start))
log.Println("read:", err)
return
}
log.Printf("[%v]recv: %s", time.Since(start2), message)
start2 = time.Now()
}

}

Error codes

Errors are indicated by the WebSocket dial response status code.

| HttpStatus | Code | Notes |
| --- | --- | --- |
| 400 | H0001 | Invalid params |
| 401 | H0002 | Unauthorized |
| 429 | A0001 | Usage exceeded |
| 500 | E500 | Server error |

Notes

For converting audio files to text, Batch STT is often simpler than Streaming STT.