스트리밍 STT - gRPC

본 문서는 스트리밍 STT 중에서 gRPC로 구현하는 방식에 대한 가이드를 제공합니다.

연동 예제

본 문서의 예제는 로컬 오디오 파일로부터 스트리밍 음성인식을 수행하는 방법을 설명합니다. 마이크와 같은 스트리밍 입력 장치로 API를 이용하고 싶은 경우, 파일로 읽어오는 코드 부분을 장치로부터 입력을 받는 코드로 변경하여 사용하실 수 있습니다. gRPC 연동을 위한 proto 파일을 확인할 수 있습니다.

인증 토큰 발급

스트리밍 STT API를 사용하기 위해서는 인증 토큰 발급 가이드를 통해 토큰을 발급받아야 합니다.

DecoderConfig

gRPC 연동 시 사용되는 DecoderConfig 에 대한 상세 정보는 공통 DecoderConfig/Parameter 정보에서 확인할 수 있습니다.

StreamingRecognitionResult

{
  // 스트리밍 시작 기준 문장의 발화 시점 (단위: msec)
  start_at: integer
  // is_final이 true인 경우 문장의 발화 시간, is_final이 false인 경우 0 (단위: msec)
  duration: integer
  // 문장의 종료 여부
  is_final: bool
  // 대체 텍스트, 첫 번째 값이 정확도가 가장 높은 결과
  alternatives: [
    SpeechRecognitionAlternative {
      // 문장의 텍스트
      text: string
      // 문장의 정확도 (beta)
      confidence: float
      // 단어의 정보, is_final이 true인 경우에만 제공
      words?: [
        WordInfo {
          text: string
          // 문장의 시작 기준 단어의 발화 시점 (단위: msec)
          start_at: integer
          // 발화 시간 (단위: msec)
          duration: integer
          // 정확도 (미지원)
          confidence: float
        }
      ]
    }
  ]
}

샘플 코드

Golang
Python
Java

package main

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/grpc-ecosystem/go-grpc-middleware/util/metautils"
	pb "github.com/vito-ai/go-genproto/vito-openapi/stt"
	"github.com/xfrr/goffmpeg/transcoder"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/metadata"
)

// ServerHost is the host:port of the gRPC endpoint this sample dials.
const ServerHost = "grpc-openapi.vito.ai:443"

// ClientId and ClientSecret are read from the RTZR_CLIENT_ID and
// RTZR_CLIENT_SECRET environment variables and posted to the authenticate
// endpoint to obtain an access token.
var ClientId = os.Getenv("RTZR_CLIENT_ID")
var ClientSecret = os.Getenv("RTZR_CLIENT_SECRET")

// SAMPLE_RATE is the sample rate (Hz) the input audio is transcoded to and
// the value declared in the DecoderConfig.
const SAMPLE_RATE int = 8000
// BYTES_PER_SAMPLE is used together with SAMPLE_RATE to pace the simulated
// stream.
const BYTES_PER_SAMPLE int = 2

// False and True are addressable booleans so that their addresses can be
// assigned to the pointer-typed option fields of the DecoderConfig.
var False = false
var True = true

/*
FileStreamer simulates a streaming input by reading from an audio file.
In real use, a live audio stream such as microphone input should be supplied instead.
*/
type FileStreamer struct {
	file *os.File // file the audio bytes are read from; removed from disk on Close
}

// Read fills p with at most 1024 bytes from the underlying file and then
// sleeps for the playback duration of the bytes it actually read, so callers
// receive audio at roughly real-time speed.
//
// It returns the byte count and the error reported by the underlying file
// read (io.EOF at end of file).
func (fs *FileStreamer) Read(p []byte) (int, error) {
	const maxChunk = 1024
	if len(p) > maxChunk {
		p = p[:maxChunk]
	}

	n, err := fs.file.Read(p)
	if n > 0 {
		// Pace by the bytes actually delivered rather than the requested size,
		// so short reads and the final EOF read do not add extra delay.
		bytesPerSecond := SAMPLE_RATE * BYTES_PER_SAMPLE
		time.Sleep(time.Duration(n) * time.Second / time.Duration(bytesPerSecond))
	}
	return n, err
}

// Close closes the underlying file and then removes it from disk.
// The error from closing the file is returned; the result of the removal is
// ignored.
func (fs *FileStreamer) Close() error {
	// Capture the name before closing; the removal must happen after the close.
	name := fs.file.Name()
	closeErr := fs.file.Close()
	_ = os.Remove(name)
	return closeErr
}

// OpenAudioFile transcodes audioFile into a mono WAV file at SAMPLE_RATE in
// the OS temp directory and returns a reader that streams the transcoded
// file at paced speed. Closing the returned reader also deletes the
// transcoded file.
//
// An error is returned when the transcoder cannot be initialized, when
// transcoding fails, or when the transcoded file cannot be opened.
func OpenAudioFile(audioFile string) (io.ReadCloser, error) {
	fileName := filepath.Base(audioFile)
	// Strip the extension with filepath.Ext so that names without a "." do not
	// cause an out-of-range slice.
	stem := strings.TrimSuffix(fileName, filepath.Ext(fileName))
	audioFileName8K := filepath.Join(os.TempDir(), fmt.Sprintf("%s_%d.%s", stem, SAMPLE_RATE, "wav"))

	trans := new(transcoder.Transcoder)
	if err := trans.Initialize(audioFile, audioFileName8K); err != nil {
		// Report the failure to the caller instead of terminating the process
		// here, consistent with the other error paths of this function.
		return nil, fmt.Errorf("initialize transcoder failed: %w", err)
	}

	trans.MediaFile().SetAudioRate(SAMPLE_RATE)
	trans.MediaFile().SetAudioChannels(1)
	trans.MediaFile().SetSkipVideo(true)
	trans.MediaFile().SetAudioFilter("aresample=resampler=soxr")

	// Run returns a channel; receiving from it waits for the transcode result.
	if err := <-trans.Run(false); err != nil {
		return nil, fmt.Errorf("transcode audio file failed: %w", err)
	}

	file, err := os.Open(audioFileName8K)
	if err != nil {
		return nil, fmt.Errorf("open audio file failed: %w", err)
	}

	return &FileStreamer{file: file}, nil
}

// main parses the command line, runs the streaming recognition sample for the
// given audio file, and exits with a non-zero status on any failure.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s <AUDIOFILE>\n", filepath.Base(os.Args[0]))
		// NOTE(review): OpenAudioFile transcodes the input to SAMPLE_RATE mono
		// before streaming, so the 16000 Hz requirement stated below looks
		// stale — confirm and update the message.
		fmt.Fprintf(os.Stderr, "<AUDIOFILE> must be a path to a local audio file. Audio file must be a 16-bit signed little-endian encoded with a sample rate of 16000.\n")
	}
	flag.Parse()
	if len(flag.Args()) != 1 {
		log.Fatal("Please pass path to your local audio file as a command line argument")
	}

	// All work happens in run so that its deferred cleanups (temp file removal,
	// connection close) execute before the process exits on error.
	if err := run(flag.Arg(0)); err != nil {
		log.Fatal(err)
	}
}

// fetchAccessToken exchanges ClientId/ClientSecret for an access token via the
// authenticate endpoint. It returns an error when the request fails, the
// status is not 200, or the response does not contain an access token.
func fetchAccessToken() (string, error) {
	form := map[string][]string{
		"client_id":     {ClientId},
		"client_secret": {ClientSecret},
	}
	resp, err := http.PostForm("https://openapi.vito.ai/v1/authenticate", form)
	if err != nil {
		return "", fmt.Errorf("requesting access token: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("failed to authenticate: unexpected status %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading authenticate response: %w", err)
	}
	var result struct {
		Token string `json:"access_token"`
	}
	if err := json.Unmarshal(body, &result); err != nil {
		return "", fmt.Errorf("decoding authenticate response: %w", err)
	}
	if result.Token == "" {
		return "", fmt.Errorf("authenticate response contains no access_token")
	}
	return result.Token, nil
}

// run streams audioFile to the recognition service and prints the recognized
// text. It returns the first error that prevents recognition from completing.
func run(audioFile string) error {
	// Prepare the audio before authenticating and connecting, so the stream is
	// not left idle while OpenAudioFile waits for the transcode to finish.
	streamingFile, err := OpenAudioFile(audioFile)
	if err != nil {
		return err
	}
	defer streamingFile.Close()

	token, err := fetchAccessToken()
	if err != nil {
		return err
	}

	// Bound the blocking dial with a context deadline (replaces the deprecated
	// grpc.WithTimeout option).
	dialCtx, cancelDial := context.WithTimeout(context.Background(), 10*time.Second)
	conn, err := grpc.DialContext(dialCtx, ServerHost,
		grpc.WithTransportCredentials(credentials.NewClientTLSFromCert(nil, "")),
		grpc.WithBlock(),
	)
	cancelDial()
	if err != nil {
		return fmt.Errorf("fail to dial: %w", err)
	}
	defer conn.Close()

	md := metadata.Pairs("authorization", fmt.Sprintf("%s %v", "bearer", token))
	// The cancelable context lets the receive side stop the sender goroutine
	// when recognition fails.
	ctx, cancel := context.WithCancel(metautils.NiceMD(md).ToOutgoing(context.Background()))
	defer cancel()

	stream, err := pb.NewOnlineDecoderClient(conn).Decode(ctx)
	if err != nil {
		return fmt.Errorf("creating stream: %w", err)
	}

	// Send the initial configuration message.
	if err := stream.Send(&pb.DecoderRequest{
		StreamingRequest: &pb.DecoderRequest_StreamingConfig{
			StreamingConfig: &pb.DecoderConfig{
				SampleRate:          int32(SAMPLE_RATE),
				Encoding:            pb.DecoderConfig_LINEAR16,
				UseItn:              &True,
				UseDisfluencyFilter: &False,
				UseProfanityFilter:  &False,
			},
		},
	}); err != nil {
		return fmt.Errorf("sending decoder config: %w", err)
	}

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		// Half-close the stream on every exit path so the server knows no more
		// audio is coming.
		defer func() {
			if err := stream.CloseSend(); err != nil {
				log.Printf("Could not close stream: %v", err)
			}
		}()

		buf := make([]byte, 1024)
		for {
			n, err := streamingFile.Read(buf)
			if n > 0 {
				if sendErr := stream.Send(&pb.DecoderRequest{
					StreamingRequest: &pb.DecoderRequest_AudioContent{
						AudioContent: buf[:n],
					},
				}); sendErr != nil {
					// The stream is unusable after a send failure; stop instead
					// of retrying for the remainder of the file.
					log.Printf("Could not send audio: %v", sendErr)
					return
				}
			}
			if err == io.EOF {
				// Nothing else to pipe.
				return
			}
			if err != nil {
				// Stop on a read failure rather than looping on a persistent error.
				log.Printf("Could not read from %s: %v", audioFile, err)
				return
			}
		}
	}()

	receive := func() error {
		// The first response is read and discarded, as in the original sample.
		// NOTE(review): presumably an acknowledgement of the configuration
		// message — confirm that it never carries recognition results.
		if _, err := stream.Recv(); err != nil {
			return fmt.Errorf("failed to recv: %w", err)
		}

		for {
			resp, err := stream.Recv()
			if err == io.EOF {
				return nil
			}
			if err != nil {
				return fmt.Errorf("cannot stream results: %w", err)
			}
			if resp.Error {
				return fmt.Errorf("could not recognize: server reported an error")
			}
			for _, result := range resp.Results {
				if len(result.Alternatives) == 0 {
					// Nothing to print for this result.
					continue
				}
				if result.IsFinal {
					fmt.Printf("final: %v\n", result.Alternatives[0].Text)
				} else {
					fmt.Printf("%v\n", result.Alternatives[0].Text)
				}
			}
		}
	}

	recvErr := receive()
	if recvErr != nil {
		// Abort the stream so the sender's next Send fails and it exits promptly.
		cancel()
	}
	// Wait for the sender before the deferred Close removes the audio file.
	wg.Wait()
	return recvErr
}

"""

To Download definition (.proto) file

$ wget https://raw.github.com/vito-ai/openapi-grpc/main/protos/vito-stt-client.proto

The sample code has a dependency on FFmpeg.
Please refer to the installation methods for each operating system to install it.

MacOS
$ brew install ffmpeg

Windows
$ winget install ffmpeg

Ubuntu/Debian
$ apt-get install ffmpeg

To generate gRPC code

$ pip install grpcio-tools
$ python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. ./vito-stt-client.proto

NOTE: This module requires the dependencies `grpcio`, `requests`, and `pydub`.
To install using pip:
    pip install grpcio
    pip install requests
    pip install pydub

Example usage:
    python vitoopenapi-stt-streaming-sample.py sample/filepath

"""

import argparse
import logging
import os
import tempfile
import time
from io import DEFAULT_BUFFER_SIZE

import grpc
import vito_stt_client_pb2 as pb
import vito_stt_client_pb2_grpc as pb_grpc
from pydub import AudioSegment
from requests import Session

# Base URL of the REST API; used for the /v1/authenticate token request.
API_BASE = "https://openapi.vito.ai"
# host:port of the gRPC endpoint used for streaming recognition.
GRPC_SERVER_URL = "grpc-openapi.vito.ai:443"
# API credentials, read from the environment.
CLIENT_ID = os.environ.get("RTZR_CLIENT_ID")
CLIENT_SECRET = os.environ.get("RTZR_CLIENT_SECRET")

# Fail as soon as the module is loaded when either credential is missing or empty.
if not CLIENT_ID or not CLIENT_SECRET:
    raise ValueError("RTZR_CLIENT_ID and RTZR_CLIENT_SECRET must be set")

# Sample rate (Hz) the input audio is resampled to and that is declared in DecoderConfig.
SAMPLE_RATE = 8000
# Audio encoding declared in DecoderConfig.
ENCODING = pb.DecoderConfig.AudioEncoding.LINEAR16
# Used together with SAMPLE_RATE to pace the simulated stream.
BYTES_PER_SAMPLE = 2


# This example simulates a streaming input by reading from an audio file.
# In real use, a live audio stream such as microphone input should be supplied instead.
class FileStreamer:
    """Context manager that streams a resampled copy of an audio file at paced speed.

    On construction the input is converted to a mono WAV file at SAMPLE_RATE in
    the temp directory. Leaving the context closes and deletes that file.
    """

    def __init__(self, file):
        """Convert ``file`` to a temporary mono WAV at SAMPLE_RATE and open it.

        Args:
            file: Path to the source audio file.
        """
        # Derive both the stem and the format from the basename. The previous
        # code applied an index computed on the basename to the full path,
        # which produced a wrong format for any path containing a directory,
        # and raised ValueError for names without an extension.
        stem, ext = os.path.splitext(os.path.basename(file))
        audio_file_8k_path = os.path.join(tempfile.gettempdir(), stem) + "_" + str(SAMPLE_RATE) + ".wav"
        self.filepath = audio_file_8k_path
        # With no extension, pass format=None and let pydub/ffmpeg detect it.
        audio = AudioSegment.from_file(file=file, format=ext[1:] or None)
        audio = audio.set_frame_rate(SAMPLE_RATE)
        audio = audio.set_channels(1)
        audio.export(audio_file_8k_path, format="wav")
        self.file = open(audio_file_8k_path, "rb")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Remove the temporary file even if closing the handle fails.
        try:
            self.file.close()
        finally:
            os.remove(self.filepath)

    def read(self, size):
        """Read up to ``size`` bytes (capped at 1 MiB) and pace the caller.

        Sleeps for the playback duration of the bytes actually read before
        returning them.

        Returns:
            The bytes read; empty at end of file.
        """
        size = min(size, 1024 * 1024)
        content = self.file.read(size)
        # Pace by what was actually read so the final short chunk and EOF do
        # not wait for a full buffer's duration.
        time.sleep(len(content) / (SAMPLE_RATE * BYTES_PER_SAMPLE))
        return content


class RTZROpenAPIClient:
    """Client that authenticates over REST and streams audio for recognition over gRPC."""

    def __init__(self, client_id, client_secret):
        """Store the credentials and create the HTTP session used for authentication.

        Args:
            client_id: Client ID sent to the authenticate endpoint.
            client_secret: Client secret sent to the authenticate endpoint.
        """
        super().__init__()
        self._logger = logging.getLogger(__name__)
        self.client_id = client_id
        self.client_secret = client_secret
        self._sess = Session()
        # Cached JSON body of the last authenticate response; None until first use.
        self._token = None

    @property
    def token(self):
        """Return the access token, requesting a new one when none is cached or it has expired.

        Raises:
            requests.HTTPError: If the authenticate request returns an error status.
        """
        if self._token is None or self._token["expire_at"] < time.time():
            resp = self._sess.post(
                API_BASE + "/v1/authenticate",
                data={"client_id": self.client_id, "client_secret": self.client_secret},
            )
            resp.raise_for_status()
            self._token = resp.json()
        return self._token["access_token"]

    def transcribe_streaming_grpc(self, filepath, config):
        """Stream the audio file to the decoder and print each recognition result.

        Args:
            filepath: Path to the audio file to stream.
            config: ``pb.DecoderConfig`` sent as the first request of the stream.
        """
        with grpc.secure_channel(GRPC_SERVER_URL, credentials=grpc.ssl_channel_credentials()) as channel:
            stub = pb_grpc.OnlineDecoderStub(channel)
            cred = grpc.access_token_call_credentials(self.token)

            def req_iterator():
                # The configuration goes first, followed by the audio chunks.
                yield pb.DecoderRequest(streaming_config=config)
                with FileStreamer(filepath) as f:
                    while True:
                        buff = f.read(size=DEFAULT_BUFFER_SIZE)
                        if not buff:
                            break
                        yield pb.DecoderRequest(audio_content=buff)

            for resp in stub.Decode(req_iterator(), credentials=cred):
                for res in resp.results:
                    if not res.alternatives:
                        # Nothing to print; indexing [0] would raise IndexError.
                        self._logger.debug("result without alternatives skipped")
                        continue
                    print("[online-grpc] final:{}, text:{}".format(res.is_final, res.alternatives[0].text))


if __name__ == "__main__":
    # Command line: a single positional argument naming the file to stream.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("stream", help="File to stream to the API")
    args = parser.parse_args()

    # Decoder options sent as the first message of the stream.
    decoder_options = {
        "sample_rate": SAMPLE_RATE,
        "encoding": ENCODING,
        "use_itn": True,
        "use_disfluency_filter": False,
        "use_profanity_filter": False,
        "domain": "CALL",
    }
    config = pb.DecoderConfig(**decoder_options)

    client = RTZROpenAPIClient(CLIENT_ID, CLIENT_SECRET)
    client.transcribe_streaming_grpc(args.stream, config)

// full version
// https://github.com/vito-ai/java-sample

/* 해당 예제는 아래 파일 포맷을 지원합니다
WAV, AU, AIFF
https://docs.oracle.com/javase/8/docs/technotes/guides/sound/index.html
*/

package ai.vito.openapi.stream;

import ai.vito.openapi.v1.*;

import com.fasterxml.jackson.databind.ObjectMapper;

import com.google.protobuf.ByteString;

import io.grpc.*;
import io.grpc.stub.StreamObserver;
import okhttp3.FormBody;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;

import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Executor;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.sound.sampled.*;

/*
This example simulates a streaming input by reading from an audio file.
In real use, a live audio stream such as microphone input should be supplied instead.
*/
final class FileStreamer implements AutoCloseable {
    private static final int SAMPLE_RATE = 8000;
    private static final int BITS_PER_SAMPLE = 16;
    private static final int BYTES_PER_SAMPLE = BITS_PER_SAMPLE / 8;
    private static final int MAX_CHUNK_BYTES = 1024 * 1024;

    private final AudioInputStream audio8KStream;

    /**
     * Opens the audio file and wraps it in a stream converted to 16-bit signed
     * mono PCM at SAMPLE_RATE.
     *
     * @param filePath path to the source audio file
     * @throws IOException if the file cannot be read
     * @throws UnsupportedAudioFileException if the file format is not recognized
     */
    public FileStreamer(String filePath) throws IOException, UnsupportedAudioFileException {
        AudioInputStream originalAudioStream = AudioSystem.getAudioInputStream(new File(filePath));
        AudioInputStream converted;
        try {
            // Always request little-endian output. Inheriting the byte order of
            // the source would stream big-endian samples for big-endian inputs,
            // while the Go sample's usage text describes the expected audio as
            // 16-bit signed little-endian.
            AudioFormat targetFormat = new AudioFormat(
                    AudioFormat.Encoding.PCM_SIGNED,
                    SAMPLE_RATE,
                    BITS_PER_SAMPLE,
                    1,
                    BYTES_PER_SAMPLE,
                    SAMPLE_RATE,
                    false);
            converted = AudioSystem.getAudioInputStream(targetFormat, originalAudioStream);
        } catch (RuntimeException e) {
            // The conversion was rejected; do not leak the source stream.
            originalAudioStream.close();
            throw e;
        }
        this.audio8KStream = converted;
    }

    /**
     * Reads up to {@code b.length} bytes (capped at 1 MiB) into {@code b} and then
     * sleeps for the playback duration of the bytes actually read.
     *
     * @param b destination buffer
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException if reading fails
     * @throws InterruptedException if the pacing sleep is interrupted
     */
    public int read(byte[] b) throws IOException, InterruptedException {
        int byteSize = Math.min(b.length, MAX_CHUNK_BYTES);
        int readBytes = this.audio8KStream.read(b, 0, byteSize);
        if (readBytes > 0) {
            // Pace by what was delivered so short reads and end-of-stream do not
            // wait for a full buffer's duration.
            Thread.sleep(readBytes * 1000L / (SAMPLE_RATE * BYTES_PER_SAMPLE));
        }
        return readBytes;
    }

    /**
     * Closes the converted audio stream.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException {
        this.audio8KStream.close();
    }
}

/**
 * Streaming recognition client: opens a bidirectional Decode call authorized with
 * a bearer token, forwards responses to a caller-supplied observer, and lets the
 * caller wait for the call to finish.
 */
public class RTZRSttGrpcClient {
    private static final Logger logger = Logger.getLogger(RTZRSttGrpcClient.class.getName());

    private final OnlineDecoderGrpc.OnlineDecoderStub asyncStub;
    private final StreamObserver<DecoderRequest> decoder;
    // Released once the response stream completes or fails.
    private final CountDownLatch finishLatch;

    /**
     * Starts the Decode call on the given channel.
     *
     * @param channel  channel to the gRPC endpoint
     * @param token    access token sent as {@code authorization: Bearer <token>}
     * @param observer receives every response, error, and completion event
     */
    public RTZRSttGrpcClient(Channel channel, final String token, final StreamObserver<DecoderResponse> observer) {
        finishLatch = new CountDownLatch(1);
        asyncStub = OnlineDecoderGrpc.newStub(channel)
                .withCallCredentials(new CallCredentials() {
                    @Override
                    public void applyRequestMetadata(RequestInfo requestInfo, Executor appExecutor,
                            MetadataApplier applier) {
                        final Metadata metadata = new Metadata();
                        metadata.put(Metadata.Key.of("authorization", Metadata.ASCII_STRING_MARSHALLER),
                                "Bearer " + token);
                        applier.apply(metadata);
                    }

                    @Override
                    public void thisUsesUnstableApi() {

                    }
                });
        decoder = asyncStub.decode(new StreamObserver<DecoderResponse>() {
            @Override
            public void onNext(DecoderResponse value) {
                observer.onNext(value);
            }

            @Override
            public void onError(Throwable t) {
                observer.onError(t);
                finishLatch.countDown();
            }

            @Override
            public void onCompleted() {
                observer.onCompleted();
                finishLatch.countDown();
            }
        });
    }

    /** Waits up to the given timeout for the call to finish. */
    public void await(long timeout, TimeUnit unit) throws InterruptedException {
        finishLatch.await(timeout, unit);
    }

    /** Waits until the call finishes. */
    public void await() throws InterruptedException {
        finishLatch.await();
    }

    /** Sends the decoder configuration; call once before sending audio. */
    public void setDecoderConfig(DecoderConfig config) {
        decoder.onNext(DecoderRequest.newBuilder().setStreamingConfig(config).build());
    }

    /** Sends the first {@code size} bytes of {@code buff} as audio content. */
    public void send(byte[] buff, int size) {
        decoder.onNext(DecoderRequest.newBuilder().setAudioContent(ByteString.copyFrom(buff, 0, size)).build());
    }

    /** Signals that no more requests will be sent. */
    public void closeSend() {
        decoder.onCompleted();
    }

    /** Returns true once the response stream has completed or failed. */
    private boolean isFinished() {
        return finishLatch.getCount() == 0;
    }

    private static void log(Level level, String msg, Object... args) {
        logger.log(level, msg, args);
    }

    private static void log(Level level, String msg, Throwable t) {
        logger.log(level, msg, t);
    }

    /**
     * Requests an access token from the authenticate endpoint.
     *
     * @return the access token
     * @throws IOException if the request fails, returns a non-success status, or
     *                     the response has no access_token
     */
    public static String getAccessToken() throws IOException {
        OkHttpClient client = new OkHttpClient();
        RequestBody formBody = new FormBody.Builder()
                .add("client_id", "{YOUR_CLIENT_ID}")
                .add("client_secret", "{YOUR_CLIENT_SECRET}")
                .build();
        Request request = new Request.Builder()
                .url("https://openapi.vito.ai/v1/authenticate")
                .post(formBody)
                .build();
        // The response holds a connection until it is closed.
        try (Response response = client.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                throw new IOException("Failed to authenticate: HTTP " + response.code());
            }
            ObjectMapper objectMapper = new ObjectMapper();
            HashMap<?, ?> map = objectMapper.readValue(response.body().string(), HashMap.class);
            Object token = map.get("access_token");
            if (!(token instanceof String)) {
                throw new IOException("Authenticate response contains no access_token");
            }
            return (String) token;
        }
    }

    /**
     * Streams an audio file and prints the recognition results.
     *
     * @param args optional: path to the audio file (defaults to {@code sample.wav})
     */
    public static void main(String[] args) throws Exception {
        final String audioPath = args.length > 0 ? args[0] : "sample.wav";

        ManagedChannel channel = ManagedChannelBuilder.forTarget("grpc-openapi.vito.ai:443")
                .useTransportSecurity()
                .build();
        try {
            String token = getAccessToken();

            RTZRSttGrpcClient client = new RTZRSttGrpcClient(channel, token, new StreamObserver<DecoderResponse>() {
                @Override
                public void onNext(DecoderResponse value) {
                    for (StreamingRecognitionResult result : value.getResultsList()) {
                        if (result.getAlternativesCount() == 0) {
                            // Nothing to print for this result.
                            continue;
                        }
                        SpeechRecognitionAlternative best = result.getAlternatives(0);
                        if (result.getIsFinal()) {
                            System.out.printf("final:%6d,%6d: %s\n", result.getStartAt(), result.getDuration(),
                                    best.getText());
                        } else {
                            // Print verbatim: recognized text must not be used as a
                            // format string, since a '%' in it would throw.
                            System.out.println(best.getText());
                        }
                    }
                }

                @Override
                public void onError(Throwable t) {
                    log(Level.WARNING, "on error", t);
                }

                @Override
                public void onCompleted() {
                    log(Level.INFO, "Complete");
                }
            });

            FileStreamer fileStreamer = new FileStreamer(audioPath);
            try {
                DecoderConfig config = DecoderConfig.newBuilder().setSampleRate(8000)
                        .setEncoding(DecoderConfig.AudioEncoding.LINEAR16).setUseItn(true)
                        .setUseDisfluencyFilter(true)
                        .setUseProfanityFilter(true).build();

                client.setDecoderConfig(config);
                byte[] buffer = new byte[1024];
                int readBytes;
                // Stop early when the call has already ended (for example on a
                // server error) instead of pacing through the rest of the file.
                while (!client.isFinished() && (readBytes = fileStreamer.read(buffer)) != -1) {
                    client.send(buffer, readBytes);
                }
            } finally {
                fileStreamer.close();
            }
            client.closeSend();
            client.await();
        } finally {
            // Release the channel's resources on every exit path.
            channel.shutdownNow();
        }
    }
}

오류 코드

스트리밍 STT - gRPC의 오류 처리는 grpc error code를 이용하여 처리합니다.

Code	Description	Notes
16	Unauthenticated	인증 실패
3	InvalidArgument	잘못된 파라미터 요청
8	ResourceExhausted	사용량 초과 또는 카드 등록 필요
13	Internal	서버 오류

참고사항

오디오 파일을 텍스트로 변환할 경우, 스트리밍 STT API를 이용하여 처리할 수도 있지만 일반 STT 가이드 문서에서 기술된 것처럼 일반 STT API로 변환 작업을 수행하는 것이 더 편리합니다.

연동 예제

인증 토큰 발급

DecoderConfig

StreamingRecognitionResult

샘플 코드

오류 코드

참고사항