Batch STT
The Batch STT API is an HTTP REST API that converts audio files to text.
Supported formats
tip
Batch STT supports mp4, m4a, mp3, amr, flac, wav.
Authentication token
Obtain a token via the Authentication guide before using Batch STT.
API list
Method | URL | Description |
---|---|---|
POST | /v1/transcribe | Submit a job |
GET | /v1/transcribe/{TRANSCRIBE_ID} | Get job result |
1) [POST] /v1/transcribe
Submits a transcription job for a stored audio file.
HTTP request
POST https://openapi.vito.ai/v1/transcribe
Request headers
Authorization: Bearer {YOUR_JWT_TOKEN}
- scheme: bearer
- bearerFormat: JWT
Request body
content-type: multipart/form-data
Field | Type | Required |
---|---|---|
config | RequestConfig | required |
file | Binary | required |
RequestConfig
Name | Desc | Type | Required | Value | Default |
---|---|---|---|---|---|
model_name | Recognition model | string | optional | sommers, whisper | sommers |
language | Recognition language, whisper-only | string | optional | ko, detect, multi | ko |
language_candidates | Language detection candidates | array | optional | ["ko","ja","zh","en"] | |
use_diarization | Speaker diarization | boolean | optional | false | |
diarization.spk_count | Number of speakers, effective when use_diarization is true | integer | optional | >= 0 | 0 (auto) |
use_itn | English/Number/Unit normalization | boolean | optional | true | |
use_disfluency_filter | Disfluency filter | boolean | optional | true | |
use_profanity_filter | Profanity filter | boolean | optional | false | |
use_paragraph_splitter | Paragraph splitter | boolean | optional | true | |
paragraph_splitter.max | Max characters per paragraph, effective when use_paragraph_splitter is true | integer | optional | >= 1 | 50 |
domain | Domain | string | optional | GENERAL, CALL | GENERAL |
use_word_timestamp | Word-level timestamps | boolean | optional | false | |
keywords | Keyword boosting | array | optional | | |
caution
- POST concurrency: number of in-flight jobs follows the Rate limit policy. Completion is determined by the GET API.
- Max file size: 2GB; Max duration: 4 hours.
- Jobs are processed in order. For long files or during busy periods, processing may start 30 minutes or more after submission.
Sample code 1
- cURL
- Python
- Java
transcribe.sh
# Submit sample.wav as a transcription job with an empty config.
# Expects the shell variable YOUR_JWT_TOKEN to hold the access token.
curl --request "POST" \
  --header "accept: application/json" \
  --header "Authorization: Bearer ${YOUR_JWT_TOKEN}" \
  --header "Content-Type: multipart/form-data" \
  --form "file=@sample.wav" \
  --form 'config={}' \
  "https://openapi.vito.ai/v1/transcribe"
transcribe.py
import json
import requests
import os
# Read the JWT token from the environment so it is never hard-coded.
jwt_token = os.getenv("YOUR_JWT_TOKEN")
if not jwt_token:
    raise ValueError("Environment variable 'YOUR_JWT_TOKEN' is not set")

# An empty config leaves every RequestConfig option unset.
config = {}

# Open the audio inside a context manager so the file handle is closed even
# when the request raises.
with open("sample.wav", "rb") as audio_file:
    resp = requests.post(
        "https://openapi.vito.ai/v1/transcribe",
        headers={"Authorization": f"Bearer {jwt_token}"},
        data={"config": json.dumps(config)},
        files={"file": audio_file},
    )

# Turn 4xx/5xx responses into exceptions before reading the body.
resp.raise_for_status()
print(resp.json())
transcribe.java
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;
/**
 * Submits {@code sample.wav} to the Batch STT API with an empty config and
 * prints the raw response body.
 */
public class PostTranscribeSample {
    /**
     * Sends a multipart/form-data POST to /v1/transcribe and prints the response.
     *
     * @param args unused
     * @throws Exception if the audio file cannot be read or the HTTP exchange fails
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://openapi.vito.ai/v1/transcribe");
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        httpConn.setRequestMethod("POST");
        httpConn.setRequestProperty("accept", "application/json");
        httpConn.setRequestProperty("Authorization", "Bearer "+ "{YOUR_JWT_TOKEN}");
        httpConn.setRequestProperty("Content-Type", "multipart/form-data;boundary=authsample");
        httpConn.setDoOutput(true);
        // NOTE(review): without a streaming mode HttpURLConnection may buffer the
        // whole request body in memory; consider setFixedLengthStreamingMode for
        // very large files.

        File file = new File("sample.wav");
        // guessContentTypeFromName returns null for extensions it does not know;
        // fall back to a generic type instead of sending "Content-Type: null".
        String fileContentType = URLConnection.guessContentTypeFromName(file.getName());
        if (fileContentType == null) {
            fileContentType = "application/octet-stream";
        }

        try (DataOutputStream outputStream = new DataOutputStream(httpConn.getOutputStream());
             FileInputStream in = new FileInputStream(file)) {
            // File part
            outputStream.writeBytes("--authsample\r\n");
            outputStream.writeBytes("Content-Disposition: form-data; name=\"file\"; filename=\"" + file.getName() + "\"\r\n");
            outputStream.writeBytes("Content-Type: " + fileContentType + "\r\n");
            outputStream.writeBytes("Content-Transfer-Encoding: binary\r\n\r\n");
            byte[] buffer = new byte[8192];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                outputStream.write(buffer, 0, bytesRead);
            }
            outputStream.writeBytes("\r\n");

            // Config part
            outputStream.writeBytes("--authsample\r\n");
            outputStream.writeBytes("Content-Disposition: form-data; name=\"config\"\r\n");
            outputStream.writeBytes("Content-Type: application/json\r\n\r\n");
            outputStream.writeBytes("{}\r\n");

            // End boundary
            outputStream.writeBytes("--authsample--\r\n");
            outputStream.flush();
        }

        InputStream responseStream = httpConn.getResponseCode() / 100 == 2
                ? httpConn.getInputStream()
                : httpConn.getErrorStream();
        String response = "";
        // getErrorStream() returns null when the server sent no error body.
        if (responseStream != null) {
            // Decode as UTF-8 explicitly instead of the platform default charset.
            try (Scanner s = new Scanner(responseStream, "UTF-8").useDelimiter("\\A")) {
                response = s.hasNext() ? s.next() : "";
            }
        }
        System.out.println(response);
    }
}
Sample code 2
- cURL
- Python
- Java
transcribe.sh
# Submit sample.wav with diarization (2 speakers) and paragraph splitting enabled.
# Expects the shell variable YOUR_JWT_TOKEN to hold the access token.
curl --request "POST" \
  --header "accept: application/json" \
  --header "Authorization: Bearer ${YOUR_JWT_TOKEN}" \
  --header "Content-Type: multipart/form-data" \
  --form "file=@sample.wav" \
  --form 'config={
"use_diarization": true,
"diarization": {
"spk_count": 2
},
"use_itn": false,
"use_disfluency_filter": false,
"use_profanity_filter": false,
"use_paragraph_splitter": true,
"paragraph_splitter": {
"max": 50
}
}' \
  "https://openapi.vito.ai/v1/transcribe"
transcribe.py
import json
import requests
import os
# Read the JWT token from the environment so it is never hard-coded.
jwt_token = os.getenv("YOUR_JWT_TOKEN")
if not jwt_token:
    raise ValueError("Environment variable 'YOUR_JWT_TOKEN' is not set")

# Request options: two-speaker diarization and 50-character paragraphs, with
# ITN, disfluency and profanity filtering turned off.
config = {
    "use_diarization": True,
    "diarization": {"spk_count": 2},
    "use_itn": False,
    "use_disfluency_filter": False,
    "use_profanity_filter": False,
    "use_paragraph_splitter": True,
    "paragraph_splitter": {"max": 50},
}

# Open the audio inside a context manager so the file handle is closed even
# when the request raises.
with open("sample.wav", "rb") as audio_file:
    resp = requests.post(
        "https://openapi.vito.ai/v1/transcribe",
        headers={"Authorization": f"Bearer {jwt_token}"},
        data={"config": json.dumps(config)},
        files={"file": audio_file},
    )

# Turn 4xx/5xx responses into exceptions before reading the body.
resp.raise_for_status()
print(resp.json())
transcribe.java
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;
/**
 * Submits {@code sample.wav} to the Batch STT API with diarization and
 * paragraph-splitting options and prints the raw response body.
 */
public class PostTranscribeSample {
    /**
     * Sends a multipart/form-data POST to /v1/transcribe and prints the response.
     *
     * @param args unused
     * @throws Exception if the audio file cannot be read or the HTTP exchange fails
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://openapi.vito.ai/v1/transcribe");
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        httpConn.setRequestMethod("POST");
        httpConn.setRequestProperty("accept", "application/json");
        httpConn.setRequestProperty("Authorization", "Bearer "+ "{YOUR_JWT_TOKEN}");
        httpConn.setRequestProperty("Content-Type", "multipart/form-data;boundary=authsample");
        httpConn.setDoOutput(true);
        // NOTE(review): without a streaming mode HttpURLConnection may buffer the
        // whole request body in memory; consider setFixedLengthStreamingMode for
        // very large files.

        File file = new File("sample.wav");
        // guessContentTypeFromName returns null for extensions it does not know;
        // fall back to a generic type instead of sending "Content-Type: null".
        String fileContentType = URLConnection.guessContentTypeFromName(file.getName());
        if (fileContentType == null) {
            fileContentType = "application/octet-stream";
        }

        try (DataOutputStream outputStream = new DataOutputStream(httpConn.getOutputStream());
             FileInputStream in = new FileInputStream(file)) {
            // File part
            outputStream.writeBytes("--authsample\r\n");
            outputStream.writeBytes("Content-Disposition: form-data; name=\"file\"; filename=\"" + file.getName() + "\"\r\n");
            outputStream.writeBytes("Content-Type: " + fileContentType + "\r\n");
            outputStream.writeBytes("Content-Transfer-Encoding: binary\r\n\r\n");
            byte[] buffer = new byte[8192];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                outputStream.write(buffer, 0, bytesRead);
            }
            outputStream.writeBytes("\r\n");

            // Config part
            outputStream.writeBytes("--authsample\r\n");
            outputStream.writeBytes("Content-Disposition: form-data; name=\"config\"\r\n");
            outputStream.writeBytes("Content-Type: application/json\r\n\r\n");
            outputStream.writeBytes(
                "{\n" +
                " \"use_diarization\": true,\n" +
                " \"diarization\": {\"spk_count\": 2},\n" +
                " \"use_itn\": false,\n" +
                " \"use_disfluency_filter\": false,\n" +
                " \"use_profanity_filter\": false,\n" +
                " \"use_paragraph_splitter\": true,\n" +
                " \"paragraph_splitter\": {\"max\": 50}\n" +
                "}\r\n"
            );

            // End boundary
            outputStream.writeBytes("--authsample--\r\n");
            outputStream.flush();
        }

        InputStream responseStream = httpConn.getResponseCode() / 100 == 2
                ? httpConn.getInputStream()
                : httpConn.getErrorStream();
        String response = "";
        // getErrorStream() returns null when the server sent no error body.
        if (responseStream != null) {
            // Decode as UTF-8 explicitly instead of the platform default charset.
            try (Scanner s = new Scanner(responseStream, "UTF-8").useDelimiter("\\A")) {
                response = s.hasNext() ? s.next() : "";
            }
        }
        System.out.println(response);
    }
}
Response body
On success:
{
"id": "{TRANSCRIBE_ID}"
}
Error codes
HTTP Status | Code | Notes |
---|---|---|
400 | H0001 | Invalid parameter |
400 | H0010 | Unsupported file type |
401 | H0002 | Invalid token |
413 | H0005 | File size exceeded |
413 | H0006 | File length exceeded |
429 | A0001 | Usage exceeded |
429 | A0002 | Concurrency exceeded |
500 | E500 | Server error |
Example failure:
{
"code": "H0001",
"msg": "unexpected end of JSON input"
}
2) [GET] /v1/transcribe/{TRANSCRIBE_ID}
- Fetch transcription results using the `TRANSCRIBE_ID` returned by the POST API.
HTTP request
GET https://openapi.vito.ai/v1/transcribe/{TRANSCRIBE_ID}
Request headers
Authorization: Bearer {YOUR_JWT_TOKEN}
- scheme: bearer
- bearerFormat: JWT
Sample code
- cURL
- Python
- Java
get_transcript.sh
# Fetch the status/result of the job identified by TRANSCRIBE_ID.
# Expects the shell variables TRANSCRIBE_ID and YOUR_JWT_TOKEN to be set.
curl --request "GET" \
  --header "accept: application/json" \
  --header "Authorization: Bearer ${YOUR_JWT_TOKEN}" \
  "https://openapi.vito.ai/v1/transcribe/${TRANSCRIBE_ID}"
get_transcript.py
import requests
# Fetch the status/result of a submitted job. Replace the {TRANSCRIBE_ID} and
# {YOUR_JWT_TOKEN} placeholders before running.
resp = requests.get(
    "https://openapi.vito.ai/v1/transcribe/" + "{TRANSCRIBE_ID}",
    # Scheme spelled "Bearer" to match the documented Authorization header.
    headers={"Authorization": "Bearer " + "{YOUR_JWT_TOKEN}"},
)

# Turn 4xx/5xx responses into exceptions before reading the body.
resp.raise_for_status()
print(resp.json())
get_transcript.java
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Scanner;
/**
 * Fetches the status/result of a transcription job and prints the raw
 * response body.
 */
public class GetTranscribeSample {
    /**
     * Sends a GET to /v1/transcribe/{TRANSCRIBE_ID} and prints the response.
     *
     * @param args unused
     * @throws Exception if the HTTP exchange fails
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://openapi.vito.ai/v1/transcribe/"+"{TRANSCRIBE_ID}");
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        httpConn.setRequestMethod("GET");
        httpConn.setRequestProperty("accept", "application/json");
        httpConn.setRequestProperty("Authorization", "Bearer {YOUR_JWT_TOKEN}");

        InputStream responseStream = httpConn.getResponseCode() / 100 == 2
                ? httpConn.getInputStream()
                : httpConn.getErrorStream();
        String response = "";
        // getErrorStream() returns null when the server sent no error body.
        if (responseStream != null) {
            // Decode as UTF-8 explicitly; the platform default charset can garble
            // non-ASCII transcript text.
            try (Scanner s = new Scanner(responseStream, "UTF-8").useDelimiter("\\A")) {
                response = s.hasNext() ? s.next() : "";
            }
        }
        System.out.println(response);
    }
}
Response body
On success (selected fields):
Name | Desc | Type | Value |
---|---|---|---|
id | transcribe id | string | |
status | status of the job | string | transcribing, completed, failed |
results.utterances | utterance list | array | |
results.utterances.start_at | utterance start time (ms) | integer | |
results.utterances.duration | utterance duration (ms) | integer | |
results.utterances.msg | utterance text | string | |
results.utterances.spk | speaker/channel id | integer | |
results.utterances.lang | language value or detected language for detect/multi | string | ISO 639-1 |
tip
Batch STT uses polling for long files. While status is `transcribing`, poll about every 5 seconds until it becomes `completed` or `failed`. Polling at shorter intervals may result in a 429 response.
status: transcribing
{
"id": "{TRANSCRIBE_ID}",
"status": "transcribing"
}
status: completed
{
"id": "{TRANSCRIBE_ID}",
"status": "completed",
"results": {
"utterances": [
{
"start_at": 4737,
"duration": 2360,
"msg": "Hello.",
"spk": 0,
"lang": "en"
}
]
}
}
status: failed
{
"id": "{TRANSCRIBE_ID}",
"status": "failed",
"error": {
"code": "{ERROR_CODE}",
"message": "{MESSAGE}"
}
}
Example:
{
"id": "ZbOOQftrS1ywK_T3ikuveA",
"status": "failed",
"error": {
"code": "E500",
"message": "internal server error"
}
}
Error codes
HTTP Status | Code | Notes |
---|---|---|
400 | H0001 | Invalid parameter |
401 | H0002 | Invalid token |
403 | H0003 | Forbidden |
404 | H0004 | Not found |
410 | H0007 | Result expired |
429 | A0003 | Rate limited |
500 | E500 | Server error |
Example failure:
{
"code": "H0004",
"msg": "not found"
}
Unified example
In the example script below, you can select the desired settings with the `PRESET` environment variable. The default is `sommers_basic`.
- Python (recommended)
transcribe.py
import json
import os
import time
from typing import Any, Dict, Optional
import requests
class RTZROpenAPIClient:
    """Minimal client for RTZR OpenAPI (auth + STT file).

    - Fetches JWT via /v1/authenticate using client_id/client_secret
    - Submits a file transcription job via /v1/transcribe
    - Polls /v1/transcribe/{id} every few seconds until completed/failed
    """

    # Seconds before expiry at which the cached token is proactively renewed.
    _TOKEN_RENEW_MARGIN_SEC = 1800

    def __init__(
        self,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
        base_url: str = "https://openapi.vito.ai",
    ) -> None:
        """Store credentials and create the HTTP session.

        Args:
            client_id: Client id; falls back to the RTZR_CLIENT_ID environment
                variable when not given.
            client_secret: Client secret; falls back to RTZR_CLIENT_SECRET.
            base_url: API root. A trailing slash is stripped.

        Raises:
            ValueError: If either credential is missing.
        """
        self.base_url = base_url.rstrip("/")
        self.client_id = client_id or os.getenv("RTZR_CLIENT_ID")
        self.client_secret = client_secret or os.getenv("RTZR_CLIENT_SECRET")
        if not self.client_id or not self.client_secret:
            raise ValueError(
                "Missing credentials. Set RTZR_CLIENT_ID and RTZR_CLIENT_SECRET "
                "environment variables, or pass client_id/client_secret to RTZROpenAPIClient."
            )
        self._sess = requests.Session()
        # Last /v1/authenticate response; None until the first token request.
        self._token: Optional[Dict[str, Any]] = None

    @property
    def token(self) -> str:
        """Return an access token, authenticating first when needed.

        The cached token is renewed when it is missing or when its
        ``expire_at`` falls within the next 30 minutes. ``expire_at`` is
        compared against ``time.time()``, i.e. treated as a Unix timestamp
        in seconds.

        Raises:
            requests.HTTPError: If /v1/authenticate returns an error status.
            RuntimeError: If the response carries no ``access_token``.
        """
        # Renew if missing or expiring within 30 minutes. The margin is added
        # to "now": subtracting it would keep using a token until 30 minutes
        # after it had already expired.
        renew_threshold = time.time() + self._TOKEN_RENEW_MARGIN_SEC
        if self._token is None or self._token.get("expire_at", 0) < renew_threshold:
            resp = self._sess.post(
                f"{self.base_url}/v1/authenticate",
                data={"client_id": self.client_id, "client_secret": self.client_secret},
            )
            resp.raise_for_status()
            self._token = resp.json()
        access = self._token.get("access_token")
        if not access:
            raise RuntimeError("authenticate: 'access_token' not found in response")
        return access

    def _auth_headers(self) -> Dict[str, str]:
        """Build the Authorization header from the current token."""
        return {"Authorization": f"Bearer {self.token}"}

    def transcribe_file(self, file_path: str, config: Dict[str, Any]) -> Dict[str, Any]:
        """Submit an audio file as a transcription job.

        Args:
            file_path: Path of the audio file to upload.
            config: Request options, sent JSON-encoded in the ``config`` field.

        Returns:
            The decoded JSON response of POST /v1/transcribe.

        Raises:
            requests.HTTPError: If the API returns an error status.
        """
        url = f"{self.base_url}/v1/transcribe"
        # The file stays open only for the duration of the upload.
        with open(file_path, "rb") as f:
            files = {"file": (os.path.basename(file_path), f)}
            data = {"config": json.dumps(config)}
            resp = self._sess.post(url, headers=self._auth_headers(), files=files, data=data)
        resp.raise_for_status()
        return resp.json()

    def get_transcription(self, transcribe_id: str) -> Dict[str, Any]:
        """Fetch the current state of a transcription job.

        Args:
            transcribe_id: Job id appended to /v1/transcribe/.

        Returns:
            The decoded JSON response.

        Raises:
            requests.HTTPError: If the API returns an error status.
        """
        url = f"{self.base_url}/v1/transcribe/{transcribe_id}"
        resp = self._sess.get(url, headers=self._auth_headers())
        resp.raise_for_status()
        return resp.json()

    def wait_for_result(
        self,
        transcribe_id: str,
        poll_interval_sec: int = 5,
        timeout_sec: int = 3600,
    ) -> Dict[str, Any]:
        """Poll a job until its status is ``completed`` or ``failed``.

        Args:
            transcribe_id: Job id to poll.
            poll_interval_sec: Seconds to sleep between polls.
            timeout_sec: Overall time budget in seconds.

        Returns:
            The first response whose status is ``completed`` or ``failed``.

        Raises:
            TimeoutError: If the deadline passes before a final status is seen.
        """
        deadline = time.time() + timeout_sec
        while True:
            if time.time() > deadline:
                raise TimeoutError("Timed out waiting for transcription result")
            result = self.get_transcription(transcribe_id)
            status = result.get("status")
            if status in ("completed", "failed"):
                return result
            time.sleep(poll_interval_sec)
# Preset configurations
# Named request configurations; main() picks one via the PRESET environment variable.
PRESETS: Dict[str, Dict[str, Any]] = {
    # 1) sommers without diarization
    "sommers_basic": {"model_name": "sommers", "use_diarization": False, "domain": "GENERAL"},
    # 2) sommers + diarization + CALL domain, two speakers
    "sommers_call_diarization": {
        "model_name": "sommers",
        "domain": "CALL",
        "use_diarization": True,
        "diarization": {"spk_count": 2},
    },
    # 3) whisper + diarization, language fixed to English
    "whisper_en_diarization": {"model_name": "whisper", "language": "en", "use_diarization": True},
    # Single-option presets for commonly requested features
    "paragraph_split_80": {
        "use_paragraph_splitter": True,
        "paragraph_splitter": {"max": 80},
    },
    "keywords_example": {
        "keywords": ["stt", "returnzero", "api"],
    },
    "with_word_timestamps": {
        "use_word_timestamp": True,
    },
    "disfluency_on": {
        "use_disfluency_filter": True,
    },
    "profanity_on": {
        "use_profanity_filter": True,
    },
    # whisper with multi-language recognition over a candidate list
    "whisper_detect_multi": {
        "model_name": "whisper",
        "language": "multi",
        "language_candidates": ["ko", "en", "ja"],
    },
}
def main() -> None:
    """Submit an audio file with the selected preset and print the final result.

    Reads AUDIO_PATH (default "sample.wav") and PRESET (default
    "sommers_basic") from the environment, submits the job, polls every
    5 seconds until it is completed or failed, and prints the result as JSON.

    Raises:
        ValueError: If PRESET does not name an entry in PRESETS.
        RuntimeError: If the submit response carries no job id.
    """
    audio_path = os.getenv("AUDIO_PATH", "sample.wav")
    preset_name = os.getenv("PRESET", "sommers_basic")
    if preset_name not in PRESETS:
        raise ValueError(f"Unknown PRESET '{preset_name}'. Available: {sorted(PRESETS.keys())}")
    config = PRESETS[preset_name]

    client = RTZROpenAPIClient()
    submit = client.transcribe_file(audio_path, config)
    transcribe_id = submit.get("id")
    if not transcribe_id:
        # Fail here rather than polling /v1/transcribe/None.
        raise RuntimeError(f"transcribe: 'id' not found in response: {submit}")

    result = client.wait_for_result(transcribe_id, poll_interval_sec=5)
    # ensure_ascii=False keeps non-ASCII transcript text readable in the output.
    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
- sommers_basic: model_name=sommers, use_diarization=false, domain=GENERAL
- sommers_call_diarization: model_name=sommers, domain=CALL, use_diarization=true, diarization.spk_count=2
- whisper_en_diarization: model_name=whisper, language=en, use_diarization=true
- paragraph_split_80: use_paragraph_splitter=true, paragraph_splitter.max=80
- keywords_example: keywords=["stt","returnzero","api"]
- with_word_timestamps: use_word_timestamp=true
- disfluency_on: use_disfluency_filter=true
- profanity_on: use_profanity_filter=true
- whisper_detect_multi: model_name=whisper, language=multi, language_candidates=["ko","en","ja"]