バッチ STT
バッチ STT API は、音声ファイルをテキストに変換する HTTP ベースの REST API です。
対応フォーマット
ヒント
mp4, m4a, mp3, amr, flac, wav に対応しています。
認証トークン
認証ガイドに従ってトークンを取得してからご利用ください。
API 一覧
Method | URL | 説明 |
---|---|---|
POST | /v1/transcribe | 変換ジョブの作成 |
GET | /v1/transcribe/{TRANSCRIBE_ID} | 変換結果の取得 |
1) [POST] /v1/transcribe
保存済みの音声ファイルに対する変換を要求します。
HTTP リクエスト
POST https://openapi.vito.ai/v1/transcribe
リクエストヘッダー
Authorization: Bearer {YOUR_JWT_TOKEN}
- scheme: bearer
- bearerFormat: JWT
リクエストボディ
content-type: multipart/form-data
Field | Type | Required |
---|---|---|
config | RequestConfig | required |
file | Binary | required |
RequestConfig
Name | 説明 | Type | Required | 値 | 既定値 |
---|---|---|---|---|---|
model_name | 音声認識モデル | string | optional | sommers, whisper | sommers |
language | 言語、whisper 専用 | string | optional | ko, detect, multi | ko |
language_candidates | 言語検出候補 | array | optional | ["ko","ja","zh","en"] | |
use_diarization | 話者分離 | boolean | optional | false | |
diarization.spk_count | 話者数(use_diarization が true のとき) | integer | optional | 0 以上 | 0 (自動) |
use_itn | 英字/数字/単位の正規化 | boolean | optional | true | |
use_disfluency_filter | フィラーワード除去 | boolean | optional | true | |
use_profanity_filter | 不適切語フィルタ | boolean | optional | false | |
use_paragraph_splitter | 段落分割 | boolean | optional | true | |
paragraph_splitter.max | 1 段落の最大文字数(use_paragraph_splitter が true のとき) | integer | optional | 1 以上 | 50 |
domain | ドメイン | string | optional | GENERAL, CALL | GENERAL |
use_word_timestamp | 単語タイムスタンプ | boolean | optional | false | |
keywords | キーワードブースト | array | optional | | |
注意
サンプル 1
- cURL
- Python
- Java
transcribe.sh
# Submit sample.wav to POST /v1/transcribe as multipart/form-data with an
# empty JSON config. The bearer token is taken from the YOUR_JWT_TOKEN shell
# variable, which must be set before running.
curl -X "POST" \
"https://openapi.vito.ai/v1/transcribe" \
-H "accept: application/json" \
-H "Authorization: Bearer ${YOUR_JWT_TOKEN}" \
-H "Content-Type: multipart/form-data" \
-F "file=@sample.wav" \
-F 'config={}'
transcribe.py
import json
import requests
import os
# Read the JWT from the environment so it is never hard-coded in the script.
jwt_token = os.getenv("YOUR_JWT_TOKEN")
if not jwt_token:
    raise ValueError("Environment variable 'YOUR_JWT_TOKEN' is not set")

# An empty config object is sent as the "config" form field.
config = {}

# Open the audio file in a context manager so the handle is closed even when
# the request raises.
with open("sample.wav", "rb") as audio_file:
    resp = requests.post(
        "https://openapi.vito.ai/v1/transcribe",
        headers={"Authorization": f"Bearer {jwt_token}"},
        data={"config": json.dumps(config)},
        files={"file": audio_file},
    )

# Turn any 4xx/5xx answer into an exception instead of printing an error body.
resp.raise_for_status()
print(resp.json())
transcribe.java
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;
/**
 * Sample client for POST /v1/transcribe.
 *
 * Uploads sample.wav together with an empty JSON config as a hand-built
 * multipart/form-data request and prints the raw response body.
 * Replace the {YOUR_JWT_TOKEN} placeholder with a real token before running.
 */
public class PostTranscribeSample {
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://openapi.vito.ai/v1/transcribe");
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        httpConn.setRequestMethod("POST");
        httpConn.setRequestProperty("accept", "application/json");
        httpConn.setRequestProperty("Authorization", "Bearer " + "{YOUR_JWT_TOKEN}");
        // The boundary declared here must match the "--authsample" delimiters written below.
        httpConn.setRequestProperty("Content-Type", "multipart/form-data;boundary=authsample");
        httpConn.setDoOutput(true);

        File file = new File("sample.wav");
        // guessContentTypeFromName() returns null when it cannot guess a type; fall back to a
        // generic binary type instead of sending the literal header "Content-Type: null".
        String fileContentType = URLConnection.guessContentTypeFromName(file.getName());
        if (fileContentType == null) {
            fileContentType = "application/octet-stream";
        }

        try (DataOutputStream outputStream = new DataOutputStream(httpConn.getOutputStream());
             FileInputStream in = new FileInputStream(file)) {
            // File part
            outputStream.writeBytes("--authsample\r\n");
            outputStream.writeBytes("Content-Disposition: form-data; name=\"file\"; filename=\"" + file.getName() + "\"\r\n");
            outputStream.writeBytes("Content-Type: " + fileContentType + "\r\n");
            outputStream.writeBytes("Content-Transfer-Encoding: binary\r\n\r\n");
            byte[] buffer = new byte[8192];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                outputStream.write(buffer, 0, bytesRead);
            }
            outputStream.writeBytes("\r\n");

            // Config part
            outputStream.writeBytes("--authsample\r\n");
            outputStream.writeBytes("Content-Disposition: form-data; name=\"config\"\r\n");
            outputStream.writeBytes("Content-Type: application/json\r\n\r\n");
            outputStream.writeBytes("{}\r\n");

            // End boundary
            outputStream.writeBytes("--authsample--\r\n");
            outputStream.flush();
        }

        // Read the success body for 2xx responses, otherwise the error body.
        InputStream responseStream = httpConn.getResponseCode() / 100 == 2
                ? httpConn.getInputStream()
                : httpConn.getErrorStream();

        String response = "";
        // getErrorStream() returns null when the server sent no error body.
        if (responseStream != null) {
            // Decode explicitly as UTF-8 (assumed encoding of the JSON response) rather than
            // relying on the platform default charset; try-with-resources closes the stream.
            try (Scanner s = new Scanner(responseStream, "UTF-8").useDelimiter("\\A")) {
                response = s.hasNext() ? s.next() : "";
            }
        }
        System.out.println(response);
    }
}
サンプル 2
- cURL
- Python
- Java
transcribe.sh
# Submit sample.wav to POST /v1/transcribe with an explicit JSON config:
# diarization on with spk_count 2, ITN / disfluency / profanity filters off,
# paragraph splitting on with max 50. The bearer token is taken from the
# YOUR_JWT_TOKEN shell variable, which must be set before running.
curl -X "POST" \
"https://openapi.vito.ai/v1/transcribe" \
-H "accept: application/json" \
-H "Authorization: Bearer ${YOUR_JWT_TOKEN}" \
-H "Content-Type: multipart/form-data" \
-F "file=@sample.wav" \
-F 'config={
"use_diarization": true,
"diarization": {
"spk_count": 2
},
"use_itn": false,
"use_disfluency_filter": false,
"use_profanity_filter": false,
"use_paragraph_splitter": true,
"paragraph_splitter": {
"max": 50
}
}'
transcribe.py
import json
import requests
import os
# Read the JWT from the environment so it is never hard-coded in the script.
jwt_token = os.getenv("YOUR_JWT_TOKEN")
if not jwt_token:
    raise ValueError("Environment variable 'YOUR_JWT_TOKEN' is not set")

# Explicit request options; serialized to JSON and sent as the "config" field.
config = {
    "use_diarization": True,
    "diarization": {"spk_count": 2},
    "use_itn": False,
    "use_disfluency_filter": False,
    "use_profanity_filter": False,
    "use_paragraph_splitter": True,
    "paragraph_splitter": {"max": 50},
}

# Open the audio file in a context manager so the handle is closed even when
# the request raises.
with open("sample.wav", "rb") as audio_file:
    resp = requests.post(
        "https://openapi.vito.ai/v1/transcribe",
        headers={"Authorization": f"Bearer {jwt_token}"},
        data={"config": json.dumps(config)},
        files={"file": audio_file},
    )

# Turn any 4xx/5xx answer into an exception instead of printing an error body.
resp.raise_for_status()
print(resp.json())
transcribe.java
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;
/**
 * Sample client for POST /v1/transcribe with an explicit config.
 *
 * Uploads sample.wav together with a JSON config (diarization, filters and
 * paragraph splitting options) as a hand-built multipart/form-data request
 * and prints the raw response body.
 * Replace the {YOUR_JWT_TOKEN} placeholder with a real token before running.
 */
public class PostTranscribeSample {
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://openapi.vito.ai/v1/transcribe");
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        httpConn.setRequestMethod("POST");
        httpConn.setRequestProperty("accept", "application/json");
        httpConn.setRequestProperty("Authorization", "Bearer " + "{YOUR_JWT_TOKEN}");
        // The boundary declared here must match the "--authsample" delimiters written below.
        httpConn.setRequestProperty("Content-Type", "multipart/form-data;boundary=authsample");
        httpConn.setDoOutput(true);

        File file = new File("sample.wav");
        // guessContentTypeFromName() returns null when it cannot guess a type; fall back to a
        // generic binary type instead of sending the literal header "Content-Type: null".
        String fileContentType = URLConnection.guessContentTypeFromName(file.getName());
        if (fileContentType == null) {
            fileContentType = "application/octet-stream";
        }

        try (DataOutputStream outputStream = new DataOutputStream(httpConn.getOutputStream());
             FileInputStream in = new FileInputStream(file)) {
            // File part
            outputStream.writeBytes("--authsample\r\n");
            outputStream.writeBytes("Content-Disposition: form-data; name=\"file\"; filename=\"" + file.getName() + "\"\r\n");
            outputStream.writeBytes("Content-Type: " + fileContentType + "\r\n");
            outputStream.writeBytes("Content-Transfer-Encoding: binary\r\n\r\n");
            byte[] buffer = new byte[8192];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                outputStream.write(buffer, 0, bytesRead);
            }
            outputStream.writeBytes("\r\n");

            // Config part
            outputStream.writeBytes("--authsample\r\n");
            outputStream.writeBytes("Content-Disposition: form-data; name=\"config\"\r\n");
            outputStream.writeBytes("Content-Type: application/json\r\n\r\n");
            outputStream.writeBytes(
                "{\n" +
                " \"use_diarization\": true,\n" +
                " \"diarization\": {\"spk_count\": 2},\n" +
                " \"use_itn\": false,\n" +
                " \"use_disfluency_filter\": false,\n" +
                " \"use_profanity_filter\": false,\n" +
                " \"use_paragraph_splitter\": true,\n" +
                " \"paragraph_splitter\": {\"max\": 50}\n" +
                "}\r\n"
            );

            // End boundary
            outputStream.writeBytes("--authsample--\r\n");
            outputStream.flush();
        }

        // Read the success body for 2xx responses, otherwise the error body.
        InputStream responseStream = httpConn.getResponseCode() / 100 == 2
                ? httpConn.getInputStream()
                : httpConn.getErrorStream();

        String response = "";
        // getErrorStream() returns null when the server sent no error body.
        if (responseStream != null) {
            // Decode explicitly as UTF-8 (assumed encoding of the JSON response) rather than
            // relying on the platform default charset; try-with-resources closes the stream.
            try (Scanner s = new Scanner(responseStream, "UTF-8").useDelimiter("\\A")) {
                response = s.hasNext() ? s.next() : "";
            }
        }
        System.out.println(response);
    }
}
レスポンス
成功時:
{"id": "{TRANSCRIBE_ID}"}
エラー
HTTP Status | Code | Notes |
---|---|---|
400 | H0001 | パラメータ不正 |
400 | H0010 | 非対応フォーマット |
401 | H0002 | 無効トークン |
413 | H0005 | サイズ超過 |
413 | H0006 | 長さ超過 |
429 | A0001 | 使用量超過 |
429 | A0002 | 同時処理超過 |
500 | E500 | サーバーエラー |
失敗例:
{"code":"H0001","msg":"unexpected end of JSON input"}
2) [GET] /v1/transcribe/{TRANSCRIBE_ID}
- POST で返された TRANSCRIBE_ID に対する結果を取得します。
HTTP リクエスト
GET https://openapi.vito.ai/v1/transcribe/{TRANSCRIBE_ID}
リクエストヘッダー
Authorization: Bearer {YOUR_JWT_TOKEN}
サンプル
- cURL
- Python
- Java
get_transcript.sh
# Fetch the result of a transcription job via GET /v1/transcribe/{TRANSCRIBE_ID}.
# Both TRANSCRIBE_ID and YOUR_JWT_TOKEN are read from shell variables and must
# be set before running.
curl -X "GET" \
"https://openapi.vito.ai/v1/transcribe/${TRANSCRIBE_ID}" \
-H "accept: application/json" \
-H "Authorization: Bearer ${YOUR_JWT_TOKEN}"
get_transcript.py
import requests
# Replace the {TRANSCRIBE_ID} and {YOUR_JWT_TOKEN} placeholders before running.
# The scheme is spelled "Bearer" to match the documented Authorization header.
resp = requests.get(
    "https://openapi.vito.ai/v1/transcribe/" + "{TRANSCRIBE_ID}",
    headers={"Authorization": "Bearer " + "{YOUR_JWT_TOKEN}"},
)

# Turn any 4xx/5xx answer into an exception instead of printing an error body.
resp.raise_for_status()
print(resp.json())
get_transcript.java
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Scanner;
/**
 * Sample client for GET /v1/transcribe/{TRANSCRIBE_ID}.
 *
 * Requests the result of a transcription job and prints the raw response body.
 * Replace the {TRANSCRIBE_ID} and {YOUR_JWT_TOKEN} placeholders before running.
 */
public class GetTranscribeSample {
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://openapi.vito.ai/v1/transcribe/" + "{TRANSCRIBE_ID}");
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        httpConn.setRequestMethod("GET");
        httpConn.setRequestProperty("accept", "application/json");
        httpConn.setRequestProperty("Authorization", "Bearer {YOUR_JWT_TOKEN}");

        // Read the success body for 2xx responses, otherwise the error body.
        InputStream responseStream = httpConn.getResponseCode() / 100 == 2
                ? httpConn.getInputStream()
                : httpConn.getErrorStream();

        String response = "";
        // getErrorStream() returns null when the server sent no error body.
        if (responseStream != null) {
            // Decode explicitly as UTF-8 (assumed encoding of the JSON response) so that
            // non-ASCII transcript text does not depend on the platform default charset;
            // try-with-resources closes the stream.
            try (Scanner s = new Scanner(responseStream, "UTF-8").useDelimiter("\\A")) {
                response = s.hasNext() ? s.next() : "";
            }
        }
        System.out.println(response);
    }
}
レスポンス本文(抜粋)
Name | 説明 | Type | 値 |
---|---|---|---|
id | transcribe id | string | |
status | ステータス | string | transcribing , completed , failed |
results.utterances | 発話配列 | array | |
results.utterances.start_at | 開始時刻(ms) | integer | |
results.utterances.duration | 継続時間(ms) | integer | |
results.utterances.msg | テキスト | string | |
results.utterances.spk | 話者/チャネル id | integer | |
results.utterances.lang | 言語 | string | ISO 639-1 |
ヒント
長時間ファイル対応のためポーリング方式です。status が transcribing の場合は、最終状態(completed または failed)になるまで 5 秒程度の間隔で取得を繰り返してください。短すぎる間隔は 429 を招く可能性があります。
Unified example
In the example script below, you select the desired configuration with the PRESET environment variable. The default is sommers_basic. The audio file path is read from AUDIO_PATH (default: sample.wav).
- Python (recommended)
transcribe.py
import json
import os
import time
from typing import Any, Dict, Optional
import requests
class RTZROpenAPIClient:
    """Minimal client for RTZR OpenAPI (auth + STT file).

    - Fetches JWT via /v1/authenticate using client_id/client_secret
    - Submits a file transcription job via /v1/transcribe
    - Polls /v1/transcribe/{id} every few seconds until completed/failed
    """

    # Renew the cached token once it is within this many seconds of expiring.
    _TOKEN_RENEW_MARGIN_SEC = 1800

    def __init__(
        self,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
        base_url: str = "https://openapi.vito.ai",
    ) -> None:
        """Store credentials and create the HTTP session.

        Args:
            client_id: Application client id; falls back to the
                RTZR_CLIENT_ID environment variable.
            client_secret: Application client secret; falls back to the
                RTZR_CLIENT_SECRET environment variable.
            base_url: API root; a trailing slash is stripped.

        Raises:
            ValueError: If either credential is missing.
        """
        self.base_url = base_url.rstrip("/")
        self.client_id = client_id or os.getenv("RTZR_CLIENT_ID")
        self.client_secret = client_secret or os.getenv("RTZR_CLIENT_SECRET")
        if not self.client_id or not self.client_secret:
            raise ValueError(
                "Missing credentials. Set RTZR_CLIENT_ID and RTZR_CLIENT_SECRET "
                "environment variables, or pass client_id/client_secret to RTZROpenAPIClient."
            )
        self._sess = requests.Session()
        # Last /v1/authenticate response; None until the first token request.
        self._token: Optional[Dict[str, Any]] = None

    @property
    def token(self) -> str:
        """Return a valid access token, authenticating when necessary.

        The cached token is reused until it is missing or will expire within
        the next 30 minutes; then a new one is fetched from /v1/authenticate.

        Raises:
            requests.HTTPError: If the authentication request fails.
            RuntimeError: If the response carries no 'access_token'.
        """
        # Compare against "now + margin": the token must be renewed *before*
        # it expires, not 30 minutes after it already has.
        renew_before = time.time() + self._TOKEN_RENEW_MARGIN_SEC
        if self._token is None or self._token.get("expire_at", 0) < renew_before:
            resp = self._sess.post(
                f"{self.base_url}/v1/authenticate",
                data={"client_id": self.client_id, "client_secret": self.client_secret},
            )
            resp.raise_for_status()
            self._token = resp.json()
        access = self._token.get("access_token")
        if not access:
            raise RuntimeError("authenticate: 'access_token' not found in response")
        return access

    def _auth_headers(self) -> Dict[str, str]:
        """Build the Authorization header for an authenticated request."""
        return {"Authorization": f"Bearer {self.token}"}

    def transcribe_file(self, file_path: str, config: Dict[str, Any]) -> Dict[str, Any]:
        """Upload an audio file to /v1/transcribe and return the parsed JSON reply.

        Args:
            file_path: Path of the audio file to upload.
            config: Request options; serialized to JSON as the 'config' field.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url = f"{self.base_url}/v1/transcribe"
        # Keep the request inside the 'with' so the file is still open while
        # it is being streamed, and closed afterwards.
        with open(file_path, "rb") as f:
            files = {"file": (os.path.basename(file_path), f)}
            data = {"config": json.dumps(config)}
            resp = self._sess.post(url, headers=self._auth_headers(), files=files, data=data)
        resp.raise_for_status()
        return resp.json()

    def get_transcription(self, transcribe_id: str) -> Dict[str, Any]:
        """Fetch the current state of a job from /v1/transcribe/{transcribe_id}.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url = f"{self.base_url}/v1/transcribe/{transcribe_id}"
        resp = self._sess.get(url, headers=self._auth_headers())
        resp.raise_for_status()
        return resp.json()

    def wait_for_result(
        self,
        transcribe_id: str,
        poll_interval_sec: int = 5,
        timeout_sec: int = 3600,
    ) -> Dict[str, Any]:
        """Poll a job until its status is 'completed' or 'failed'.

        Args:
            transcribe_id: Id of the job to poll.
            poll_interval_sec: Seconds to sleep between polls.
            timeout_sec: Maximum total seconds to wait.

        Returns:
            The last response, whose status is 'completed' or 'failed'.

        Raises:
            TimeoutError: If no final status is reached within timeout_sec.
        """
        # A monotonic clock keeps the deadline immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout_sec
        while True:
            if time.monotonic() > deadline:
                raise TimeoutError("Timed out waiting for transcription result")
            result = self.get_transcription(transcribe_id)
            status = result.get("status")
            if status in ("completed", "failed"):
                return result
            time.sleep(poll_interval_sec)
# Named request configs, looked up by preset name. Each value is sent
# unchanged as the "config" of a transcription request.
PRESETS: Dict[str, Dict[str, Any]] = {
    # --- Core scenarios ---------------------------------------------------
    # 1) sommers, no speaker separation, general domain
    "sommers_basic": {
        "model_name": "sommers",
        "use_diarization": False,
        "domain": "GENERAL",
    },
    # 2) sommers on call audio with two separated speakers
    "sommers_call_diarization": {
        "model_name": "sommers",
        "domain": "CALL",
        "use_diarization": True,
        "diarization": {"spk_count": 2},
    },
    # 3) whisper with speaker separation, language fixed to "en"
    "whisper_en_diarization": {
        "model_name": "whisper",
        "language": "en",
        "use_diarization": True,
    },
    # --- Single-option presets --------------------------------------------
    "paragraph_split_80": {
        "use_paragraph_splitter": True,
        "paragraph_splitter": {"max": 80},
    },
    "keywords_example": {
        "keywords": ["stt", "returnzero", "api"],
    },
    "with_word_timestamps": {
        "use_word_timestamp": True,
    },
    "disfluency_on": {
        "use_disfluency_filter": True,
    },
    "profanity_on": {
        "use_profanity_filter": True,
    },
    # whisper in "multi" language mode with an explicit candidate list
    "whisper_detect_multi": {
        "model_name": "whisper",
        "language": "multi",
        "language_candidates": ["ko", "en", "ja"],
    },
}
def main() -> None:
    """Transcribe one audio file with a named preset and print the result.

    Reads AUDIO_PATH (default "sample.wav") and PRESET (default
    "sommers_basic") from the environment, submits the file, polls every
    5 seconds until the job is completed or failed, and prints the final
    response as indented JSON.

    Raises:
        ValueError: If PRESET does not name an entry of PRESETS.
        RuntimeError: If the submit response carries no job id.
    """
    audio_path = os.getenv("AUDIO_PATH", "sample.wav")
    preset_name = os.getenv("PRESET", "sommers_basic")
    if preset_name not in PRESETS:
        raise ValueError(f"Unknown PRESET '{preset_name}'. Available: {sorted(PRESETS.keys())}")
    config = PRESETS[preset_name]

    client = RTZROpenAPIClient()
    submit = client.transcribe_file(audio_path, config)
    transcribe_id = submit.get("id")
    # Fail fast: without an id the poll loop would query /v1/transcribe/None.
    if not transcribe_id:
        raise RuntimeError(f"transcribe: 'id' not found in response: {submit}")

    result = client.wait_for_result(transcribe_id, poll_interval_sec=5)
    # ensure_ascii=False keeps non-ASCII transcript text readable in the output.
    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
- sommers_basic: model_name=sommers, use_diarization=false, domain=GENERAL
- sommers_call_diarization: model_name=sommers, domain=CALL, use_diarization=true, diarization.spk_count=2
- whisper_en_diarization: model_name=whisper, language=en, use_diarization=true
- paragraph_split_80: use_paragraph_splitter=true, paragraph_splitter.max=80
- keywords_example: keywords=["stt","returnzero","api"]
- with_word_timestamps: use_word_timestamp=true
- disfluency_on: use_disfluency_filter=true
- profanity_on: use_profanity_filter=true
- whisper_detect_multi: model_name=whisper, language=multi, language_candidates=["ko","en","ja"]