Transcribe streaming audio
This guide walks you through running a sample application that converts speech to text in real time. Voice input from the microphone is instantly converted to text and displayed. The example on this page uses Python to perform speech recognition based on gRPC bidirectional streaming, but you can create and use client libraries in any language that supports gRPC.
1. Prerequisites
- Prior knowledge of Python and gRPC (Refer to grpc.io)
- Device with microphone connected
- API Token (You can issue it from API Console)
WARNING
- Realtime speech recognition currently only supports Korean and English.
- Audio containing loud singing or background music is not supported.
2. Save a Protocol Buffer file
gRPC communicates according to a protocol buffer definition. The protocol buffer provided by Daglo Realtime STT is shown below. Save the following content as a file named speech.proto.
syntax = "proto3";
package dagloapis.speech.v1;
// A service that implements a speech recognition API.
service Speech {
  // Speech recognition is performed in a bidirectional streaming manner,
  // transmitting audio and receiving results simultaneously.
  rpc StreamingRecognize(stream StreamingRecognizeRequest)
      returns (stream StreamingRecognizeResponse) {}
}
// The top-level message sent by the client to the `StreamingRecognize` method.
// A `StreamingRecognizeRequest` message comes in two kinds:
// the first message must contain the `config` field and not `audio_content`;
// all messages sent after it must contain `audio_content` and not `config`.
message StreamingRecognizeRequest {
  // A streaming request is either a streaming configuration or an audio stream.
  oneof streaming_request {
    // Provides configuration information regarding audio and STT processing.
    RecognitionConfig config = 1;

    // Audio stream data. Consecutive audio data fragments are sent sequentially
    // using `StreamingRecognizeRequest` messages.
    // Audio sources must be captured and transmitted using `LINEAR16` encoding.
    // The sampling rate should be `16000Hz`. Resample the audio if necessary.
    // Only mono (1 channel) audio is supported.
    bytes audio_content = 2;
  }
}
// Provides the server with configuration for how to handle requests.
message RecognitionConfig {
  // The language in which the audio will be provided.
  // Language codes follow the [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) standard.
  // This field can be omitted; if omitted, the default value "ko-KR" is applied.
  // Supported language codes:
  // - ko-KR
  // - en-US
  string language_code = 1;

  // If `true`, temporary partial results are returned immediately during the utterance
  // (these partial results are indicated by the `is_final=false` flag).
  // If `false` or omitted, only results with `is_final=true` are returned.
  bool interim_results = 2;
}
// `StreamingRecognizeResponse` is the only message returned to the client by `StreamingRecognize`.
// Zero or more `StreamingRecognizeResponse` messages are streamed to the client.
// In each response, only one field, either `error` or `result`, is set.
// NOTE(review): no `error` field is defined in this message — confirm against the service docs.
message StreamingRecognizeResponse {
  // The result corresponding to the audio portion currently being processed.
  StreamingRecognitionResult result = 1;

  // Total audio duration processed for the stream so far (in seconds).
  float total_duration = 2;
}
// Streaming speech recognition result corresponding to the audio portion currently being processed.
message StreamingRecognitionResult {
  // Transcript text representing the words spoken by the user.
  // In languages where words are separated by spaces, this transcript may contain
  // leading spaces if it is not the first result.
  // Concatenating each result yields the entire transcript without any delimiters.
  string transcript = 1;

  // If `false`, this `StreamingRecognitionResult` represents a partial result that may change.
  // If `true`, it is the completed result for the corresponding audio segment,
  // from the previous starting point through one completed utterance.
  bool is_final = 2;
}
3. Write the client code
3-1. Install gRPC & Protobuf libraries
Install the libraries required to create and communicate with gRPC clients.
pip install grpcio grpcio-tools
3-2. Installing libraries for processing audio input streams
This example uses pyaudio. To use pyaudio, you must have a library called PortAudio installed. Install PortAudio for your platform (it may be bundled with the pyaudio package), then install the pyaudio package.
pip install pyaudio
3-3. Compile the Protobuf file
You need to compile the provided .proto file into Python code. Run the following command to generate the necessary files.
python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. ./speech.proto
Running the above command will generate the files speech_pb2.py and speech_pb2_grpc.py. Place these files in the same directory as your client code.
3-4. Write the client code
Now you can write code to stream audio data to the StreamingRecognize method using the gRPC client. The following example demonstrates the entire process of constructing an audio streaming request and receiving the results from the server. Save the following example code as client.py.
import argparse
import grpc
import speech_pb2
import speech_pb2_grpc
import pyaudio
import sys
# Audio stream config — values must match the service contract in speech.proto:
# LINEAR16 (16-bit PCM), 16000 Hz, mono.
SAMPLE_RATE = 16000  # Hz; the proto requires a 16000Hz sampling rate
CHUNK = int(SAMPLE_RATE * 0.25) # 0.25 seconds of audio per read (frames per buffer)
CHANNELS = 1  # mono (1 channel), as required by the proto
FORMAT = pyaudio.paInt16  # 16-bit signed integer samples (LINEAR16 encoding)
def generate_requests(audio_stream, language_code='ko-KR', interim_results=True):
    """Yield `StreamingRecognizeRequest` messages for one recognition session.

    The first yielded request carries only the recognition config; every
    subsequent request carries one chunk of raw audio read from
    ``audio_stream``. A final empty request signals end of stream (EOS).

    Args:
        audio_stream: An open PyAudio input stream to read audio chunks from.
        language_code: BCP-47 language code for recognition (default "ko-KR").
        interim_results: Whether to request partial (is_final=False) results.
    """
    # First request: config only (the proto forbids mixing it with audio).
    yield speech_pb2.StreamingRecognizeRequest(
        config=speech_pb2.RecognitionConfig(
            language_code=language_code,
            interim_results=interim_results,
        )
    )

    # Subsequent requests: one audio chunk each, until the stream dries up
    # or reading fails.
    while True:
        try:
            chunk = audio_stream.read(CHUNK)
        except IOError as e:
            print(f"Error reading from audio stream: {e}", file=sys.stderr)
            break
        if not chunk:
            break
        yield speech_pb2.StreamingRecognizeRequest(audio_content=chunk)

    # Last request: an empty message marks the end of the session (EOS).
    yield speech_pb2.StreamingRecognizeRequest()
def run_streaming_recognition_from_mic(server_address, api_token):
    """Perform realtime streaming speech recognition using microphone input.

    Opens the default microphone, streams audio to the STT server over a
    secure gRPC channel, and prints partial/final transcripts as they arrive.

    Args:
        server_address: Host (and optional port) of the gRPC STT server.
        api_token: API token sent as a Bearer credential in request metadata.
    """
    # gRPC channel/stub config (TLS).
    creds = grpc.ssl_channel_credentials()
    channel = grpc.secure_channel(server_address, creds)
    stub = speech_pb2_grpc.SpeechStub(channel)

    # Metadata (header) for API authorization.
    metadata = (
        ("authorization", f"Bearer {api_token}"),
    )

    # PyAudio instance initialization.
    audio = pyaudio.PyAudio()

    # Open audio stream and run the bidirectional RPC.
    try:
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=SAMPLE_RATE,
                            input=True,
                            frames_per_buffer=CHUNK)
        print("마이크 녹음 및 스트리밍 시작. 'Ctrl+C'를 눌러 종료하세요.")

        # Bidirectional streaming of requests and responses.
        response_iterator = stub.StreamingRecognize(
            generate_requests(stream),
            metadata=metadata
        )

        # Loop to process responses.
        for response in response_iterator:
            # BUG FIX: protobuf sub-messages are always truthy in Python
            # (Message defines no __bool__), so `if response.result:` never
            # filtered anything. HasField() actually detects whether the
            # server set the `result` field.
            if response.HasField("result"):
                if response.result.is_final:
                    print(f"최종 결과: {response.result.transcript}")
                else:
                    print(f"부분 결과: {response.result.transcript}", end='\r')
    except grpc.RpcError as e:
        print(f"\ngRPC 오류 발생: {e.code()}, {e.details()}", file=sys.stderr)
    except KeyboardInterrupt:
        print("\n녹음을 중단하고 프로그램을 종료합니다.")
    finally:
        # Cleaning up the stream (if it was opened), PyAudio, and the channel.
        if 'stream' in locals():
            if stream.is_active():
                stream.stop_stream()
            # BUG FIX: close unconditionally — the original closed the stream
            # only when it was still active, leaking a stopped-but-open stream.
            stream.close()
        audio.terminate()
        channel.close()
if __name__ == '__main__':
    # Command-line entry point: require the server address and API token,
    # then start streaming recognition from the microphone.
    cli = argparse.ArgumentParser(description="gRPC Speech Recognition Client")
    cli.add_argument(
        "--server",
        type=str,
        required=True,
        help="gRPC server address (e.g., apis.daglo.ai)",
    )
    cli.add_argument("--token", type=str, required=True, help="API Token")
    options = cli.parse_args()
    run_streaming_recognition_from_mic(options.server, options.token)
3-5. Run the client code
python client.py --server apis.daglo.ai --token $API_TOKEN
4. Troubleshooting
- If the API Token is incorrect, check the validity of the token in the API Console.
- If voice recognition quality is poor, try again in a quiet environment or check your microphone settings.