Listen API — Real-time STT
Raw real-time speech-to-text without AI cleaning. Create a session, get a WebSocket URL, and connect directly. You get interim and final transcripts — perfect for live captions, voice commands, or building your own processing pipeline.
Python
import json

# Any WebSocket client works; this example uses the `websockets` sync client.
import websockets.sync.client
from sayd_ai import Sayd

client = Sayd(api_key="sk-your-key")

# Create a real-time STT session (no AI cleaning)
session = client.listen.create(
    language="multi",  # "en", "zh", or "multi"
    sample_rate=16000,  # 8000 or 16000 Hz
    codec="pcm16",  # "pcm16", "opus", or "opus_fs320"
)

# Connect to the WebSocket URL directly
print(f"Session: {session.session_id}")
print(f"Connect: {session.websocket_url}")

# Use any WebSocket client to stream audio
with websockets.sync.client.connect(session.websocket_url) as ws:
    # The first message from the server is the handshake.
    ready = json.loads(ws.recv())  # {"type": "ready"}

    # Send audio chunks (PCM16, 100ms each)
    # NOTE: `audio_bytes` is a placeholder; supply your own encoded audio
    # chunk matching the codec and sample rate requested above.
    ws.send(audio_bytes)

    # Receive live transcripts
    for msg in ws:
        data = json.loads(msg)
        if data["type"] == "partial":
            # Interim result: overwrite the current console line in place.
            print(f"\r {data['text']}", end="", flush=True)
        elif data["type"] == "sentence":
            print(f"\n[final] {data['text']}")

    # Tell the server that no more audio will be sent.
    # NOTE(review): the loop above presumably runs until the server closes
    # the connection, in which case this send can only succeed if issued
    # earlier (e.g. right after the last audio chunk, or from a separate
    # sender thread) - confirm against the Listen protocol.
    ws.send(json.dumps({"type": "end"}))

# List & retrieve sessions
sessions = client.listen.list(limit=10)
detail = client.listen.get(session.session_id)

API Endpoints
POST /v1/listen — Create a Listen session (returns WebSocket URL)
GET /v1/listen — List Listen sessions
GET /v1/listen/{id} — Get Listen session details & transcripts