Sayd

Listen API — Real-time STT

Raw real-time speech-to-text without AI cleaning. Create a session, get a WebSocket URL, and connect directly. You get interim and final transcripts — perfect for live captions, voice commands, or building your own processing pipeline.

Python
# Quickstart: create a Listen (raw real-time STT) session, stream audio over
# its WebSocket URL, print transcripts as they arrive, then list/retrieve
# sessions through the REST client.
import json

import websockets.sync.client
from sayd_ai import Sayd

client = Sayd(api_key="sk-your-key")

# Create a real-time STT session (no AI cleaning)
session = client.listen.create(
    language="multi",      # "en", "zh", or "multi"
    sample_rate=16000,     # 8000 or 16000 Hz
    codec="pcm16",         # "pcm16", "opus", or "opus_fs320"
)

# Connect to the WebSocket URL directly
print(f"Session: {session.session_id}")
print(f"Connect: {session.websocket_url}")

# Use any WebSocket client to stream audio
with websockets.sync.client.connect(session.websocket_url) as ws:
    # The first server message is the handshake: {"type": "ready"}
    ready = json.loads(ws.recv())

    # Send audio chunks (PCM16, 100ms each).
    # `audio_bytes` is a placeholder: supply your own encoded audio chunk(s),
    # calling ws.send() once per chunk.
    ws.send(audio_bytes)

    # Signal end-of-audio BEFORE draining transcripts. Iterating `ws` only
    # stops once the connection is closed, so a send placed after the loop
    # would never run on an open socket.
    # NOTE(review): presumably the server flushes remaining transcripts and
    # then closes the connection after "end" — confirm against the Sayd docs.
    ws.send(json.dumps({"type": "end"}))

    # Receive live transcripts until the connection is closed
    for raw in ws:
        data = json.loads(raw)
        if data["type"] == "partial":
            # Interim result: overwrite the current console line in place
            print(f"\r  {data['text']}", end="", flush=True)
        elif data["type"] == "sentence":
            # Final result for a sentence: emit on its own line
            print(f"\n[final] {data['text']}")

# List & retrieve sessions
sessions = client.listen.list(limit=10)
detail = client.listen.get(session.session_id)

API Endpoints

POST /v1/listen — Create a Listen session (returns WebSocket URL)
GET /v1/listen — List Listen sessions
GET /v1/listen/{id} — Get Listen session details & transcripts