Streaming Responses
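Set `stream: true` in the request body to receive tokens as they are generated instead of waiting for the full response. The reply arrives as server-sent events: each `data:` line carries one JSON chunk whose `choices[0].delta.content` holds the next piece of text, and the stream ends with `data: [DONE]`. Streaming significantly reduces perceived latency for long outputs.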
Python SDK
import studiolm
# Build the client with your API key.
client = studiolm.Client(api_key="sk-...")

# With stream=True the call is iterated chunk by chunk instead of
# returning one complete response.
stream = client.chat.completions.create(
    model="gemma-3-12b-it-qat",
    messages=[{"role": "user", "content": "Write a short story about a robot."}],
    stream=True,
)
for chunk in stream:
    delta = chunk["choices"][0].get("delta", {})
    # end="" keeps the pieces on one line; flush=True shows each piece immediately.
    print(delta.get("content", ""), end="", flush=True)
print()  # newline when done
Python (requests)
import requests
import json
def _chunk_text(payload):
    """Return the text carried by one JSON stream payload, or "" if it has none."""
    try:
        chunk = json.loads(payload)
        # "delta" and "content" may be absent or JSON null; treat both as empty
        # so the literal string "None" is never printed.
        delta = chunk["choices"][0].get("delta") or {}
        return delta.get("content") or ""
    except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError):
        # Not a well-formed completion chunk (invalid JSON, empty "choices",
        # non-object payload): skip it rather than abort the stream.
        return ""


def stream_chat(api_key, messages):
    """Stream a chat completion and print the generated text as it arrives.

    Posts a streaming request to the chat completions endpoint, reads the
    response line by line, and writes each chunk's ``delta.content`` to
    stdout without a trailing newline. A single newline is printed once the
    stream ends.

    Args:
        api_key: API key, sent as a ``Bearer`` token.
        messages: Chat messages to send, e.g.
            ``[{"role": "user", "content": "..."}]``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The context manager closes the response even when we stop reading early
    # at "[DONE]" or an exception escapes; with stream=True the connection is
    # otherwise not released until the body is fully consumed.
    with requests.post(
        "https://api.studiolm.dev/v1/chat/completions",
        headers={"Authorization": f"Bearer {api_key}"},
        json={"model": "gemma-3-12b-it-qat", "messages": messages, "stream": True},
        stream=True,
    ) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            text = line.decode("utf-8")
            if text.startswith("data:"):
                # The space after "data:" is optional in the event-stream format.
                text = text[5:]
                if text.startswith(" "):
                    text = text[1:]
            if text == "[DONE]":
                break
            content = _chunk_text(text)
            if content:
                print(content, end="", flush=True)
    print()
JavaScript (Fetch)
/**
 * Stream a chat completion and write the generated text to stdout as it arrives.
 *
 * Posts a streaming request to the chat completions endpoint, reads the
 * response body incrementally, and writes each chunk's `delta.content`
 * to `process.stdout`.
 *
 * @param {string} apiKey - API key, sent as a Bearer token.
 * @param {Array<{role: string, content: string}>} messages - Chat messages to send.
 * @returns {Promise<void>} Resolves once the response stream has ended.
 * @throws {Error} If the server answers with a non-2xx status.
 */
async function streamChat(apiKey, messages) {
  const response = await fetch("https://api.studiolm.dev/v1/chat/completions", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ model: "gemma-3-12b-it-qat", messages, stream: true }),
  });
  // An error response is not an event stream; fail loudly instead of
  // silently printing nothing.
  if (!response.ok) {
    throw new Error(`Request failed: ${response.status} ${response.statusText}`);
  }

  // Write the text carried by one complete stream line, if any.
  const writeLine = (rawLine) => {
    const line = rawLine.trim(); // also drops the "\r" of CRLF line endings
    if (!line.startsWith("data:")) return;
    const payload = line.slice(5).trim();
    if (payload === "" || payload === "[DONE]") return;
    let chunk;
    try {
      chunk = JSON.parse(payload);
    } catch {
      return; // skip a malformed chunk rather than abort the whole stream
    }
    process.stdout.write(chunk?.choices?.[0]?.delta?.content ?? "");
  };

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  // Network reads do not align with line boundaries, so keep the trailing
  // partial line here until the rest of it arrives.
  let buffer = "";
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    // stream: true keeps a multi-byte character split across reads intact.
    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split("\n");
    buffer = lines.pop();
    for (const line of lines) writeLine(line);
  }
  // Flush any bytes still held by the decoder and handle a final line that
  // was not newline-terminated.
  buffer += decoder.decode();
  if (buffer) writeLine(buffer);
}