Streaming
Real-time streaming responses using Server-Sent Events (SSE)
Receive AI responses in real-time as they're generated using Server-Sent Events (SSE). Streaming improves perceived latency for long responses.
Enable Streaming
Set stream: true in your request:
{
  "model": "gpt-oss-120b",
  "messages": [{"role": "user", "content": "Write a story"}],
  "stream": true
}
Quick Example
Python
from openai import OpenAI

client = OpenAI(
    api_key="sk-mel-your-api-key-here",
    base_url="https://api.melious.ai/v1"
)

stream = client.chat.completions.create(
    model="gpt-oss-120b",
    messages=[{"role": "user", "content": "Write a haiku about coding"}],
    stream=True
)

for chunk in stream:
    content = chunk.choices[0].delta.content
    if content:
        print(content, end="", flush=True)

JavaScript
import OpenAI from 'openai';
const client = new OpenAI({
  apiKey: 'sk-mel-your-api-key-here',
  baseURL: 'https://api.melious.ai/v1'
});

const stream = await client.chat.completions.create({
  model: 'gpt-oss-120b',
  messages: [{ role: 'user', content: 'Write a haiku about coding' }],
  stream: true
});

for await (const chunk of stream) {
  const content = chunk.choices[0]?.delta?.content;
  if (content) {
    process.stdout.write(content);
  }
}

cURL
curl https://api.melious.ai/v1/chat/completions \
-H "Authorization: Bearer sk-mel-your-api-key-here" \
-H "Content-Type: application/json" \
-N \
-d '{
"model": "gpt-oss-120b",
"messages": [{"role": "user", "content": "Write a haiku about coding"}],
"stream": true
}'SSE Format
Streaming responses use the Server-Sent Events format:
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1699999999,"model":"gpt-oss-120b","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null}]}
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1699999999,"model":"gpt-oss-120b","choices":[{"index":0,"delta":{"content":" world"},"finish_reason":null}]}
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1699999999,"model":"gpt-oss-120b","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}
data: [DONE]
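If you are not using an SDK, you can consume this format directly. Below is a minimal sketch using the Python requests library (the endpoint and key are the same placeholders used throughout this page):

import json
import requests

# Open a streaming HTTP connection and parse the SSE lines by hand.
response = requests.post(
    "https://api.melious.ai/v1/chat/completions",
    headers={
        "Authorization": "Bearer sk-mel-your-api-key-here",
        "Content-Type": "application/json",
    },
    json={
        "model": "gpt-oss-120b",
        "messages": [{"role": "user", "content": "Write a haiku about coding"}],
        "stream": True,
    },
    stream=True,  # tell requests not to read the whole body up front
)

for line in response.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue  # skip blank keep-alive lines
    data = line[len("data: "):]
    if data == "[DONE]":
        break  # end of stream
    chunk = json.loads(data)
    content = chunk["choices"][0]["delta"].get("content")
    if content:
        print(content, end="", flush=True)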
Chunk Structure
{
  "id": "chatcmpl-123",
  "object": "chat.completion.chunk",
  "created": 1699999999,
  "model": "gpt-oss-120b",
  "choices": [
    {
      "index": 0,
      "delta": {
        "content": "Hello"
      },
      "finish_reason": null
    }
  ]
}
Delta Object
| Field | Description |
|---|---|
| role | Only in the first chunk: "assistant" |
| content | Text content fragment |
| tool_calls | Tool call fragments (when using function calling) |
Final Chunk
The last chunk has:
- An empty delta object
- finish_reason set to "stop", "length", or "tool_calls"
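To reconstruct the full message, append each content fragment and record the finish reason. A short sketch, reusing a stream created as in the Quick Example above:

full_content = ""
finish_reason = None

for chunk in stream:
    choice = chunk.choices[0]
    if choice.delta.content:
        full_content += choice.delta.content  # append each text fragment
    if choice.finish_reason:
        finish_reason = choice.finish_reason  # "stop", "length", or "tool_calls"

print(full_content)
print(f"finish_reason: {finish_reason}")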
Stream End
The stream ends with:
data: [DONE]
Async Python
import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI(
    api_key="sk-mel-your-api-key-here",
    base_url="https://api.melious.ai/v1"
)

async def stream_response():
    stream = await client.chat.completions.create(
        model="gpt-oss-120b",
        messages=[{"role": "user", "content": "Tell me a story"}],
        stream=True
    )
    async for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            print(content, end="", flush=True)

asyncio.run(stream_response())
Streaming with Tools
When using function calling with streaming:
stream = client.chat.completions.create(
    model="gpt-oss-120b",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {"type": "object", "properties": {"location": {"type": "string"}}}
        }
    }],
    stream=True
)

tool_calls = []
for chunk in stream:
    delta = chunk.choices[0].delta

    # Accumulate tool call fragments
    if delta.tool_calls:
        for tc in delta.tool_calls:
            if tc.index >= len(tool_calls):
                tool_calls.append({"id": tc.id, "function": {"name": "", "arguments": ""}})
            if tc.function.name:
                tool_calls[tc.index]["function"]["name"] = tc.function.name
            if tc.function.arguments:
                tool_calls[tc.index]["function"]["arguments"] += tc.function.arguments

    # Print any text content
    if delta.content:
        print(delta.content, end="", flush=True)

# Process accumulated tool calls
if tool_calls:
    print(f"\nTool calls: {tool_calls}")
Web Integration
Fetch API (Browser)
async function streamChat(message) {
  const response = await fetch('https://api.melious.ai/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': 'Bearer sk-mel-your-api-key-here',
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-oss-120b',
      messages: [{ role: 'user', content: message }],
      stream: true
    })
  });

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split('\n');
    buffer = lines.pop() || '';

    for (const line of lines) {
      if (line.startsWith('data: ')) {
        const data = line.slice(6);
        if (data === '[DONE]') return;
        const chunk = JSON.parse(data);
        const content = chunk.choices[0]?.delta?.content;
        if (content) {
          document.getElementById('output').textContent += content;
        }
      }
    }
  }
}
React Hook
import { useState, useCallback } from 'react';

function useChatStream() {
  const [response, setResponse] = useState('');
  const [isStreaming, setIsStreaming] = useState(false);

  const streamMessage = useCallback(async (message) => {
    setResponse('');
    setIsStreaming(true);

    const res = await fetch('https://api.melious.ai/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${process.env.MELIOUS_API_KEY}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        model: 'gpt-oss-120b',
        messages: [{ role: 'user', content: message }],
        stream: true
      })
    });

    const reader = res.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      // Buffer partial lines: SSE lines may be split across chunks
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      buffer = lines.pop() || '';

      for (const line of lines) {
        if (!line.startsWith('data: ')) continue;
        const data = line.slice(6);
        if (data === '[DONE]') continue;
        const chunk = JSON.parse(data);
        const content = chunk.choices[0]?.delta?.content;
        if (content) {
          setResponse(prev => prev + content);
        }
      }
    }

    setIsStreaming(false);
  }, []);

  return { response, isStreaming, streamMessage };
}
Response Headers
Streaming responses include:
Content-Type: text/event-stream
Cache-Control: no-cache
Connection: keep-alive
X-Accel-Buffering: no
The X-Accel-Buffering: no header prevents nginx from buffering the stream.
Error Handling
Errors during streaming are sent as SSE events:
data: {"error":{"message":"Rate limit exceeded","type":"rate_limit_error","code":"RATE_LIMIT_EXCEEDED"}}
data: [DONE]
Handle errors in your stream processing:
for chunk in stream:
    if hasattr(chunk, 'error'):
        print(f"Error: {chunk.error.message}")
        break
    # Process normal chunks
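In-stream errors arrive as SSE events as shown above, but connection drops and HTTP-level failures are raised as exceptions by the official Python SDK. A hedged sketch catching both:

import openai

try:
    for chunk in stream:
        if hasattr(chunk, 'error'):
            print(f"Error: {chunk.error.message}")
            break
        content = chunk.choices[0].delta.content
        if content:
            print(content, end="", flush=True)
except openai.RateLimitError as e:
    print(f"Rate limited: {e}")
except openai.APIConnectionError as e:
    print(f"Connection dropped mid-stream: {e}")
except openai.APIError as e:
    print(f"API error: {e}")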
Best Practices
- Always handle [DONE] - Break your loop when you receive it
- Buffer partial lines - SSE lines may be split across chunks
- Handle reconnection - Implement retry logic for network errors (see the sketch below)
- Show a typing indicator - Display it while waiting for the first chunk
- Cancel on user action - Allow users to stop generation
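A minimal reconnection sketch with exponential backoff, assuming you simply restart the whole request on connection errors (in production you may want to resend only the unanswered portion of the conversation):

import time
import openai

def stream_with_retry(messages, max_retries=3):
    # Retry the full request with exponential backoff on connection errors.
    for attempt in range(max_retries):
        try:
            stream = client.chat.completions.create(
                model="gpt-oss-120b",
                messages=messages,
                stream=True
            )
            for chunk in stream:
                content = chunk.choices[0].delta.content
                if content:
                    print(content, end="", flush=True)
            return  # completed successfully
        except openai.APIConnectionError:
            wait = 2 ** attempt  # 1s, 2s, 4s, ...
            print(f"\nConnection lost, retrying in {wait}s...")
            time.sleep(wait)
    raise RuntimeError("Stream failed after retries")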
See Also
- Chat Completions - Full chat API reference
- Models - Available models