{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "minicpm-o-realtime-protocol-v2",
  "title": "MiniCPM-o Realtime API Protocol Schema",
  "description": "WebSocket JSON message schema for MiniCPM-o Realtime API. Endpoint: wss://host/v1/realtime?mode={video|audio}. session_id is generated by the server (format: rt_{timestamp_ms}) and returned in session.created. Audio: 16kHz float32 PCM (input), 24kHz float32 PCM (output). Video: JPEG base64.",

  "definitions": {

    "base64_string": {
      "type": "string",
      "pattern": "^[A-Za-z0-9+/]+=*$",
      "description": "Base64-encoded binary data"
    },

    "error_object": {
      "type": "object",
      "required": ["code", "message", "type"],
      "additionalProperties": false,
      "properties": {
        "code": {
          "type": "string",
          "enum": [
            "not_ready",
            "unknown_event",
            "missing_field",
            "invalid_payload",
            "service_unavailable",
            "queue_full",
            "worker_busy",
            "worker_connect_failed",
            "inference_error"
          ],
          "description": "Machine-readable error code"
        },
        "message": {
          "type": "string",
          "description": "Human-readable description (for debugging, do not parse)"
        },
        "type": {
          "type": "string",
          "enum": ["client_error", "server_error"]
        }
      }
    },

    "client_event": {
      "oneOf": [
        { "$ref": "#/definitions/client/session.update" },
        { "$ref": "#/definitions/client/input_audio_buffer.append" },
        { "$ref": "#/definitions/client/session.close" }
      ]
    },

    "server_event": {
      "oneOf": [
        { "$ref": "#/definitions/server/session.created" },
        { "$ref": "#/definitions/server/response.output_audio.delta" },
        { "$ref": "#/definitions/server/response.listen" },
        { "$ref": "#/definitions/server/session.closed" },
        { "$ref": "#/definitions/server/session.queued" },
        { "$ref": "#/definitions/server/session.queue_update" },
        { "$ref": "#/definitions/server/session.queue_done" },
        { "$ref": "#/definitions/server/error" }
      ]
    },

    "client": {

      "session.update": {
        "type": "object",
        "required": ["type", "session"],
        "additionalProperties": false,
        "properties": {
          "type": { "const": "session.update" },
          "session": {
            "type": "object",
            "required": ["instructions"],
            "additionalProperties": false,
            "properties": {
              "instructions":   { "type": "string", "minLength": 1, "description": "System prompt" },
              "max_slice_nums": { "type": "integer", "minimum": 1, "maximum": 9, "default": 1, "description": "Max vision slices per frame (video mode only, 1=fast 64tok, 4=detailed 192tok)" },
              "ref_audio":      { "$ref": "#/definitions/base64_string", "description": "LLM reference audio (base64 WAV, 16kHz) for semantic voice cloning" },
              "tts_ref_audio":  { "$ref": "#/definitions/base64_string", "description": "TTS reference audio (base64 WAV, 16kHz) for acoustic voice cloning. Falls back to ref_audio if omitted" }
            }
          }
        }
      },

      "input_audio_buffer.append": {
        "type": "object",
        "required": ["type", "audio"],
        "additionalProperties": false,
        "properties": {
          "type":           { "const": "input_audio_buffer.append" },
          "audio":          { "$ref": "#/definitions/base64_string", "description": "16kHz mono float32 PCM, 1s = 16000 samples = 64000 bytes. Minimum 4000 samples (250ms). Base64-encoded." },
          "video_frames":   {
            "type": "array",
            "items": { "$ref": "#/definitions/base64_string" },
            "maxItems": 4,
            "description": "JPEG frames, base64-encoded. Recommended in video mode; behavior is undefined if sent in audio mode."
          },
          "force_listen":   { "type": "boolean", "default": false, "description": "Force model into listen state (interrupt speaking)" },
          "max_slice_nums": { "type": "integer", "minimum": 1, "maximum": 9, "description": "Override vision slices for this chunk (video mode only)" }
        }
      },

      "session.close": {
        "type": "object",
        "required": ["type"],
        "additionalProperties": false,
        "properties": {
          "type":   { "const": "session.close" },
          "reason": { "type": "string", "description": "Close reason (e.g. 'user_stop')" }
        }
      }
    },

    "server": {

      "session.created": {
        "type": "object",
        "required": ["type", "session_id"],
        "additionalProperties": false,
        "properties": {
          "type":          { "const": "session.created" },
          "session_id":    { "type": "string", "description": "Server-assigned session identifier" },
          "prompt_length": { "type": "integer", "minimum": 0, "description": "Number of tokens in the prepared prompt" }
        }
      },

      "response.output_audio.delta": {
        "type": "object",
        "required": ["type", "text", "audio", "end_of_turn", "kv_cache_length"],
        "additionalProperties": false,
        "properties": {
          "type":            { "const": "response.output_audio.delta" },
          "text":            { "type": "string", "description": "Generated text fragment. Note: text leads audio by ~hundreds of ms due to model architecture" },
          "audio":           { "$ref": "#/definitions/base64_string", "description": "24kHz mono float32 PCM, base64-encoded. Middle chunks = 1s (24000 samples), first/last chunk may be shorter" },
          "end_of_turn":     { "type": "boolean", "description": "Turn EOS: true when model finishes this utterance and is about to switch back to listen" },
          "kv_cache_length": { "type": "integer", "minimum": 0, "description": "Current KV cache token count (max 8192)" }
        }
      },

      "response.listen": {
        "type": "object",
        "required": ["type"],
        "additionalProperties": false,
        "properties": {
          "type":            { "const": "response.listen" },
          "kv_cache_length": { "type": "integer", "minimum": 0 }
        }
      },


      "session.closed": {
        "type": "object",
        "required": ["type", "reason"],
        "additionalProperties": false,
        "properties": {
          "type":   { "const": "session.closed" },
          "reason": {
            "type": "string",
            "enum": ["stopped", "timeout", "context_full", "server_shutdown", "error"],
            "description": "Why the session was closed. timeout=total duration exceeded, context_full=8192 token limit reached"
          }
        }
      },

      "session.queued": {
        "type": "object",
        "required": ["type", "position"],
        "additionalProperties": false,
        "properties": {
          "type":             { "const": "session.queued" },
          "position":         { "type": "integer", "minimum": 1, "description": "Position in queue (1 = next)" },
          "estimated_wait_s": { "type": "number", "minimum": 0, "description": "Estimated wait time in seconds" },
          "ticket_id":        { "type": "string" },
          "queue_length":     { "type": "integer", "minimum": 0 }
        }
      },

      "session.queue_update": {
        "type": "object",
        "required": ["type", "position"],
        "additionalProperties": false,
        "properties": {
          "type":             { "const": "session.queue_update" },
          "position":         { "type": "integer", "minimum": 1 },
          "estimated_wait_s": { "type": "number", "minimum": 0 },
          "queue_length":     { "type": "integer", "minimum": 0 }
        }
      },

      "session.queue_done": {
        "type": "object",
        "required": ["type"],
        "additionalProperties": false,
        "properties": {
          "type": { "const": "session.queue_done" }
        }
      },

      "error": {
        "type": "object",
        "required": ["type", "error"],
        "additionalProperties": false,
        "properties": {
          "type":  { "const": "error" },
          "error": { "$ref": "#/definitions/error_object" }
        }
      }
    }
  },

  "type": "object",
  "oneOf": [
    { "$ref": "#/definitions/client_event" },
    { "$ref": "#/definitions/server_event" }
  ]
}
