Foundation API Contracts for Agent-Aware Backport
Executive Summary
This document defines the comprehensive API contracts required for integrating agent-aware capabilities into the existing Foundation infrastructure. The contracts are designed to maintain 100% backward compatibility while adding powerful new agent-centric functionality through additive APIs.
Design Principles
- Backward Compatibility: All existing APIs remain unchanged and functional
- Additive Enhancement: New functionality is added through new functions, not by modifying existing ones
- Consistent Patterns: Agent-aware APIs follow established Foundation patterns
- Type Safety: Comprehensive type specifications for all interfaces
- Error Handling: Unified error handling across all agent-aware APIs
- Documentation: Complete documentation with examples for all contracts
Core Type Definitions
Agent Types
# Identifier under which an agent is registered and addressed.
@type agent_id :: atom() | String.t()

# A named ability an agent advertises (e.g. :nlp, :inference).
@type capability :: atom()

@type health_status :: :healthy | :degraded | :unhealthy | :unknown

@type agent_state ::
        :initializing | :ready | :active | :idle | :degraded | :maintenance | :stopping | :stopped

# Snapshot of an agent's resource consumption.
#
# `memory` and `cpu` are required; the remaining keys are optional so that the
# short form used throughout this document's examples
# (`%{memory: 0.7, cpu: 0.4}`) satisfies the type.
@type resource_usage :: %{
        # fraction of capacity in use, 0.0 to 1.0
        required(:memory) => float(),
        # fraction of capacity in use, 0.0 to 1.0
        required(:cpu) => float(),
        # connections count
        optional(:network) => non_neg_integer(),
        # fraction of capacity in use, 0.0 to 1.0
        optional(:storage) => float(),
        # custom resource types
        optional(:custom) => %{atom() => any()}
      }

# Full metadata record stored for a registered agent.
@type agent_metadata :: %{
        type: :agent | :service | :coordination_service,
        capabilities: [capability()],
        health: health_status(),
        state: agent_state(),
        resource_usage: resource_usage(),
        coordination_state: %{
          active_consensus: [atom()],
          active_barriers: [atom()],
          held_locks: [String.t()],
          leadership_roles: [atom()]
        },
        last_health_check: DateTime.t(),
        configuration: map(),
        metadata: map()
      }

# Condensed view of an agent, attached to agent errors as `agent_context`.
@type agent_context :: %{
        agent_id: agent_id(),
        current_capability: capability() | nil,
        health_status: health_status(),
        resource_state: resource_usage(),
        coordination_active: boolean()
      }
Error Types
# Error struct shape for agent-related failures; codes are confined to 3000..3999.
#
# NOTE(review): examples elsewhere in this document pattern-match on
# `:agent_rate_limit_exceeded` and `:agent_resource_limit_exceeded`, which are
# not members of the `error_type` union below — confirm whether the union
# should be widened or the examples should use the existing atoms.
@type agent_error :: %Foundation.Types.Error{
code: 3000..3999,
error_type: :agent_not_found | :agent_unhealthy | :agent_capability_missing |
:agent_resource_exhausted | :agent_coordination_failed,
message: String.t(),
severity: :low | :medium | :high | :critical,
context: map(),
# nil when no agent context is attached to the error
agent_context: agent_context() | nil
}
1. ProcessRegistry Agent-Aware API
Enhanced Registration API
defmodule Foundation.ProcessRegistry.AgentAPI do
  @moduledoc """
  Agent-aware extensions to Foundation ProcessRegistry.

  Provides enhanced registration and discovery capabilities for agents
  while maintaining full backward compatibility with existing APIs.
  """

  # Registry namespace used by every function in this module, as described in
  # the `register_agent/4` parameter docs: `:production` or `{:test, ref}`.
  # NOTE(review): `ref` is presumably a reference from `make_ref/0` — confirm
  # against the existing ProcessRegistry namespace type.
  @type namespace :: :production | {:test, reference()}

  @doc """
  Register an agent with comprehensive metadata.

  Extends the basic registration with agent-specific metadata including
  capabilities, health status, resource usage, and coordination state.

  ## Parameters

  - `namespace`: Registry namespace (:production, {:test, ref})
  - `agent_id`: Unique identifier for the agent
  - `pid`: Process ID of the agent
  - `agent_metadata`: Agent-specific metadata and state information

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      agent_metadata = %{
        type: :agent,
        capabilities: [:nlp, :classification, :generation],
        health: :healthy,
        state: :ready,
        resource_usage: %{memory: 0.7, cpu: 0.4, network: 12, storage: 0.3, custom: %{}},
        coordination_state: %{
          active_consensus: [],
          active_barriers: [],
          held_locks: [],
          leadership_roles: []
        },
        last_health_check: DateTime.utc_now(),
        configuration: %{model_type: "transformer"},
        metadata: %{version: "1.0", deployment: "production"}
      }

      :ok = ProcessRegistry.register_agent(
        :production,
        :ml_agent_1,
        pid,
        agent_metadata
      )
  """
  @spec register_agent(namespace(), agent_id(), pid(), agent_metadata()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def register_agent(namespace, agent_id, pid, agent_metadata)

  @doc """
  Look up an agent with full metadata.

  Returns the agent process and complete metadata including current
  health status, capabilities, and coordination state.

  ## Returns

  `{:ok, {pid, metadata}}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      {:ok, {pid, metadata}} = ProcessRegistry.lookup_agent(:production, :ml_agent_1)

      # Access agent capabilities
      capabilities = metadata.capabilities

      # Check agent health
      if metadata.health == :healthy do
        # Proceed with agent interaction
      end
  """
  @spec lookup_agent(namespace(), agent_id()) ::
          {:ok, {pid(), agent_metadata()}} | {:error, Foundation.Types.Error.t()}
  def lookup_agent(namespace, agent_id)

  @doc """
  Find agents by capability.

  Returns all agents that have the specified capability, useful for
  dynamic agent discovery and load balancing.

  ## Returns

  A list of `{agent_id, pid, metadata}` tuples (not wrapped in `{:ok, _}`).

  ## Examples

      # Find all NLP-capable agents
      nlp_agents = ProcessRegistry.lookup_agents_by_capability(:production, :nlp)

      # Find coordination agents
      coordinators = ProcessRegistry.lookup_agents_by_capability(:production, :coordination)
  """
  @spec lookup_agents_by_capability(namespace(), capability()) ::
          [{agent_id(), pid(), agent_metadata()}]
  def lookup_agents_by_capability(namespace, capability)

  @doc """
  Find agents by health status.

  Returns all agents matching the specified health status, useful for
  health monitoring and load balancing decisions.

  ## Returns

  A list of `{agent_id, pid, metadata}` tuples (not wrapped in `{:ok, _}`).

  ## Examples

      # Find all healthy agents
      healthy_agents = ProcessRegistry.lookup_agents_by_health(:production, :healthy)

      # Find degraded agents for maintenance
      degraded_agents = ProcessRegistry.lookup_agents_by_health(:production, :degraded)
  """
  @spec lookup_agents_by_health(namespace(), health_status()) ::
          [{agent_id(), pid(), agent_metadata()}]
  def lookup_agents_by_health(namespace, health_status)

  @doc """
  Update agent metadata.

  Updates the metadata for an existing agent, typically used for
  health status updates, capability changes, or state transitions.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Update agent health status
      updated_metadata = %{existing_metadata |
        health: :degraded,
        last_health_check: DateTime.utc_now()
      }

      :ok = ProcessRegistry.update_agent_metadata(
        :production,
        :ml_agent_1,
        updated_metadata
      )
  """
  @spec update_agent_metadata(namespace(), agent_id(), agent_metadata()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def update_agent_metadata(namespace, agent_id, agent_metadata)

  @doc """
  Get comprehensive agent statistics.

  Returns detailed statistics about agents in the registry including
  health distribution, capability distribution, and resource usage.

  ## Examples

      stats = ProcessRegistry.get_agent_stats(:production)
      # => %{
      #   total_agents: 45,
      #   health_distribution: %{healthy: 40, degraded: 4, unhealthy: 1},
      #   capability_distribution: %{nlp: 15, coordination: 5, inference: 25},
      #   average_resource_usage: %{memory: 0.65, cpu: 0.45}
      # }
  """
  @spec get_agent_stats(namespace()) :: %{
          total_agents: non_neg_integer(),
          health_distribution: %{health_status() => non_neg_integer()},
          capability_distribution: %{capability() => non_neg_integer()},
          average_resource_usage: resource_usage()
        }
  def get_agent_stats(namespace)
end
2. Circuit Breaker Agent-Aware API
defmodule Foundation.Infrastructure.AgentCircuitBreaker.API do
  @moduledoc """
  Agent-aware circuit breaker API that integrates agent health and
  resource status into circuit breaker decisions.
  """

  # Identifier under which a circuit breaker is started and later addressed.
  @type circuit_name :: atom()

  # Keyword options accepted by `start_agent_breaker/2`.
  # NOTE(review): `recovery_timeout` and `health_check_interval` are presumably
  # milliseconds (the example passes 60_000) — confirm.
  @type circuit_options :: [
          agent_id: agent_id(),
          capability: capability(),
          resource_thresholds: %{
            memory: float(),
            cpu: float()
          },
          failure_threshold: pos_integer(),
          recovery_timeout: pos_integer(),
          health_check_interval: pos_integer()
        ]

  @doc """
  Start an agent-aware circuit breaker.

  Creates a circuit breaker that considers both traditional failure patterns
  and agent-specific health metrics when making decisions.

  ## Parameters

  - `circuit_name`: Unique identifier for the circuit breaker
  - `options`: Circuit breaker configuration with agent context

  ## Returns

  `{:ok, pid}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      {:ok, pid} = AgentCircuitBreaker.start_agent_breaker(
        :ml_inference_service,
        agent_id: :ml_agent_1,
        capability: :inference,
        resource_thresholds: %{memory: 0.8, cpu: 0.9},
        failure_threshold: 5,
        recovery_timeout: 60_000
      )
  """
  @spec start_agent_breaker(circuit_name(), circuit_options()) ::
          {:ok, pid()} | {:error, Foundation.Types.Error.t()}
  def start_agent_breaker(circuit_name, options)

  @doc """
  Execute operation with agent-aware circuit protection.

  Executes the operation through the circuit breaker, considering both
  circuit state and agent health when making execution decisions.

  ## Parameters

  - `circuit_name`: Circuit breaker identifier
  - `agent_id`: Agent performing the operation
  - `operation`: Function to execute (0-arity)
  - `metadata`: Additional context for telemetry and logging (defaults to `%{}`)

  ## Returns

  `{:ok, result}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      result = AgentCircuitBreaker.execute_with_agent(
        :ml_service,
        :ml_agent_1,
        fn -> perform_ml_inference(data) end,
        %{request_id: "req_123", priority: :high}
      )

      case result do
        {:ok, inference_result} ->
          # Process successful result
          inference_result

        {:error, %{error_type: :circuit_breaker_open}} ->
          # Circuit is open due to failures or agent health
          fallback_response()

        {:error, %{error_type: :agent_unhealthy}} ->
          # Agent health check failed
          route_to_healthy_agent()
      end
  """
  @spec execute_with_agent(circuit_name(), agent_id(), (() -> any()), map()) ::
          {:ok, any()} | {:error, Foundation.Types.Error.t()}
  def execute_with_agent(circuit_name, agent_id, operation, metadata \\ %{})

  @doc """
  Get agent circuit breaker status.

  Returns detailed status including circuit state, agent health,
  failure counts, and resource utilization.

  ## Returns

  `{:ok, status_map}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      {:ok, status} = AgentCircuitBreaker.get_agent_status(
        :ml_service,
        :ml_agent_1
      )
      # => %{
      #   circuit_name: :ml_service,
      #   agent_id: :ml_agent_1,
      #   circuit_state: :closed,
      #   agent_health: :healthy,
      #   failure_count: 2,
      #   resource_usage: %{memory: 0.7, cpu: 0.4},
      #   last_health_check: ~U[2024-01-01 12:00:00Z]
      # }
  """
  @spec get_agent_status(circuit_name(), agent_id()) ::
          {:ok, map()} | {:error, Foundation.Types.Error.t()}
  def get_agent_status(circuit_name, agent_id)

  @doc """
  Reset agent circuit breaker.

  Forces the circuit to close and resets failure counts, useful for
  manual recovery or after resolving underlying issues.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      :ok = AgentCircuitBreaker.reset_agent_circuit(:ml_service, :ml_agent_1)
  """
  @spec reset_agent_circuit(circuit_name(), agent_id()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def reset_agent_circuit(circuit_name, agent_id)
end
3. Rate Limiter Agent-Aware API
defmodule Foundation.Infrastructure.AgentRateLimiter.API do
  @moduledoc """
  Agent-aware rate limiting with capability-based throttling and
  health-adaptive rate limits.
  """

  # Name of the kind of operation being throttled (e.g. :inference_request).
  @type operation_type :: atom()

  # {count, window_ms}: at most `count` operations per window of `window_ms` milliseconds.
  @type rate_limit :: {pos_integer(), pos_integer()}

  @doc """
  Check rate limit with agent context.

  Evaluates rate limits considering agent capabilities, health status,
  and current resource utilization to make intelligent throttling decisions.

  ## Parameters

  - `agent_id`: Agent requesting the operation
  - `operation_type`: Type of operation being rate limited
  - `metadata`: Additional context for rate limiting decisions (defaults to `%{}`)

  ## Returns

  `:ok` when the operation may proceed, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      case AgentRateLimiter.check_rate_with_agent(
             :ml_agent_1,
             :inference_request,
             %{model: "gpt-4", complexity: :high}
           ) do
        :ok ->
          # Proceed with operation
          perform_inference()

        {:error, %{error_type: :agent_rate_limit_exceeded}} ->
          # Rate limit exceeded for this agent
          queue_request_for_later()

        {:error, %{error_type: :agent_unhealthy}} ->
          # Agent health too poor for operation
          route_to_healthy_agent()
      end
  """
  @spec check_rate_with_agent(agent_id(), operation_type(), map()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def check_rate_with_agent(agent_id, operation_type, metadata \\ %{})

  @doc """
  Execute operation with rate limiting protection.

  Combines rate limit checking with operation execution, providing
  automatic backpressure based on agent context.

  ## Returns

  `{:ok, result}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      result = AgentRateLimiter.execute_with_limit(
        :ml_agent_1,
        :training_step,
        fn -> execute_training_step(batch_data) end,
        %{batch_size: 32, epoch: 5}
      )
  """
  @spec execute_with_limit(agent_id(), operation_type(), (() -> any()), map()) ::
          {:ok, any()} | {:error, Foundation.Types.Error.t()}
  def execute_with_limit(agent_id, operation_type, operation, metadata \\ %{})

  @doc """
  Update agent-specific rate limits.

  Allows dynamic adjustment of rate limits based on agent performance,
  system load, or operational requirements.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Increase rate limit for high-performing agent
      :ok = AgentRateLimiter.update_agent_limits(
        :ml_agent_1,
        :inference,
        {200, 60_000} # 200 requests per minute
      )

      # Decrease rate limit for degraded agent
      :ok = AgentRateLimiter.update_agent_limits(
        :degraded_agent,
        :inference,
        {50, 60_000} # 50 requests per minute
      )
  """
  # NOTE(review): the second argument is typed `capability()` here while the
  # check/execute functions key limits by `operation_type()` — confirm which
  # key the limits are actually stored under.
  @spec update_agent_limits(agent_id(), capability(), rate_limit()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def update_agent_limits(agent_id, capability, rate_limit)

  @doc """
  Get current rate limiting status for an agent.

  Returns detailed information about current usage, limits,
  and agent-specific adjustments.

  ## Returns

  `{:ok, status_map}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      {:ok, status} = AgentRateLimiter.get_agent_status(:ml_agent_1)
      # => %{
      #   agent_id: :ml_agent_1,
      #   current_limits: %{inference: {100, 60_000}, training: {10, 60_000}},
      #   current_usage: %{inference: 45, training: 2},
      #   health_adjustment: 1.0,
      #   resource_adjustment: 0.8
      # }
  """
  @spec get_agent_status(agent_id()) ::
          {:ok, map()} | {:error, Foundation.Types.Error.t()}
  def get_agent_status(agent_id)
end
4. Configuration Server Agent-Aware API
defmodule Foundation.Services.ConfigServer.AgentAPI do
  @moduledoc """
  Agent-aware configuration management with hierarchical overrides
  and agent-specific configuration support.
  """

  # Path into the configuration tree, e.g. [:model, :temperature].
  @type config_path :: [atom() | String.t()]
  @type config_value :: any()

  # Override levels, listed from lowest to highest precedence
  # (global → namespace → agent → runtime).
  @type config_scope :: :global | :namespace | :agent | :runtime

  @doc """
  Get effective configuration for an agent.

  Returns configuration value with agent-specific overrides applied.
  Follows hierarchy: global → namespace → agent → runtime.

  ## Parameters

  - `agent_id`: Agent requesting configuration
  - `config_path`: Path to configuration value
  - `namespace`: Configuration namespace (optional, defaults to :foundation)

  ## Returns

  `{:ok, value}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Get effective model temperature for specific agent
      {:ok, temperature} = ConfigServer.get_effective_config(
        :ml_agent_1,
        [:model, :temperature]
      )

      # Get agent-specific inference configuration
      {:ok, config} = ConfigServer.get_effective_config(
        :ml_agent_1,
        [:inference, :batch_size]
      )
  """
  @spec get_effective_config(agent_id(), config_path(), atom()) ::
          {:ok, config_value()} | {:error, Foundation.Types.Error.t()}
  def get_effective_config(agent_id, config_path, namespace \\ :foundation)

  @doc """
  Set agent-specific configuration override.

  Creates an agent-specific configuration override that takes precedence
  over global and namespace configurations for the specified agent.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Set agent-specific model temperature
      :ok = ConfigServer.set_agent_config(
        :ml_agent_1,
        [:model, :temperature],
        0.8
      )

      # Set agent-specific resource limits
      :ok = ConfigServer.set_agent_config(
        :ml_agent_1,
        [:resources, :memory_limit],
        2_000_000_000 # 2GB
      )
  """
  @spec set_agent_config(agent_id(), config_path(), config_value(), atom()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def set_agent_config(agent_id, config_path, config_value, namespace \\ :foundation)

  @doc """
  Subscribe to agent-specific configuration changes.

  The calling process will receive messages when configuration
  affecting the specified agent changes.

  `config_path` defaults to `nil`; the first example below shows that form
  being used to subscribe to all changes affecting the agent.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Subscribe to all config changes affecting agent
      :ok = ConfigServer.subscribe_to_agent_changes(:ml_agent_1)

      # Subscribe to specific config path for agent
      :ok = ConfigServer.subscribe_to_agent_changes(
        :ml_agent_1,
        [:model, :temperature]
      )

      # Receive change notifications
      receive do
        {:agent_config_changed, agent_id, config_path, old_value, new_value} ->
          handle_config_change(agent_id, config_path, new_value)
      end
  """
  @spec subscribe_to_agent_changes(agent_id(), config_path() | nil, atom()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def subscribe_to_agent_changes(agent_id, config_path \\ nil, namespace \\ :foundation)

  @doc """
  Get configuration change history for an agent.

  Returns recent configuration changes that affected the specified agent,
  useful for debugging and auditing. `limit` defaults to 100.

  ## Returns

  `{:ok, changes}` with a list of change maps, or `{:error, error}` where
  `error` is a `Foundation.Types.Error` struct.

  ## Examples

      {:ok, changes} = ConfigServer.get_agent_config_history(:ml_agent_1, 10)
      # => [
      #   %{
      #     path: [:model, :temperature],
      #     old_value: 0.7,
      #     new_value: 0.8,
      #     scope: :agent,
      #     timestamp: ~U[2024-01-01 12:00:00Z]
      #   }
      # ]
  """
  @spec get_agent_config_history(agent_id(), non_neg_integer(), atom()) ::
          {:ok, [map()]} | {:error, Foundation.Types.Error.t()}
  def get_agent_config_history(agent_id, limit \\ 100, namespace \\ :foundation)
end
5. Event Store Agent-Aware API
defmodule Foundation.Services.EventStore.AgentAPI do
  @moduledoc """
  Agent-aware event storage with correlation, real-time subscriptions,
  and agent-specific event management.
  """

  # Event kind tag, e.g. :agent_task_completed.
  @type event_type :: atom()

  # Shared identifier that ties together events belonging to one workflow.
  @type correlation_id :: String.t()

  @doc """
  Store event with automatic agent context enrichment.

  Events are automatically enriched with agent context if agent_id
  is provided, including current health, capabilities, and state.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      event = %{
        type: :agent_task_completed,
        agent_id: :ml_agent_1,
        data: %{
          task_type: :inference,
          duration: 250,
          success: true,
          result_size: 1024
        },
        correlation_id: "task_batch_456"
      }

      :ok = EventStore.store_agent_event(event)
  """
  @spec store_agent_event(map()) :: :ok | {:error, Foundation.Types.Error.t()}
  def store_agent_event(event_data)

  @doc """
  Query events by agent with rich filtering.

  Returns events related to the specified agent with optional filtering
  by event type, time range, and correlation ID.

  ## Parameters

  - `agent_id`: Agent to query events for
  - `filter`: Optional filter criteria (defaults to `%{}`)
    - `:type` - Filter by event type
    - `:since` - Events after this timestamp
    - `:until` - Events before this timestamp
    - `:correlation_id` - Events with this correlation ID
    - `:limit` - Maximum number of events

  ## Returns

  `{:ok, events}` with a list of event maps, or `{:error, error}` where
  `error` is a `Foundation.Types.Error` struct.

  ## Examples

      # Get all events for agent in last hour
      one_hour_ago = DateTime.add(DateTime.utc_now(), -3600)

      {:ok, events} = EventStore.query_agent_events(:ml_agent_1, %{
        since: one_hour_ago,
        limit: 100
      })

      # Get task completion events for agent
      {:ok, task_events} = EventStore.query_agent_events(:ml_agent_1, %{
        type: :agent_task_completed,
        limit: 50
      })
  """
  @spec query_agent_events(agent_id(), map()) ::
          {:ok, [map()]} | {:error, Foundation.Types.Error.t()}
  def query_agent_events(agent_id, filter \\ %{})

  @doc """
  Subscribe to real-time agent events.

  The calling process will receive messages of the form:
  `{:agent_event, subscription_id, event}` for matching events.

  ## Returns

  `{:ok, subscription_id}` (a string) on success, or `{:error, error}` where
  `error` is a `Foundation.Types.Error` struct.

  ## Examples

      # Subscribe to all events for agent
      {:ok, sub_id} = EventStore.subscribe_to_agent_events(:ml_agent_1)

      # Subscribe to specific event types for agent
      {:ok, sub_id} = EventStore.subscribe_to_agent_events(:ml_agent_1, %{
        types: [:agent_health_changed, :agent_task_completed]
      })

      # Receive events
      receive do
        {:agent_event, ^sub_id, event} ->
          handle_agent_event(event)
      end
  """
  @spec subscribe_to_agent_events(agent_id(), map()) ::
          {:ok, String.t()} | {:error, Foundation.Types.Error.t()}
  def subscribe_to_agent_events(agent_id, filter \\ %{})

  @doc """
  Get correlated events across multiple agents.

  Returns all events with the specified correlation ID, useful for
  tracing multi-agent workflows and coordination scenarios.

  ## Returns

  `{:ok, events}` with a list of event maps, or `{:error, error}` where
  `error` is a `Foundation.Types.Error` struct.

  ## Examples

      # Get all events for a coordination session
      {:ok, events} = EventStore.get_agent_correlated_events("coordination_session_789")

      # Events might include:
      # - consensus_started (coordinator agent)
      # - consensus_vote (participant agents)
      # - consensus_completed (coordinator agent)
  """
  @spec get_agent_correlated_events(correlation_id()) ::
          {:ok, [map()]} | {:error, Foundation.Types.Error.t()}
  def get_agent_correlated_events(correlation_id)

  @doc """
  Get agent event statistics and analytics.

  Returns comprehensive statistics about events for the specified agent,
  including event type distribution, frequency, and trends.

  ## Returns

  `{:ok, stats_map}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      {:ok, stats} = EventStore.get_agent_event_stats(:ml_agent_1)
      # => %{
      #   total_events: 1247,
      #   event_type_distribution: %{
      #     agent_task_completed: 1100,
      #     agent_health_changed: 12,
      #     agent_error_occurred: 5
      #   },
      #   events_per_hour: 24.5,
      #   most_recent_event: ~U[2024-01-01 12:00:00Z]
      # }
  """
  @spec get_agent_event_stats(agent_id()) ::
          {:ok, map()} | {:error, Foundation.Types.Error.t()}
  def get_agent_event_stats(agent_id)
end
6. Telemetry Service Agent-Aware API
defmodule Foundation.Services.TelemetryService.AgentAPI do
  @moduledoc """
  Agent-aware telemetry with metric aggregation, alerting,
  and performance analytics for multi-agent systems.
  """

  # Metric name as a path of atoms, e.g. [:agent, :task, :duration].
  @type metric_name :: [atom()]
  @type metric_value :: number()
  @type metric_type :: :counter | :gauge | :histogram

  @doc """
  Record agent-specific metric.

  Records a metric with automatic agent context enrichment for
  comprehensive agent performance monitoring.

  ## Returns

  Always `:ok`; the spec declares no error return.

  ## Examples

      # Record agent task duration
      TelemetryService.record_agent_metric(
        :ml_agent_1,
        [:agent, :task, :duration],
        250.5,
        :histogram,
        %{task_type: :inference, model: "gpt-4"}
      )

      # Record agent resource usage
      TelemetryService.record_agent_metric(
        :ml_agent_1,
        [:agent, :resource, :memory],
        0.75,
        :gauge
      )
  """
  @spec record_agent_metric(agent_id(), metric_name(), metric_value(), metric_type(), map()) ::
          :ok
  def record_agent_metric(agent_id, metric_name, value, type, metadata \\ %{})

  @doc """
  Get aggregated metrics for an agent.

  Returns processed and aggregated metrics for the specified agent,
  including trends, averages, and performance indicators.

  ## Returns

  `{:ok, metrics}` with a list of metric maps, or `{:error, error}` where
  `error` is a `Foundation.Types.Error` struct.

  ## Examples

      {:ok, metrics} = TelemetryService.get_agent_metrics(:ml_agent_1, %{
        since: DateTime.add(DateTime.utc_now(), -3600), # Last hour
        metrics: [[:agent, :task, :duration], [:agent, :resource, :memory]]
      })
      # => [
      #   %{
      #     name: [:agent, :task, :duration],
      #     type: :histogram,
      #     count: 45,
      #     avg: 225.7,
      #     min: 150.2,
      #     max: 401.3,
      #     percentiles: %{p50: 220.1, p95: 350.8, p99: 385.2}
      #   }
      # ]
  """
  @spec get_agent_metrics(agent_id(), map()) ::
          {:ok, [map()]} | {:error, Foundation.Types.Error.t()}
  def get_agent_metrics(agent_id, filter \\ %{})

  @doc """
  Create agent-specific alert.

  Sets up automatic alerting for agent metrics that exceed
  specified thresholds or meet certain conditions.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Alert on high memory usage
      alert_spec = %{
        name: :agent_high_memory,
        agent_id: :ml_agent_1,
        metric_path: [:agent, :resource, :memory],
        threshold: 0.9,
        condition: :greater_than,
        window: 300_000 # 5 minutes
      }

      :ok = TelemetryService.create_agent_alert(alert_spec)

      # Alert on low task completion rate
      performance_alert = %{
        name: :agent_low_performance,
        agent_id: :ml_agent_1,
        metric_path: [:agent, :task, :completion_rate],
        threshold: 0.8,
        condition: :less_than,
        window: 600_000 # 10 minutes
      }

      :ok = TelemetryService.create_agent_alert(performance_alert)
  """
  @spec create_agent_alert(map()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def create_agent_alert(alert_specification)

  @doc """
  Get agent performance analytics.

  Returns comprehensive performance analytics for the agent including
  trends, efficiency metrics, and comparative analysis.

  ## Returns

  `{:ok, analytics_map}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      {:ok, analytics} = TelemetryService.get_agent_analytics(:ml_agent_1, %{
        time_range: :last_24_hours,
        include_trends: true,
        compare_to_baseline: true
      })
      # => %{
      #   performance_score: 0.87,
      #   efficiency_trend: :improving,
      #   resource_utilization: %{
      #     memory: %{avg: 0.68, trend: :stable},
      #     cpu: %{avg: 0.45, trend: :decreasing}
      #   },
      #   task_metrics: %{
      #     completion_rate: 0.94,
      #     avg_duration: 235.8,
      #     error_rate: 0.02
      #   },
      #   recommendations: [
      #     "Consider increasing task batch size for better efficiency",
      #     "Memory usage is optimal"
      #   ]
      # }
  """
  @spec get_agent_analytics(agent_id(), map()) ::
          {:ok, map()} | {:error, Foundation.Types.Error.t()}
  def get_agent_analytics(agent_id, options \\ %{})
end
7. Coordination Service Agent-Aware API
defmodule Foundation.Coordination.Service.AgentAPI do
  @moduledoc """
  Agent-aware coordination with intelligent participant selection,
  health-based decision making, and resource-aware coordination.
  """

  # Identifier of a coordination session (consensus round, resource coordination, ...).
  @type coordination_id :: atom() | String.t()

  # Voting strategies selectable through the `:strategy` option of `agent_consensus/4`.
  @type consensus_strategy :: :unanimous | :majority | :quorum | :health_weighted

  @doc """
  Perform agent-aware consensus.

  Initiates consensus that considers agent health, capabilities, and
  current load when making decisions and selecting participants.

  `participants` is either an explicit list of agent ids or `:auto_select`.

  ## Returns

  `{:ok, result_map}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Consensus with health-weighted voting
      {:ok, result} = CoordinationService.agent_consensus(
        :model_selection,
        [:ml_agent_1, :ml_agent_2, :ml_agent_3],
        %{models: ["gpt-4", "claude-3"], task: :classification},
        %{
          strategy: :health_weighted,
          timeout: 10_000,
          min_healthy_participants: 2,
          required_capability: :inference
        }
      )

      # Capability-based consensus
      {:ok, result} = CoordinationService.agent_consensus(
        :resource_allocation,
        :auto_select, # Automatically select capable agents
        %{resources: [:gpu_1, :gpu_2], task: :training},
        %{
          strategy: :majority,
          required_capability: :training,
          min_participants: 3,
          exclude_degraded: true
        }
      )
  """
  @spec agent_consensus(coordination_id(), [agent_id()] | :auto_select, any(), map()) ::
          {:ok, map()} | {:error, Foundation.Types.Error.t()}
  def agent_consensus(coordination_id, participants, proposal, options \\ %{})

  @doc """
  Create agent barrier with health monitoring.

  Creates a synchronization barrier that monitors participant health
  and can adapt to agent failures or degradation.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Barrier with health monitoring
      :ok = CoordinationService.create_agent_barrier(
        :training_epoch_complete,
        [:trainer_1, :trainer_2, :trainer_3],
        %{
          health_check_interval: 5_000,
          auto_exclude_unhealthy: true,
          min_healthy_participants: 2
        }
      )
  """
  @spec create_agent_barrier(atom(), [agent_id()], map()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def create_agent_barrier(barrier_id, participants, options \\ %{})

  @doc """
  Intelligent leader election based on agent capabilities.

  Elects a leader considering agent health, capabilities, current load,
  and coordination history for optimal leadership selection.

  `candidates` is either an explicit list of agent ids or `:auto_select`.

  ## Returns

  `{:ok, agent_id}` of the elected leader, or `{:error, error}` where `error`
  is a `Foundation.Types.Error` struct.

  ## Examples

      # Election with capability requirements
      {:ok, leader} = CoordinationService.elect_agent_leader(
        [:agent_1, :agent_2, :agent_3],
        %{
          required_capabilities: [:coordination, :monitoring],
          prefer_healthy: true,
          consider_load: true,
          exclude_overloaded: true
        }
      )

      # Election for specific role
      {:ok, coordinator} = CoordinationService.elect_agent_leader(
        :auto_select, # Select from all available agents
        %{
          role: :training_coordinator,
          required_capabilities: [:training, :coordination],
          min_resource_availability: %{memory: 0.3, cpu: 0.4}
        }
      )
  """
  @spec elect_agent_leader([agent_id()] | :auto_select, map()) ::
          {:ok, agent_id()} | {:error, Foundation.Types.Error.t()}
  def elect_agent_leader(candidates, options \\ %{})

  @doc """
  Resource-aware coordination with allocation management.

  Performs coordination while considering and managing resource
  allocation across participating agents.

  ## Returns

  `{:ok, result_map}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Coordination with resource allocation
      {:ok, result} = CoordinationService.coordinate_with_resources(
        :distributed_training,
        [:trainer_1, :trainer_2, :trainer_3],
        %{
          task: :model_training,
          dataset_size: 1_000_000,
          batch_size: 1000
        },
        %{
          resource_requirements: %{memory: 0.8, cpu: 0.6},
          auto_allocate_resources: true,
          fail_if_insufficient: true,
          coordination_type: :consensus
        }
      )
  """
  @spec coordinate_with_resources(coordination_id(), [agent_id()], any(), map()) ::
          {:ok, map()} | {:error, Foundation.Types.Error.t()}
  def coordinate_with_resources(coordination_id, participants, task_spec, options \\ %{})

  @doc """
  Get coordination analytics for agents.

  Returns detailed analytics about agent coordination performance,
  participation rates, and collaboration effectiveness.

  ## Returns

  `{:ok, analytics_map}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      {:ok, analytics} = CoordinationService.get_agent_coordination_analytics(
        [:ml_agent_1, :ml_agent_2],
        %{time_range: :last_7_days}
      )
      # => %{
      #   agents: %{
      #     ml_agent_1: %{
      #       participation_rate: 0.89,
      #       leadership_rate: 0.25,
      #       consensus_agreement_rate: 0.93,
      #       avg_response_time: 150.5
      #     }
      #   },
      #   collaboration_matrix: %{
      #     {:ml_agent_1, :ml_agent_2} => %{
      #       coordination_count: 45,
      #       success_rate: 0.96,
      #       avg_duration: 2500
      #     }
      #   }
      # }
  """
  @spec get_agent_coordination_analytics([agent_id()], map()) ::
          {:ok, map()} | {:error, Foundation.Types.Error.t()}
  def get_agent_coordination_analytics(agent_ids, options \\ %{})
end
8. Resource Manager API
defmodule Foundation.Infrastructure.ResourceManager.API do
  @moduledoc """
  Comprehensive resource management for agents with allocation,
  monitoring, and predictive management capabilities.
  """

  # Absolute resource quantities for an agent.
  #
  # Every key is optional so the same type describes both a full limit set
  # (see `register_agent/3`) and the partial maps passed to
  # `allocate_resources/2`, `release_resources/2` and `update_agent_limits/2`
  # in the examples below (e.g. `%{memory: 4_000_000_000}`).
  @type resource_limits :: %{
          # bytes
          optional(:memory) => non_neg_integer(),
          # number of cores
          optional(:cpu) => float(),
          optional(:network_connections) => non_neg_integer(),
          # bytes
          optional(:storage) => non_neg_integer(),
          # custom resources
          optional(:custom) => %{atom() => any()}
        }

  @doc """
  Register agent with resource requirements.

  Registers an agent with its resource requirements and limits,
  enabling comprehensive resource management and monitoring.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      resource_limits = %{
        memory: 2_000_000_000, # 2GB
        cpu: 2.0, # 2 cores
        network_connections: 100,
        storage: 10_000_000_000, # 10GB
        custom: %{
          gpu_memory: 4_000_000_000, # 4GB GPU
          model_slots: 5
        }
      }

      :ok = ResourceManager.register_agent(
        :ml_agent_1,
        resource_limits,
        %{priority: :high, preemptible: false}
      )
  """
  @spec register_agent(agent_id(), resource_limits(), map()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def register_agent(agent_id, resource_limits, metadata \\ %{})

  @doc """
  Check resource availability for agent operation.

  Verifies if sufficient resources are available for the requested
  operation, considering current allocation and system constraints.

  ## Returns

  `:ok` when the resources are available, or `{:error, error}` where `error`
  is a `Foundation.Types.Error` struct.

  ## Examples

      # Check if agent can allocate additional memory
      case ResourceManager.check_resource_availability(
             :ml_agent_1,
             :memory,
             500_000_000 # 500MB
           ) do
        :ok ->
          # Sufficient memory available
          proceed_with_allocation()

        {:error, %{error_type: :agent_resource_limit_exceeded}} ->
          # Agent would exceed its memory limit
          cleanup_or_wait()

        {:error, %{error_type: :system_resource_critical}} ->
          # System memory critically low
          defer_operation()
      end
  """
  @spec check_resource_availability(agent_id(), atom(), any()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def check_resource_availability(agent_id, resource_type, required_amount)

  @doc """
  Allocate resources to agent.

  Allocates specified resources to the agent and updates tracking
  to prevent over-allocation and enable monitoring.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Allocate resources for training task
      :ok = ResourceManager.allocate_resources(:ml_agent_1, %{
        memory: 1_000_000_000, # 1GB
        cpu: 1.5, # 1.5 cores
        custom: %{gpu_memory: 2_000_000_000} # 2GB GPU
      })
  """
  @spec allocate_resources(agent_id(), resource_limits()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def allocate_resources(agent_id, resource_allocation)

  @doc """
  Release resources from agent.

  Releases previously allocated resources, making them available
  for other agents and updating system resource tracking.

  ## Returns

  Always `:ok`; the spec declares no error return.

  ## Examples

      # Release resources after task completion
      :ok = ResourceManager.release_resources(:ml_agent_1, %{
        memory: 1_000_000_000,
        cpu: 1.5,
        custom: %{gpu_memory: 2_000_000_000}
      })
  """
  @spec release_resources(agent_id(), resource_limits()) :: :ok
  def release_resources(agent_id, resource_deallocation)

  @doc """
  Get comprehensive resource status for agent.

  Returns detailed resource utilization, allocation, and health
  information for the specified agent.

  ## Returns

  `{:ok, status_map}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      {:ok, status} = ResourceManager.get_agent_resource_status(:ml_agent_1)
      # => %{
      #   agent_id: :ml_agent_1,
      #   resource_limits: %{memory: 2_000_000_000, cpu: 2.0},
      #   current_usage: %{memory: 1_200_000_000, cpu: 1.2},
      #   utilization_percentage: %{memory: 0.6, cpu: 0.6},
      #   health_status: :healthy,
      #   recommendations: ["Consider increasing CPU allocation for optimal performance"]
      # }
  """
  @spec get_agent_resource_status(agent_id()) ::
          {:ok, map()} | {:error, Foundation.Types.Error.t()}
  def get_agent_resource_status(agent_id)

  @doc """
  Update agent resource limits dynamically.

  Allows runtime adjustment of agent resource limits based on
  changing requirements or performance characteristics.

  ## Returns

  `:ok` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      # Increase memory limit for high-performing agent
      :ok = ResourceManager.update_agent_limits(:ml_agent_1, %{
        memory: 4_000_000_000 # Increase to 4GB
      })

      # Decrease limits for underperforming agent
      :ok = ResourceManager.update_agent_limits(:slow_agent, %{
        cpu: 0.5 # Reduce to 0.5 cores
      })
  """
  @spec update_agent_limits(agent_id(), resource_limits()) ::
          :ok | {:error, Foundation.Types.Error.t()}
  def update_agent_limits(agent_id, new_limits)

  @doc """
  Get system-wide resource status and analytics.

  Returns comprehensive system resource status including allocation
  efficiency, utilization trends, and optimization recommendations.

  ## Returns

  `{:ok, status_map}` on success, or `{:error, error}` where `error` is a
  `Foundation.Types.Error` struct.

  ## Examples

      {:ok, system_status} = ResourceManager.get_system_resource_status()
      # => %{
      #   total_resources: %{memory: 32_000_000_000, cpu: 16.0},
      #   allocated_resources: %{memory: 24_000_000_000, cpu: 12.5},
      #   utilization: %{memory: 0.75, cpu: 0.78},
      #   agent_count: 12,
      #   resource_efficiency: 0.87,
      #   recommendations: [
      #     "Consider rebalancing CPU allocation",
      #     "Memory utilization is optimal"
      #   ]
      # }
  """
  @spec get_system_resource_status() ::
          {:ok, map()} | {:error, Foundation.Types.Error.t()}
  def get_system_resource_status()
end
Backward Compatibility Guarantees
Preserved APIs (100% Compatible)
All existing Foundation APIs remain completely unchanged:
- ProcessRegistry: `register/4`, `lookup/2`, `unregister/2`, `list_all/1`
- CircuitBreaker: `start_fuse_instance/2`, `execute/3`, `get_status/1`
- RateLimiter: `check_rate/4`, `execute_with_limit/6`, `get_status/2`
- ConfigServer: `get/0`, `get/1`, `update/2`, `subscribe/1`
- EventStore: `store/1`, `query/1`, `subscribe/1`
- TelemetryService: `emit/3`, `get_stats/0`
- Coordination: All existing primitive APIs
Migration Strategy
- Additive Enhancement: New functionality through new modules and functions
- Optional Parameters: New optional parameters for enhanced functionality
- Graceful Fallback: Agent-unaware calls work exactly as before
- Progressive Adoption: Services can adopt agent-awareness incrementally
Example Usage Patterns
Basic Agent Registration and Discovery
# Register an ML agent.
# The metadata map carries every key of the agent_metadata contract; `type`
# uses one of the declared values (:agent | :service | :coordination_service).
agent_metadata = %{
  type: :agent,
  capabilities: [:nlp, :classification],
  health: :healthy,
  state: :ready,
  resource_usage: %{memory: 0.6, cpu: 0.4, network: 0, storage: 0.0, custom: %{}},
  coordination_state: %{
    active_consensus: [],
    active_barriers: [],
    held_locks: [],
    leadership_roles: []
  },
  last_health_check: DateTime.utc_now(),
  configuration: %{},
  metadata: %{}
}

:ok = ProcessRegistry.register_agent(:production, :ml_agent_1, pid, agent_metadata)

# Find agents by capability (returns a list of {agent_id, pid, metadata} tuples)
nlp_agents = ProcessRegistry.lookup_agents_by_capability(:production, :nlp)

# Get effective configuration for agent
{:ok, model_config} = ConfigServer.get_effective_config(:ml_agent_1, [:model, :config])
Protected Agent Operations
# Set up infrastructure protection for agent
{:ok, _} =
  AgentCircuitBreaker.start_agent_breaker(
    :ml_service,
    agent_id: :ml_agent_1,
    capability: :inference
  )

# Execute protected operation with rate limiting and circuit breaking.
# No `else` clause is needed: when a `with` clause does not match, the
# non-matching value (here an `{:error, _}` tuple) is returned unchanged.
result =
  with :ok <- AgentRateLimiter.check_rate_with_agent(:ml_agent_1, :inference),
       {:ok, inference_result} <-
         AgentCircuitBreaker.execute_with_agent(
           :ml_service,
           :ml_agent_1,
           fn -> perform_ml_inference(data) end
         ) do
    {:ok, inference_result}
  end
Agent Coordination
# Intelligent consensus with agent health consideration.
# The proposal and options are named first so the call itself stays short.
consensus_proposal = %{task: :classification, models: ["bert", "roberta"]}

consensus_options = %{
  strategy: :health_weighted,
  required_capability: :classification,
  min_healthy_participants: 3
}

# :auto_select asks the service to pick capable, healthy agents automatically
{:ok, result} =
  CoordinationService.agent_consensus(
    :model_selection,
    :auto_select,
    consensus_proposal,
    consensus_options
  )

# Resource-aware coordination across an explicit set of trainers
trainers = [:trainer_1, :trainer_2, :trainer_3]
training_task = %{dataset: "large_corpus", epochs: 10}

training_options = %{
  resource_requirements: %{memory: 0.8, cpu: 0.7},
  auto_allocate_resources: true
}

{:ok, coordination_result} =
  CoordinationService.coordinate_with_resources(
    :distributed_training,
    trainers,
    training_task,
    training_options
  )
This comprehensive API specification provides a solid foundation for implementing agent-aware capabilities while maintaining complete backward compatibility with the existing Foundation infrastructure.