Based on the comprehensive Foundation 2.0 documentation, here is additional complementary technical material to round out the implementation strategy.
Foundation 2.0: Complementary Technical Documentation
Performance Benchmarking Framework
Benchmark Suite Implementation
# test/benchmarks/foundation_benchmark_suite.ex
defmodule Foundation.BenchmarkSuite do
@moduledoc """
Comprehensive benchmarking framework for Foundation 2.0 performance validation.
Validates that Foundation's overhead is <5% compared to direct tool usage
and that distributed operations meet defined SLAs.
"""
def run_full_suite() do
# Process management benchmarks
run_process_management_benchmarks()
# Messaging throughput benchmarks
run_messaging_benchmarks()
# Service discovery benchmarks
run_service_discovery_benchmarks()
# Cluster formation benchmarks
run_cluster_formation_benchmarks()
# Memory usage analysis
run_memory_benchmarks()
end
def run_process_management_benchmarks() do
Benchee.run(
%{
"Foundation.ProcessManager.start_singleton" => fn ->
Foundation.ProcessManager.start_singleton(BenchmarkWorker, [])
end,
"Direct Horde.DynamicSupervisor + Registry" => fn ->
{:ok, pid} = Horde.DynamicSupervisor.start_child(
Foundation.DistributedSupervisor,
BenchmarkWorker
)
Horde.Registry.register(Foundation.ProcessRegistry, :benchmark_worker, pid)
{:ok, pid}
end,
"Foundation.ProcessManager.lookup_singleton" => fn ->
Foundation.ProcessManager.lookup_singleton(:test_service)
end,
"Direct Horde.Registry.lookup" => fn ->
Horde.Registry.lookup(Foundation.ProcessRegistry, :test_service)
end
},
time: 10,
memory_time: 2,
formatters: [
Benchee.Formatters.HTML,
Benchee.Formatters.Console,
{Benchee.Formatters.JSON, file: "benchmarks/process_management.json"}
]
)
end
def run_messaging_benchmarks() do
# Test different message sizes and channel types
message_sizes = [
{:small, 100}, # 100 bytes
{:medium, 10_240}, # 10KB
{:large, 1_048_576} # 1MB
]
Enum.each(message_sizes, fn {size_name, size} ->
test_data = :crypto.strong_rand_bytes(size)
Benchee.run(
%{
"Foundation.Channels.broadcast (#{size_name})" => fn ->
Foundation.Channels.broadcast(:events, test_data)
end,
"Direct Phoenix.PubSub.broadcast (#{size_name})" => fn ->
Phoenix.PubSub.broadcast(Foundation.PubSub, "foundation:events", test_data)
end,
"Foundation.Channels.broadcast with compression (#{size_name})" => fn ->
Foundation.Channels.broadcast(:events, test_data, compression: true)
end
},
time: 5,
formatters: [
{Benchee.Formatters.JSON, file: "benchmarks/messaging_#{size_name}.json"}
]
)
end)
end
defp run_cluster_formation_benchmarks() do
# Benchmark cluster formation time vs node count
node_counts = [2, 3, 5, 10]
Enum.each(node_counts, fn node_count ->
time = measure_cluster_formation_time(node_count)
File.write!(
"benchmarks/cluster_formation_#{node_count}_nodes.json",
Jason.encode!(%{
node_count: node_count,
formation_time_ms: time,
measured_at: System.system_time(:millisecond)
})
)
end)
end
defp measure_cluster_formation_time(node_count) do
# Implementation would start multiple nodes and measure time to full connectivity
# This is a placeholder that would integrate with test cluster infrastructure
base_time = 5000 # 5 seconds base
additional_time = (node_count - 2) * 2000 # 2 seconds per additional node
base_time + additional_time
end
  # Stubs for the remaining suites referenced in run_full_suite/0; real
  # implementations would follow the same Benchee pattern as above.
  defp run_service_discovery_benchmarks(), do: :ok
  defp run_memory_benchmarks(), do: :ok
end
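The scenarios above start a `BenchmarkWorker`, which is not part of Foundation itself; a minimal sketch of such a worker, just enough for Horde to start and register it, might look like this:

# test/support/benchmark_worker.ex (illustrative helper, not part of Foundation)
defmodule BenchmarkWorker do
  @moduledoc "Minimal GenServer used as the unit of work in process-management benchmarks."
  use GenServer

  def start_link(opts \\ []), do: GenServer.start_link(__MODULE__, opts)

  @impl true
  def init(opts), do: {:ok, opts}
end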
# Mix task for easy benchmark execution
defmodule Mix.Tasks.Foundation.Benchmark do
use Mix.Task
  @shortdoc "Runs Foundation 2.0 benchmark suites"
  def run(args) do
    # Ensure the host application (and Foundation's supervision tree) is running
    Mix.Task.run("app.start")

    case args do
["all"] -> Foundation.BenchmarkSuite.run_full_suite()
["process"] -> Foundation.BenchmarkSuite.run_process_management_benchmarks()
["messaging"] -> Foundation.BenchmarkSuite.run_messaging_benchmarks()
_ ->
IO.puts("Usage: mix foundation.benchmark [all|process|messaging]")
end
end
end
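To turn these suites into a pass/fail gate for the <5% overhead goal, the returned suite can be inspected programmatically. The sketch below assumes Benchee 1.x, where `Benchee.run/2` returns a `%Benchee.Suite{}` whose scenarios expose `run_time_data.statistics.average`, and that the scenario names match the map keys used above:

# Illustrative overhead gate built on top of the suites above (Benchee 1.x assumed)
defmodule Foundation.BenchmarkSuite.OverheadCheck do
  @max_overhead 0.05

  def assert_overhead!(%Benchee.Suite{scenarios: scenarios}, foundation_name, direct_name) do
    avg = fn name ->
      scenario = Enum.find(scenarios, &(&1.name == name))
      scenario.run_time_data.statistics.average
    end

    overhead = avg.(foundation_name) / avg.(direct_name) - 1.0

    if overhead > @max_overhead do
      raise "Foundation overhead #{Float.round(overhead * 100, 1)}% exceeds the 5% budget"
    end

    :ok
  end
end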
Advanced Error Handling & Recovery Patterns
Resilient Operation Patterns
# lib/foundation/resilience.ex
defmodule Foundation.Resilience do
@moduledoc """
Advanced error handling and recovery patterns for distributed operations.
Provides circuit breaker, retry, and fallback patterns specifically
designed for Foundation's distributed architecture.
"""
@doc """
Executes a distributed operation with comprehensive resilience patterns.
Features:
- Circuit breaker for failing operations
- Exponential backoff retry with jitter
- Graceful degradation and fallbacks
- Distributed rate limiting
"""
def with_resilience(operation_name, operation_fn, opts \\ []) do
circuit_breaker = Keyword.get(opts, :circuit_breaker, true)
max_retries = Keyword.get(opts, :max_retries, 3)
fallback_fn = Keyword.get(opts, :fallback)
try do
if circuit_breaker do
execute_with_circuit_breaker(operation_name, operation_fn, opts)
else
execute_with_retry(operation_fn, max_retries, opts)
end
rescue
error ->
case fallback_fn do
nil -> {:error, error}
fallback -> execute_fallback(fallback, error, opts)
end
end
end
defp execute_with_circuit_breaker(operation_name, operation_fn, opts) do
case Foundation.CircuitBreaker.call(operation_name, operation_fn) do
{:ok, result} ->
{:ok, result}
{:error, :circuit_open} ->
Logger.warning("Circuit breaker open for #{operation_name}")
handle_circuit_open(operation_name, opts)
{:error, reason} ->
{:error, reason}
end
end
defp execute_with_retry(operation_fn, retries_left, opts) do
case operation_fn.() do
{:ok, result} ->
{:ok, result}
      {:error, reason} when retries_left > 0 ->
        delay = calculate_retry_delay(retries_left, opts)
        Logger.info("Retrying after #{inspect(reason)} in #{delay}ms (#{retries_left} retries left)")
:timer.sleep(delay)
execute_with_retry(operation_fn, retries_left - 1, opts)
{:error, reason} ->
Logger.error("Operation failed after all retries: #{inspect(reason)}")
{:error, {:max_retries_exceeded, reason}}
end
end
defp calculate_retry_delay(retries_left, opts) do
base_delay = Keyword.get(opts, :base_delay, 1000)
max_delay = Keyword.get(opts, :max_delay, 30_000)
jitter = Keyword.get(opts, :jitter, true)
    # Exponential backoff (retries_left counts down from the configured maximum)
    max_retries = Keyword.get(opts, :max_retries, 3)
    exponential_delay = base_delay * :math.pow(2, max_retries - retries_left)

    # Apply 10-20% positive jitter to prevent a thundering herd
    final_delay = if jitter do
      jitter_factor = 1.1 + :rand.uniform() * 0.1
      round(exponential_delay * jitter_factor)
else
round(exponential_delay)
end
min(final_delay, max_delay)
end
defp handle_circuit_open(operation_name, opts) do
case Keyword.get(opts, :circuit_open_strategy, :error) do
:error ->
{:error, :circuit_open}
:fallback ->
case Keyword.get(opts, :fallback) do
nil -> {:error, :circuit_open}
fallback -> execute_fallback(fallback, :circuit_open, opts)
end
      :cache ->
        # Fall back to the last cached result for this operation, if any
        case Foundation.Cache.get({:circuit_cache, operation_name}) do
          nil -> {:error, :circuit_open}
          cached -> {:ok, {:cached, cached}}
        end
end
end
defp execute_fallback(fallback_fn, original_error, opts) when is_function(fallback_fn) do
Logger.info("Executing fallback due to: #{inspect(original_error)}")
case fallback_fn.(original_error) do
{:ok, result} ->
{:ok, {:fallback, result}}
{:error, fallback_error} ->
{:error, {:fallback_failed, original_error, fallback_error}}
end
end
end
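As a usage sketch (the `RemoteCatalog` module and the stale value below are hypothetical; only the option names come from `with_resilience/3`, and note that the fallback fires when the wrapped operation raises):

# Illustrative call site: a remote lookup guarded by retries and a fallback
result =
  Foundation.Resilience.with_resilience(
    :catalog_lookup,
    fn -> RemoteCatalog.fetch(:pricing) end,
    circuit_breaker: false,
    max_retries: 2,
    base_delay: 500,
    fallback: fn _error -> {:ok, :stale_pricing} end
  )

case result do
  {:ok, {:fallback, value}} -> value  # served via the fallback path
  {:ok, value} -> value
  {:error, reason} -> {:error, reason}
end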
# Circuit breaker implementation
defmodule Foundation.CircuitBreaker do
  use GenServer
  require Logger
defstruct [
:name,
:failure_threshold,
:recovery_timeout,
:half_open_max_calls,
:state,
:failure_count,
:last_failure_time,
:half_open_call_count
]
def start_link(name, opts \\ []) do
GenServer.start_link(__MODULE__, {name, opts}, name: via_tuple(name))
end
def call(name, operation_fn, timeout \\ 5000) do
GenServer.call(via_tuple(name), {:call, operation_fn}, timeout)
end
@impl true
def init({name, opts}) do
state = %__MODULE__{
name: name,
failure_threshold: Keyword.get(opts, :failure_threshold, 5),
recovery_timeout: Keyword.get(opts, :recovery_timeout, 60_000),
half_open_max_calls: Keyword.get(opts, :half_open_max_calls, 3),
state: :closed,
failure_count: 0,
last_failure_time: nil,
half_open_call_count: 0
}
{:ok, state}
end
@impl true
def handle_call({:call, operation_fn}, _from, %{state: :open} = state) do
if should_attempt_reset?(state) do
new_state = %{state | state: :half_open, half_open_call_count: 0}
execute_operation(operation_fn, new_state)
else
{:reply, {:error, :circuit_open}, state}
end
end
def handle_call({:call, operation_fn}, _from, state) do
execute_operation(operation_fn, state)
end
defp execute_operation(operation_fn, state) do
try do
result = operation_fn.()
handle_operation_success(result, state)
catch
kind, reason ->
handle_operation_failure({kind, reason}, state)
end
end
defp handle_operation_success({:ok, _} = result, %{state: :half_open} = state) do
# Success in half-open state - reset to closed
new_state = %{state |
state: :closed,
failure_count: 0,
half_open_call_count: 0
}
{:reply, result, new_state}
end
defp handle_operation_success({:ok, _} = result, state) do
# Success in closed state - reset failure count
new_state = %{state | failure_count: 0}
{:reply, result, new_state}
end
defp handle_operation_success(other_result, state) do
# Non-{:ok, _} results are treated as failures
handle_operation_failure(other_result, state)
end
defp handle_operation_failure(error, state) do
new_failure_count = state.failure_count + 1
new_state = %{state |
failure_count: new_failure_count,
last_failure_time: System.system_time(:millisecond)
}
if new_failure_count >= state.failure_threshold do
# Trip circuit breaker
final_state = %{new_state | state: :open}
Logger.warning("Circuit breaker #{state.name} opened after #{new_failure_count} failures")
{:reply, {:error, error}, final_state}
else
{:reply, {:error, error}, new_state}
end
end
defp should_attempt_reset?(%{last_failure_time: nil}), do: false
defp should_attempt_reset?(state) do
time_since_failure = System.system_time(:millisecond) - state.last_failure_time
time_since_failure >= state.recovery_timeout
end
defp via_tuple(name) do
{:via, Registry, {Foundation.CircuitBreakerRegistry, name}}
end
end
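Because `via_tuple/1` routes through a plain `Registry`, that registry (and any long-lived breakers) must be supervised somewhere. A minimal wiring sketch, with illustrative child specs and thresholds:

# In the application's supervision tree (illustrative)
children = [
  {Registry, keys: :unique, name: Foundation.CircuitBreakerRegistry},
  %{
    id: {:circuit_breaker, :catalog_lookup},
    start: {Foundation.CircuitBreaker, :start_link, [:catalog_lookup, [failure_threshold: 5]]}
  }
]

Supervisor.start_link(children, strategy: :one_for_one)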
Configuration Management & Validation
Advanced Configuration System
# lib/foundation/config_validator.ex
defmodule Foundation.ConfigValidator do
@moduledoc """
Comprehensive configuration validation for Foundation 2.0.
Validates configuration at compile time and runtime to prevent
common misconfigurations and provides helpful error messages.
"""
@valid_cluster_strategies [:kubernetes, :consul, :dns, :gossip, :mdns_lite]
@valid_profiles [:development, :production, :enterprise]
def validate_config!(config) do
config
|> validate_cluster_config()
|> validate_profile_config()
|> validate_security_config()
|> validate_performance_config()
|> validate_compatibility()
end
defp validate_cluster_config(%{cluster: cluster_config} = config) do
case cluster_config do
true ->
config # Auto-detection is always valid
false ->
config # No clustering is valid
strategy when strategy in @valid_cluster_strategies ->
validate_strategy_specific_config(strategy, config)
invalid ->
raise ArgumentError, """
Invalid cluster configuration: #{inspect(invalid)}
Valid options:
- true (auto-detection)
- false (no clustering)
- :kubernetes (Kubernetes strategy)
- :consul (Consul strategy)
- :dns (DNS strategy)
      - :gossip (Gossip strategy)
      - :mdns_lite (mDNS strategy for local development)
"""
end
end
defp validate_cluster_config(config), do: config # No cluster config is valid
defp validate_strategy_specific_config(:kubernetes, config) do
required_env_vars = ["KUBERNETES_SERVICE_HOST"]
missing_vars = Enum.filter(required_env_vars, &(System.get_env(&1) == nil))
unless Enum.empty?(missing_vars) do
Logger.warning("""
Kubernetes strategy selected but environment variables missing: #{inspect(missing_vars)}
This may cause cluster formation to fail. Ensure you're running in a Kubernetes environment
or switch to a different strategy:
config :foundation, cluster: :gossip # For development
config :foundation, cluster: :dns # For other container environments
""")
end
config
end
defp validate_strategy_specific_config(:consul, config) do
unless System.get_env("CONSUL_HTTP_ADDR") do
Logger.warning("""
Consul strategy selected but CONSUL_HTTP_ADDR not set.
Set the environment variable:
export CONSUL_HTTP_ADDR=http://consul:8500
Or use auto-detection:
config :foundation, cluster: true
""")
end
config
end
defp validate_strategy_specific_config(:mdns_lite, config) do
case Application.load(:mdns_lite) do
:ok -> config
{:error, {:already_loaded, :mdns_lite}} -> config
_ ->
raise ArgumentError, """
mdns_lite strategy selected but :mdns_lite dependency not found.
Add to mix.exs:
{:mdns_lite, "~> 0.8"}
Or use auto-detection for automatic strategy selection:
config :foundation, cluster: true
"""
end
end
defp validate_strategy_specific_config(_strategy, config), do: config
defp validate_profile_config(%{profile: profile} = config) when profile in @valid_profiles do
config
end
defp validate_profile_config(%{profile: invalid} = _config) do
raise ArgumentError, """
Invalid profile: #{inspect(invalid)}
Valid profiles:
- :development (zero-config local clustering)
- :production (cloud-ready with intelligent defaults)
- :enterprise (multi-cluster support)
"""
end
defp validate_profile_config(config), do: config # No profile is valid (auto-detection)
defp validate_security_config(config) do
# Validate TLS settings for production
if production_environment?(config) do
validate_production_security(config)
else
config
end
end
defp validate_production_security(config) do
security_config = Map.get(config, :security, %{})
# Check for secure defaults
warnings = []
warnings = if Map.get(security_config, :tls_enabled) != true do
["TLS not enabled for production cluster" | warnings]
else
warnings
end
warnings = if not Map.has_key?(security_config, :erlang_cookie) do
["Erlang cookie not explicitly configured" | warnings]
else
warnings
end
unless Enum.empty?(warnings) do
Logger.warning("""
Production security recommendations:
#{Enum.map(warnings, &(" - " <> &1)) |> Enum.join("\n")}
Recommended security config:
config :foundation,
cluster: :kubernetes,
security: [
tls_enabled: true,
erlang_cookie: {:system, "ERLANG_COOKIE"},
certificate_file: "/etc/ssl/certs/cluster.pem"
]
""")
end
config
end
defp validate_performance_config(config) do
# Validate performance-related settings
if expected_nodes = Map.get(config, :expected_nodes) do
if expected_nodes > 100 do
Logger.info("""
Large cluster detected (#{expected_nodes} nodes).
Consider enterprise profile for optimal large-cluster support:
config :foundation, profile: :enterprise
""")
end
end
config
end
defp validate_compatibility(config) do
# Check for conflicting configurations
if Map.has_key?(config, :cluster) and Application.get_env(:libcluster, :topologies) do
Logger.info("""
Both Foundation cluster config and libcluster topology detected.
Foundation will defer to existing libcluster configuration.
To use Foundation's configuration management, remove:
config :libcluster, topologies: [...]
""")
end
config
end
defp production_environment?(config) do
Map.get(config, :profile) == :production or
System.get_env("MIX_ENV") == "prod"
end
end
# Compile-time validation
defmodule Foundation.ConfigMacros do
@moduledoc """
Compile-time configuration validation and helpers.
"""
  defmacro validate_foundation_config! do
    # Capture the full :foundation config at expansion time so misconfigurations
    # surface during compilation rather than at node boot.
    config = :foundation |> Application.get_all_env() |> Map.new()

    quote do
      Foundation.ConfigValidator.validate_config!(unquote(Macro.escape(config)))
    end
  end
end
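For reference, a configuration that passes these checks for a Kubernetes production deployment might look like the following (all values illustrative; the `security` keys mirror the recommendation emitted by `validate_production_security/1`):

# config/prod.exs (illustrative)
import Config

config :foundation,
  profile: :production,
  cluster: :kubernetes,
  expected_nodes: 12,
  security: [
    tls_enabled: true,
    erlang_cookie: {:system, "ERLANG_COOKIE"},
    certificate_file: "/etc/ssl/certs/cluster.pem"
  ]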
Deployment & Operations Guide
Production Deployment Patterns
# lib/foundation/deployment.ex
defmodule Foundation.Deployment do
@moduledoc """
Production deployment helpers and health checks for Foundation 2.0.
Provides utilities for safe deployment, rolling updates, and
operational validation.
"""
@doc """
Validates that the current node is ready for production traffic.
Checks:
- Cluster connectivity
- Service health
- Configuration consistency
- Performance baselines
"""
def readiness_check(opts \\ []) do
timeout = Keyword.get(opts, :timeout, 30_000)
checks = [
{:cluster_connectivity, &check_cluster_connectivity/0},
{:service_registry, &check_service_registry/0},
{:configuration_sync, &check_configuration_sync/0},
{:performance_baseline, &check_performance_baseline/0}
]
results = run_health_checks(checks, timeout)
case Enum.all?(results, fn {_check, result} -> result == :ok end) do
true ->
{:ok, :ready}
false ->
failed_checks = Enum.filter(results, fn {_check, result} -> result != :ok end)
{:error, {:not_ready, failed_checks}}
end
end
@doc """
Performs a graceful shutdown of Foundation services.
"""
def graceful_shutdown(opts \\ []) do
timeout = Keyword.get(opts, :timeout, 30_000)
Logger.info("Starting graceful Foundation shutdown...")
# Step 1: Stop accepting new work
mark_node_draining()
# Step 2: Wait for active operations to complete
wait_for_operations_completion(timeout)
# Step 3: Migrate critical processes
migrate_critical_processes()
# Step 4: Shutdown services
shutdown_foundation_services()
Logger.info("Graceful shutdown completed")
:ok
end
defp run_health_checks(checks, timeout) do
tasks = Enum.map(checks, fn {name, check_fn} ->
Task.async(fn ->
try do
case check_fn.() do
:ok -> {name, :ok}
{:error, reason} -> {name, {:error, reason}}
other -> {name, {:error, {:unexpected_result, other}}}
end
catch
kind, reason -> {name, {:error, {kind, reason}}}
end
end)
end)
Task.await_many(tasks, timeout)
end
defp check_cluster_connectivity() do
cluster_health = Foundation.HealthMonitor.get_cluster_health()
case cluster_health.overall_status do
:healthy -> :ok
status -> {:error, {:cluster_unhealthy, status}}
end
end
defp check_service_registry() do
# Verify service registry is responsive
test_service_name = {:foundation_readiness_test, Node.self()}
case Foundation.ServiceMesh.register_service(test_service_name, self(), [:test]) do
:ok ->
case Foundation.ServiceMesh.discover_services(name: test_service_name) do
[_service] ->
Foundation.ServiceMesh.deregister_service(test_service_name)
:ok
[] ->
{:error, :service_registry_not_synced}
end
error ->
{:error, {:service_registration_failed, error}}
end
end
defp check_configuration_sync() do
# Verify configuration is consistent across cluster
local_config_hash = calculate_config_hash()
case Foundation.Channels.broadcast(:control, {:config_hash_request, Node.self(), local_config_hash}) do
:ok ->
# Wait for responses and check consistency
# This is a simplified version - production would implement proper consensus
:timer.sleep(1000)
:ok
error ->
{:error, {:config_sync_check_failed, error}}
end
end
defp check_performance_baseline() do
# Basic performance validation
start_time = System.monotonic_time(:microsecond)
# Test message roundtrip
Foundation.Channels.broadcast(:events, {:performance_test, start_time})
# Test service discovery speed
Foundation.ServiceMesh.discover_services([])
end_time = System.monotonic_time(:microsecond)
duration_ms = (end_time - start_time) / 1000
if duration_ms < 100 do # Should complete in under 100ms
:ok
else
{:error, {:performance_below_baseline, duration_ms}}
end
end
defp mark_node_draining() do
# Mark this node as draining in service registry
Foundation.Channels.broadcast(:control, {:node_draining, Node.self()})
end
defp wait_for_operations_completion(timeout) do
# Wait for active operations to complete
start_time = System.monotonic_time(:millisecond)
wait_loop(start_time, timeout)
end
defp wait_loop(start_time, timeout) do
current_time = System.monotonic_time(:millisecond)
if current_time - start_time >= timeout do
Logger.warning("Graceful shutdown timeout reached")
:timeout
else
active_operations = count_active_operations()
if active_operations == 0 do
:ok
else
Logger.info("Waiting for #{active_operations} operations to complete...")
:timer.sleep(1000)
wait_loop(start_time, timeout)
end
end
end
defp migrate_critical_processes() do
# Find critical processes that should be migrated to other nodes
critical_services = Foundation.ServiceMesh.discover_services(
capabilities: [:critical],
node: Node.self()
)
Enum.each(critical_services, fn service ->
Logger.info("Migrating critical service: #{service.name}")
# Implementation would coordinate with other nodes to restart the service
end)
end
defp shutdown_foundation_services() do
# Gracefully shutdown Foundation supervision tree
case Process.whereis(Foundation.Supervisor) do
nil -> :ok
pid ->
Process.exit(pid, :shutdown)
wait_for_process_exit(pid, 10_000)
end
end
defp wait_for_process_exit(pid, timeout) do
ref = Process.monitor(pid)
receive do
{:DOWN, ^ref, :process, ^pid, _reason} -> :ok
after
timeout ->
Logger.warning("Process #{inspect(pid)} did not exit gracefully")
Process.exit(pid, :kill)
end
end
defp calculate_config_hash() do
config = Application.get_all_env(:foundation)
:crypto.hash(:sha256, :erlang.term_to_binary(config)) |> Base.encode16()
end
defp count_active_operations() do
# Count active operations across all Foundation services
# This is a simplified version - production would track actual operations
0
end
end
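A natural place to invoke `graceful_shutdown/1` is the host application's `prep_stop/1` callback, so rolling deploys drain the node before OTP tears down the supervision tree. A sketch, with an illustrative application module:

# Hypothetical application callback module for the host system
defmodule MyApp.Application do
  use Application

  @impl true
  def start(_type, _args) do
    children = [
      # Foundation and application-specific children go here
    ]

    Supervisor.start_link(children, strategy: :one_for_one, name: MyApp.Supervisor)
  end

  @impl true
  def prep_stop(state) do
    # Drain this node before the supervision tree begins shutting down
    Foundation.Deployment.graceful_shutdown(timeout: 30_000)
    state
  end
end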
# Health check endpoint for load balancers
defmodule Foundation.HealthEndpoint do
@moduledoc """
HTTP health check endpoint for load balancers and orchestrators.
"""
def init(req, state) do
response = case Foundation.Deployment.readiness_check(timeout: 5_000) do
{:ok, :ready} ->
{200, %{"status" => "healthy", "timestamp" => System.system_time(:second)}}
{:error, {:not_ready, failed_checks}} ->
{503, %{
"status" => "not_ready",
"failed_checks" => format_failed_checks(failed_checks),
"timestamp" => System.system_time(:second)
}}
end
{status, body} = response
req = :cowboy_req.reply(
status,
%{"content-type" => "application/json"},
Jason.encode!(body),
req
)
{:ok, req, state}
end
defp format_failed_checks(failed_checks) do
Enum.map(failed_checks, fn {check_name, error} ->
%{
"check" => to_string(check_name),
"error" => inspect(error)
}
end)
end
end
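Since the handler uses the raw Cowboy API, exposing it on a dedicated port is a few lines of wiring (port, path, and listener name below are illustrative):

# Typically invoked from the supervision tree at boot
dispatch =
  :cowboy_router.compile([
    {:_, [{"/healthz", Foundation.HealthEndpoint, []}]}
  ])

{:ok, _pid} =
  :cowboy.start_clear(
    :foundation_health_listener,
    [{:port, 4001}],
    %{env: %{dispatch: dispatch}}
  )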
Observability & Monitoring Integration
Comprehensive Telemetry Integration
# lib/foundation/telemetry.ex
defmodule Foundation.Telemetry do
@moduledoc """
Comprehensive telemetry integration for Foundation 2.0.
Provides structured metrics, events, and traces for all Foundation
operations with integration points for external monitoring systems.
"""
# Standard telemetry events emitted by Foundation
@events [
# Cluster events
[:foundation, :cluster, :node_joined],
[:foundation, :cluster, :node_left],
[:foundation, :cluster, :topology_changed],
# Process management events
[:foundation, :process, :singleton_started],
[:foundation, :process, :singleton_failed],
[:foundation, :process, :migration_started],
[:foundation, :process, :migration_completed],
# Service mesh events
[:foundation, :service, :registered],
[:foundation, :service, :deregistered],
[:foundation, :service, :discovery_request],
[:foundation, :service, :health_check],
# Messaging events
[:foundation, :message, :sent],
[:foundation, :message, :received],
[:foundation, :message, :broadcast],
[:foundation, :channel, :subscription],
# Performance events
[:foundation, :performance, :optimization_applied],
[:foundation, :performance, :anomaly_detected],
[:foundation, :health, :check_completed]
]
def setup_telemetry() do
# Attach default handlers
attach_console_logger()
attach_metrics_aggregator()
attach_prometheus_exporter()
attach_distributed_tracer()
end
def emit_cluster_event(event_type, metadata \\ %{}) do
:telemetry.execute(
[:foundation, :cluster, event_type],
%{timestamp: System.system_time(:microsecond)},
Map.merge(metadata, %{node: Node.self()})
)
end
def emit_process_event(event_type, process_info, metadata \\ %{}) do
:telemetry.execute(
[:foundation, :process, event_type],
%{
timestamp: System.system_time(:microsecond),
process_count: 1
},
Map.merge(metadata, %{
node: Node.self(),
process_name: process_info.name,
process_pid: process_info.pid
})
)
end
def emit_service_event(event_type, service_info, metadata \\ %{}) do
:telemetry.execute(
[:foundation, :service, event_type],
%{
timestamp: System.system_time(:microsecond),
service_count: 1
},
Map.merge(metadata, %{
node: Node.self(),
service_name: service_info.name,
capabilities: service_info.capabilities
})
)
end
def emit_message_event(event_type, message_info, metadata \\ %{}) do
:telemetry.execute(
[:foundation, :message, event_type],
%{
timestamp: System.system_time(:microsecond),
message_size: message_info.size,
latency: Map.get(message_info, :latency, 0)
},
Map.merge(metadata, %{
node: Node.self(),
channel: message_info.channel,
destination: Map.get(message_info, :destination)
})
)
end
defp attach_console_logger() do
:telemetry.attach_many(
"foundation-console-logger",
@events,
&handle_console_logging/4,
%{log_level: :info}
)
end
defp attach_metrics_aggregator() do
:telemetry.attach_many(
"foundation-metrics-aggregator",
@events,
&handle_metrics_aggregation/4,
%{aggregation_window: 60_000}
)
end
defp attach_prometheus_exporter() do
if Code.ensure_loaded?(:prometheus) do
setup_prometheus_metrics()
:telemetry.attach_many(
"foundation-prometheus-exporter",
@events,
&handle_prometheus_export/4,
%{}
)
end
end
defp attach_distributed_tracer() do
:telemetry.attach_many(
"foundation-distributed-tracer",
@events,
&handle_distributed_tracing/4,
%{trace_sample_rate: 0.1}
)
end
defp handle_console_logging(event, measurements, metadata, config) do
log_level = Map.get(config, :log_level, :info)
case should_log_event?(event, log_level) do
true ->
message = format_telemetry_message(event, measurements, metadata)
Logger.log(log_level, message)
false ->
:ok
end
end
defp handle_metrics_aggregation(event, measurements, metadata, config) do
# Store metrics in ETS for aggregation
window = Map.get(config, :aggregation_window, 60_000)
current_window = div(System.system_time(:millisecond), window)
# Increment counters
:ets.update_counter(
:foundation_metrics,
{event, current_window},
[{2, 1}], # Increment count
{{event, current_window}, 0, measurements, metadata}
)
# Update measurements (sum, max, min)
update_measurement_aggregates(event, measurements, current_window)
end
defp handle_prometheus_export(event, measurements, metadata, _config) do
prometheus_metric_name = event_to_prometheus_name(event)
labels = metadata_to_prometheus_labels(metadata)
case event do
[:foundation, :cluster, _] ->
:prometheus_gauge.set(prometheus_metric_name, labels, 1)
[:foundation, :process, _] ->
:prometheus_counter.inc(prometheus_metric_name, labels)
[:foundation, :message, _] ->
:prometheus_counter.inc(prometheus_metric_name, labels)
if latency = Map.get(measurements, :latency) do
:prometheus_histogram.observe(
"foundation_message_latency_microseconds",
labels,
latency
)
end
_ ->
:prometheus_counter.inc(prometheus_metric_name, labels)
end
end
defp handle_distributed_tracing(event, measurements, metadata, config) do
sample_rate = Map.get(config, :trace_sample_rate, 0.1)
if :rand.uniform() < sample_rate do
trace_id = Map.get(metadata, :trace_id) || generate_trace_id()
span_id = generate_span_id()
span = %{
trace_id: trace_id,
span_id: span_id,
operation_name: format_operation_name(event),
start_time: measurements.timestamp,
tags: metadata,
node: Node.self()
}
# Send to distributed tracing system (Jaeger, Zipkin, etc.)
send_trace_span(span)
end
end
defp setup_prometheus_metrics() do
# Define Prometheus metrics for Foundation
metrics = [
{:prometheus_gauge, :foundation_cluster_nodes,
"Number of nodes in the cluster", [:cluster_id]},
{:prometheus_counter, :foundation_processes_total,
"Total number of processes started", [:process_type, :node]},
{:prometheus_counter, :foundation_services_total,
"Total number of services registered", [:service_type, :node]},
{:prometheus_counter, :foundation_messages_total,
"Total number of messages sent", [:channel, :node]},
{:prometheus_histogram, :foundation_message_latency_microseconds,
"Message latency in microseconds", [:channel, :node],
%{buckets: [10, 50, 100, 500, 1000, 5000, 10000, 30000]}},
{:prometheus_histogram, :foundation_service_discovery_duration_microseconds,
"Service discovery duration in microseconds", [:node],
%{buckets: [100, 500, 1000, 5000, 10000, 30000]}},
{:prometheus_gauge, :foundation_health_score,
"Overall cluster health score (0-1)", [:cluster_id]}
]
Enum.each(metrics, fn
{type, name, help, labels} ->
apply(type, :declare, [[name: name, help: help, labels: labels]])
      {type, name, help, labels, opts} ->
        # Merge extra options (e.g. histogram buckets) into the declaration spec
        spec = [name: name, help: help, labels: labels] ++ Map.to_list(opts)
        apply(type, :declare, [spec])
end)
end
# Utility functions
defp should_log_event?([:foundation, :cluster, _], :info), do: true
defp should_log_event?([:foundation, :process, :singleton_failed], _), do: true
defp should_log_event?([:foundation, :service, _], :debug), do: true
defp should_log_event?(_, :debug), do: true
defp should_log_event?(_, _), do: false
defp format_telemetry_message(event, measurements, metadata) do
event_name = Enum.join(event, ".")
node = Map.get(metadata, :node, Node.self())
timestamp = Map.get(measurements, :timestamp, System.system_time(:microsecond))
base_message = "#{event_name} on #{node}"
additional_info = case event do
[:foundation, :cluster, :node_joined] ->
" (cluster size now: #{Map.get(metadata, :cluster_size, "unknown")})"
[:foundation, :process, :singleton_started] ->
" (process: #{Map.get(metadata, :process_name, "unknown")})"
[:foundation, :message, :sent] ->
channel = Map.get(metadata, :channel, "unknown")
size = Map.get(measurements, :message_size, 0)
" (channel: #{channel}, size: #{size} bytes)"
_ ->
""
end
base_message <> additional_info
end
defp update_measurement_aggregates(event, measurements, window) do
    measurements
    |> Enum.filter(fn {_key, value} -> is_number(value) end)
    |> Enum.each(fn {key, value} ->
# Update sum
:ets.update_counter(
:foundation_metrics_sums,
{event, key, window},
[{2, value}],
{{event, key, window}, 0}
)
# Update max
case :ets.lookup(:foundation_metrics_max, {event, key, window}) do
[] ->
:ets.insert(:foundation_metrics_max, {{event, key, window}, value})
[{{event, key, window}, current_max}] when value > current_max ->
:ets.insert(:foundation_metrics_max, {{event, key, window}, value})
_ ->
:ok
end
# Update min
case :ets.lookup(:foundation_metrics_min, {event, key, window}) do
[] ->
:ets.insert(:foundation_metrics_min, {{event, key, window}, value})
[{{event, key, window}, current_min}] when value < current_min ->
:ets.insert(:foundation_metrics_min, {{event, key, window}, value})
_ ->
:ok
end
end)
end
defp event_to_prometheus_name(event) do
event
|> Enum.join("_")
|> String.replace("-", "_")
|> String.to_atom()
end
defp metadata_to_prometheus_labels(metadata) do
metadata
|> Map.take([:node, :channel, :service_type, :process_type])
|> Enum.map(fn {k, v} -> {k, to_string(v)} end)
end
defp generate_trace_id() do
:crypto.strong_rand_bytes(16) |> Base.encode16(case: :lower)
end
defp generate_span_id() do
:crypto.strong_rand_bytes(8) |> Base.encode16(case: :lower)
end
defp format_operation_name(event) do
Enum.join(event, ".")
end
defp send_trace_span(span) do
# Send to distributed tracing backend
# This would integrate with Jaeger, Zipkin, or other tracing systems
GenServer.cast(Foundation.TracingExporter, {:export_span, span})
end
end
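The aggregation handlers write to several ETS tables that must exist before the first event fires. A minimal initializer, using the table names referenced above and owned by whichever long-lived process calls it, could be:

# lib/foundation/telemetry/metrics_store.ex (sketch)
defmodule Foundation.Telemetry.MetricsStore do
  @moduledoc "Creates the ETS tables used by Foundation.Telemetry's aggregation handlers."

  @tables [
    :foundation_metrics,
    :foundation_metrics_sums,
    :foundation_metrics_max,
    :foundation_metrics_min
  ]

  def init_tables() do
    Enum.each(@tables, fn table ->
      # Create each table once; subsequent calls are no-ops
      if :ets.whereis(table) == :undefined do
        :ets.new(table, [:named_table, :public, :set, write_concurrency: true])
      end
    end)
  end
end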
# Metrics dashboard
defmodule Foundation.MetricsDashboard do
@moduledoc """
Real-time metrics dashboard for Foundation 2.0 operations.
"""
use Phoenix.LiveView
def mount(_params, _session, socket) do
if connected?(socket) do
# Subscribe to telemetry updates
:timer.send_interval(1000, self(), :update_metrics)
end
socket = assign(socket,
cluster_metrics: get_cluster_metrics(),
process_metrics: get_process_metrics(),
message_metrics: get_message_metrics(),
health_metrics: get_health_metrics()
)
{:ok, socket}
end
def handle_info(:update_metrics, socket) do
socket = assign(socket,
cluster_metrics: get_cluster_metrics(),
process_metrics: get_process_metrics(),
message_metrics: get_message_metrics(),
health_metrics: get_health_metrics()
)
{:noreply, socket}
end
defp get_cluster_metrics() do
current_window = div(System.system_time(:millisecond), 60_000)
%{
node_count: length([Node.self() | Node.list()]),
node_joins: get_event_count([:foundation, :cluster, :node_joined], current_window),
node_leaves: get_event_count([:foundation, :cluster, :node_left], current_window),
topology_changes: get_event_count([:foundation, :cluster, :topology_changed], current_window)
}
end
defp get_process_metrics() do
current_window = div(System.system_time(:millisecond), 60_000)
%{
singletons_started: get_event_count([:foundation, :process, :singleton_started], current_window),
singleton_failures: get_event_count([:foundation, :process, :singleton_failed], current_window),
migrations: get_event_count([:foundation, :process, :migration_completed], current_window)
}
end
defp get_message_metrics() do
current_window = div(System.system_time(:millisecond), 60_000)
%{
messages_sent: get_event_count([:foundation, :message, :sent], current_window),
messages_received: get_event_count([:foundation, :message, :received], current_window),
broadcasts: get_event_count([:foundation, :message, :broadcast], current_window),
avg_latency: get_average_measurement([:foundation, :message, :sent], :latency, current_window)
}
end
defp get_health_metrics() do
case Foundation.HealthMonitor.get_cluster_health() do
%{overall_status: status, performance: perf} ->
%{
overall_status: status,
avg_message_latency: Map.get(perf, :avg_message_latency_ms, 0),
message_throughput: Map.get(perf, :message_throughput_per_sec, 0),
error_rate: Map.get(perf, :error_rate_percent, 0.0)
}
      _ ->
        %{overall_status: :unknown, avg_message_latency: 0, message_throughput: 0, error_rate: 0.0}
end
end
defp get_event_count(event, window) do
case :ets.lookup(:foundation_metrics, {event, window}) do
[{{event, window}, count, _, _}] -> count
[] -> 0
end
end
defp get_average_measurement(event, measurement_key, window) do
sum = case :ets.lookup(:foundation_metrics_sums, {event, measurement_key, window}) do
[{{event, measurement_key, window}, sum_value}] -> sum_value
[] -> 0
end
count = get_event_count(event, window)
if count > 0 do
sum / count
else
0
end
end
def render(assigns) do
~H"""
<div class="foundation-metrics-dashboard">
<h1>Foundation 2.0 Metrics Dashboard</h1>
<div class="metrics-grid">
<div class="metric-card cluster-metrics">
<h2>Cluster Metrics</h2>
<div class="metric-value">
<span class="label">Active Nodes:</span>
<span class="value"><%= @cluster_metrics.node_count %></span>
</div>
<div class="metric-value">
<span class="label">Node Joins (1m):</span>
<span class="value"><%= @cluster_metrics.node_joins %></span>
</div>
<div class="metric-value">
<span class="label">Node Leaves (1m):</span>
<span class="value"><%= @cluster_metrics.node_leaves %></span>
</div>
</div>
<div class="metric-card process-metrics">
<h2>Process Metrics</h2>
<div class="metric-value">
<span class="label">Singletons Started (1m):</span>
<span class="value"><%= @process_metrics.singletons_started %></span>
</div>
<div class="metric-value">
<span class="label">Singleton Failures (1m):</span>
<span class="value"><%= @process_metrics.singleton_failures %></span>
</div>
<div class="metric-value">
<span class="label">Migrations (1m):</span>
<span class="value"><%= @process_metrics.migrations %></span>
</div>
</div>
<div class="metric-card message-metrics">
<h2>Message Metrics</h2>
<div class="metric-value">
<span class="label">Messages Sent (1m):</span>
<span class="value"><%= @message_metrics.messages_sent %></span>
</div>
<div class="metric-value">
<span class="label">Average Latency:</span>
<span class="value"><%= Float.round(@message_metrics.avg_latency / 1000, 2) %>ms</span>
</div>
<div class="metric-value">
<span class="label">Broadcasts (1m):</span>
<span class="value"><%= @message_metrics.broadcasts %></span>
</div>
</div>
<div class="metric-card health-metrics">
<h2>Health Status</h2>
<div class="metric-value">
<span class="label">Overall Status:</span>
<span class={"value status-#{@health_metrics.overall_status}"}><%= @health_metrics.overall_status %></span>
</div>
<div class="metric-value">
<span class="label">Throughput:</span>
<span class="value"><%= @health_metrics.message_throughput %>/sec</span>
</div>
<div class="metric-value">
<span class="label">Error Rate:</span>
<span class="value"><%= Float.round(@health_metrics.error_rate, 3) %>%</span>
</div>
</div>
</div>
</div>
<style>
.foundation-metrics-dashboard {
padding: 20px;
font-family: system-ui, sans-serif;
}
.metrics-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 20px;
margin-top: 20px;
}
.metric-card {
background: #f8fafc;
border: 1px solid #e2e8f0;
border-radius: 8px;
padding: 20px;
}
.metric-card h2 {
margin: 0 0 15px 0;
color: #2d3748;
font-size: 18px;
}
.metric-value {
display: flex;
justify-content: space-between;
margin: 8px 0;
}
.metric-value .label {
color: #4a5568;
}
.metric-value .value {
font-weight: bold;
color: #2d3748;
}
.status-healthy { color: #48bb78; }
.status-degraded { color: #ed8936; }
.status-critical { color: #e53e3e; }
</style>
"""
end
end
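Mounting the dashboard only requires a LiveView route in the host application's Phoenix router (path and pipelines below are illustrative):

# In the host application's router.ex
scope "/foundation" do
  pipe_through [:browser, :require_admin]

  live "/metrics", Foundation.MetricsDashboard
end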
Security & Access Control Framework
# lib/foundation/security.ex
defmodule Foundation.Security do
@moduledoc """
Security and access control framework for Foundation 2.0.
Provides authentication, authorization, and audit logging
for distributed Foundation operations.
"""
@doc """
Authenticates a request with the cluster.
"""
def authenticate_request(request, opts \\ []) do
case extract_credentials(request) do
{:ok, credentials} ->
validate_credentials(credentials, opts)
{:error, reason} ->
{:error, {:authentication_failed, reason}}
end
end
@doc """
Authorizes an operation based on capabilities and context.
"""
def authorize_operation(operation, context, opts \\ []) do
required_capabilities = get_required_capabilities(operation)
user_capabilities = Map.get(context, :capabilities, [])
if has_required_capabilities?(user_capabilities, required_capabilities) do
audit_log(:operation_authorized, operation, context)
:ok
else
audit_log(:operation_denied, operation, context)
{:error, {:insufficient_capabilities, required_capabilities}}
end
end
@doc """
Secures inter-node communication with TLS and authentication.
"""
def configure_secure_communication(opts \\ []) do
tls_config = build_tls_config(opts)
auth_config = build_auth_config(opts)
# Configure Distributed Erlang with TLS
case configure_distributed_erlang_tls(tls_config) do
:ok ->
# Configure Foundation channels with authentication
configure_channel_security(auth_config)
{:error, reason} ->
{:error, {:tls_configuration_failed, reason}}
end
end
defp extract_credentials(%{headers: headers}) do
case Map.get(headers, "authorization") do
"Bearer " <> token ->
{:ok, {:bearer_token, token}}
"Basic " <> encoded ->
case Base.decode64(encoded) do
{:ok, decoded} ->
case String.split(decoded, ":", parts: 2) do
[username, password] -> {:ok, {:basic_auth, username, password}}
_ -> {:error, :invalid_basic_auth}
end
:error ->
{:error, :invalid_base64}
end
nil ->
{:error, :no_authorization_header}
_ ->
{:error, :unsupported_auth_scheme}
end
end
defp extract_credentials(request) when is_map(request) do
# Handle different request formats
cond do
Map.has_key?(request, :token) ->
{:ok, {:bearer_token, request.token}}
Map.has_key?(request, :username) and Map.has_key?(request, :password) ->
{:ok, {:basic_auth, request.username, request.password}}
true ->
{:error, :no_credentials}
end
end
defp validate_credentials({:bearer_token, token}, opts) do
case verify_jwt_token(token, opts) do
{:ok, claims} ->
{:ok, %{
type: :jwt,
subject: Map.get(claims, "sub"),
capabilities: Map.get(claims, "capabilities", []),
expires_at: Map.get(claims, "exp")
}}
{:error, reason} ->
{:error, {:invalid_token, reason}}
end
end
defp validate_credentials({:basic_auth, username, password}, opts) do
auth_backend = Keyword.get(opts, :auth_backend, :static)
case auth_backend do
:static ->
validate_static_credentials(username, password, opts)
:ldap ->
validate_ldap_credentials(username, password, opts)
:database ->
validate_database_credentials(username, password, opts)
end
end
defp verify_jwt_token(token, opts) do
secret = Keyword.get(opts, :jwt_secret) || System.get_env("JWT_SECRET")
if secret do
      # Joken 2.x expects a signer; HS256 with a shared secret is assumed here
      signer = Joken.Signer.create("HS256", secret)

      case Joken.verify_and_validate(%{}, token, signer) do
{:ok, claims} -> {:ok, claims}
{:error, reason} -> {:error, reason}
end
else
{:error, :no_jwt_secret_configured}
end
end
defp validate_static_credentials(username, password, opts) do
static_users = Keyword.get(opts, :static_users, %{})
case Map.get(static_users, username) do
%{password: ^password, capabilities: capabilities} ->
{:ok, %{
type: :static,
subject: username,
capabilities: capabilities
}}
%{password: _wrong_password} ->
{:error, :invalid_password}
nil ->
{:error, :user_not_found}
end
end
defp get_required_capabilities(operation) do
case operation do
:start_singleton -> [:process_management]
:register_service -> [:service_management]
:cluster_config_change -> [:cluster_admin]
:health_check -> [:monitoring]
:shutdown_node -> [:node_admin]
_ -> []
end
end
defp has_required_capabilities?(user_caps, required_caps) do
# Check if user has admin capability (grants everything)
if :admin in user_caps do
true
else
# Check if user has all required capabilities
Enum.all?(required_caps, &(&1 in user_caps))
end
end
defp build_tls_config(opts) do
%{
cert_file: Keyword.get(opts, :cert_file) || System.get_env("TLS_CERT_FILE"),
key_file: Keyword.get(opts, :key_file) || System.get_env("TLS_KEY_FILE"),
ca_file: Keyword.get(opts, :ca_file) || System.get_env("TLS_CA_FILE"),
      verify: Keyword.get(opts, :verify, :verify_peer),
depth: Keyword.get(opts, :depth, 2)
}
end
defp build_auth_config(opts) do
%{
enabled: Keyword.get(opts, :auth_enabled, true),
backend: Keyword.get(opts, :auth_backend, :jwt),
jwt_secret: Keyword.get(opts, :jwt_secret) || System.get_env("JWT_SECRET"),
static_users: Keyword.get(opts, :static_users, %{})
}
end
defp configure_distributed_erlang_tls(tls_config) do
if all_tls_files_present?(tls_config) do
inet_tls_config = [
        {:certfile, to_charlist(tls_config.cert_file)},
        {:keyfile, to_charlist(tls_config.key_file)},
        {:cacertfile, to_charlist(tls_config.ca_file)},
{:verify, tls_config.verify},
{:depth, tls_config.depth}
]
      # Bind distribution to all interfaces. Note: full TLS distribution also
      # requires starting the VM with -proto_dist inet_tls; these runtime
      # settings only apply to connections established afterwards.
      :ok = :application.set_env(:kernel, :inet_dist_use_interface, {0, 0, 0, 0})
:ok = :application.set_env(:kernel, :inet_dist_listen_options, inet_tls_config)
:ok = :application.set_env(:kernel, :inet_dist_connect_options, inet_tls_config)
:ok
else
{:error, :missing_tls_files}
end
end
defp configure_channel_security(auth_config) do
if auth_config.enabled do
# Add authentication middleware to Foundation channels
Foundation.Channels.add_middleware(Foundation.Security.AuthMiddleware, auth_config)
else
:ok
end
end
defp all_tls_files_present?(config) do
config.cert_file && File.exists?(config.cert_file) &&
config.key_file && File.exists?(config.key_file) &&
config.ca_file && File.exists?(config.ca_file)
end
defp audit_log(action, operation, context) do
audit_entry = %{
timestamp: System.system_time(:microsecond),
action: action,
operation: operation,
user: Map.get(context, :subject),
node: Node.self(),
source_ip: Map.get(context, :source_ip),
user_agent: Map.get(context, :user_agent)
}
# Log to audit trail
Logger.info("AUDIT: #{inspect(audit_entry)}")
# Send to audit service if configured
case Application.get_env(:foundation, :audit_service) do
nil -> :ok
service -> GenServer.cast(service, {:audit_log, audit_entry})
end
end
# Placeholder implementations for external auth systems
defp validate_ldap_credentials(_username, _password, _opts) do
{:error, :ldap_not_implemented}
end
defp validate_database_credentials(_username, _password, _opts) do
{:error, :database_auth_not_implemented}
end
end
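A typical request flow chains the two public functions: authenticate first, then authorize the specific operation with the resulting capability set. The `bearer_token` variable and `PaymentsCoordinator` module below are illustrative:

# Illustrative guard around a privileged operation
with {:ok, user_context} <- Foundation.Security.authenticate_request(%{token: bearer_token}),
     :ok <- Foundation.Security.authorize_operation(:start_singleton, user_context) do
  Foundation.ProcessManager.start_singleton(PaymentsCoordinator, [])
else
  {:error, reason} -> {:error, reason}
end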
# Authentication middleware for channels
defmodule Foundation.Security.AuthMiddleware do
@moduledoc """
Authentication middleware for Foundation channels.
"""
def call(message, next, config) do
case authenticate_message(message, config) do
{:ok, authenticated_message} ->
next.(authenticated_message)
{:error, reason} ->
Logger.warning("Channel authentication failed: #{inspect(reason)}")
{:error, :authentication_failed}
end
end
defp authenticate_message(message, config) do
case extract_auth_context(message) do
{:ok, auth_context} ->
case Foundation.Security.authenticate_request(auth_context, config) do
{:ok, user_context} ->
authenticated_message = Map.put(message, :user_context, user_context)
{:ok, authenticated_message}
error ->
error
end
{:error, _reason} ->
# Allow messages without auth context if auth is optional
if config[:require_auth] do
{:error, :no_auth_context}
else
{:ok, message}
end
end
end
defp extract_auth_context(message) do
case Map.get(message, :auth) do
nil -> {:error, :no_auth_context}
auth_data -> {:ok, auth_data}
end
end
end
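Enabling channel authentication then reduces to one call at boot. The static users below are purely illustrative (and the TLS certificate, key, and CA paths are expected via the `TLS_*` environment variables unless passed explicitly); production deployments would normally use the JWT backend:

# At application start (illustrative; prefer auth_backend: :jwt in production)
Foundation.Security.configure_secure_communication(
  auth_enabled: true,
  auth_backend: :static,
  static_users: %{
    "ops-dashboard" => %{password: System.fetch_env!("OPS_PASSWORD"), capabilities: [:monitoring]},
    "cluster-admin" => %{password: System.fetch_env!("ADMIN_PASSWORD"), capabilities: [:admin]}
  }
)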
This comprehensive set of complementary technical documentation provides:
- Performance Benchmarking Framework - Validates that Foundation’s overhead is minimal and performance meets SLAs
- Advanced Error Handling & Recovery - Resilient operation patterns with circuit breakers, retries, and fallbacks
- Configuration Management & Validation - Compile-time and runtime validation with helpful error messages
- Deployment & Operations Guide - Production readiness checks, graceful shutdown, and health endpoints
- Observability & Monitoring Integration - Comprehensive telemetry with Prometheus, console logging, and distributed tracing
- Security & Access Control Framework - Authentication, authorization, TLS configuration, and audit logging
These additions complete the technical foundation needed to implement, deploy, and operate Foundation 2.0 successfully in production environments while maintaining the “Smart Facades on a Pragmatic Core” philosophy throughout.