Best Practices: Test Isolation in Elixir/OTP Systems
Executive Summary
Test isolation failures are the #1 cause of flaky tests in Elixir applications. This document provides production-grade patterns for eliminating test contamination through proper OTP supervision, process naming, and state management.
The Problem: Test Contamination
Symptoms
- ✅ Tests pass individually
- ❌ Tests fail in suite context
- 🔄 Intermittent failures
- ⏱️ Timeout assertions
- 📮 “Process mailbox is empty” errors
Root Causes
- Shared Named Processes - Multiple tests compete for same process names
- Global State Pollution - ETS tables, registries, telemetry handlers
- Process Lifecycle Leakage - Processes from previous tests interfere
- Resource Contention - Shared application services
The Solution: Test Supervision Trees
Core Principle
Each test should run in complete isolation with its own supervision tree and zero shared state.
Implementation Strategy
- Test-Scoped Process Names - Unique names per test
- Isolated Supervision Trees - Each test gets own supervisor
- Automatic Cleanup - OTP supervision handles process cleanup
- Resource Isolation - Test-specific instances of all services
Implementation Patterns
Pattern 1: Test Supervision Tree
defmodule Foundation.TestIsolation do
  @moduledoc """
  Starts and stops a per-test supervision tree whose services are registered under
  names unique to that test.
  """

  # Longest time, in milliseconds, to wait for the test supervisor to stop.
  @stop_timeout 5000

  @doc """
  Creates a completely isolated test environment with its own supervision tree.

  A fresh `:erlang.unique_integer/1` value is used as the test id and embedded in
  the registered name of every service and of the supervisor itself.

  `opts` is accepted so existing call sites keep working; it is currently ignored.

  Returns `{:ok, supervisor, test_context}`, where `test_context` is a map with the
  keys `:test_id`, `:signal_bus_name`, `:registry_name`, `:router_name` and
  `:telemetry_prefix`. Any other result of `Supervisor.start_link/2` is returned
  unchanged.
  """
  def start_isolated_test(_opts \\ []) do
    test_id = :erlang.unique_integer([:positive])

    # Names are atoms built from the test id: one small set per test, never from
    # external input.
    test_context = %{
      test_id: test_id,
      signal_bus_name: :"test_signal_bus_#{test_id}",
      registry_name: :"test_registry_#{test_id}",
      router_name: :"test_router_#{test_id}",
      telemetry_prefix: "test_#{test_id}"
    }

    # Test-scoped supervision tree. Add other services as needed; note that Elixir
    # does not allow a trailing comma after the last list element.
    children = [
      {MySignalBus, [name: test_context.signal_bus_name]},
      {Registry, [keys: :unique, name: test_context.registry_name]},
      {MyRouter, [name: test_context.router_name]}
    ]

    supervisor_opts = [
      strategy: :one_for_one,
      name: :"test_supervisor_#{test_id}"
    ]

    case Supervisor.start_link(children, supervisor_opts) do
      {:ok, supervisor} -> {:ok, supervisor, test_context}
      error -> error
    end
  end

  @doc """
  Stops the test supervisor (and with it all of its children), then detaches
  leftover test telemetry handlers.

  `test_context` is optional. When the map returned by `start_isolated_test/1` is
  passed, only handlers whose id is that test's `:telemetry_prefix`, or starts with
  the prefix followed by `_`, are detached. Without it, every handler whose id is a
  string starting with `"test_"` is detached, which is only safe while tests run
  serially.

  Telemetry cleanup runs even when the supervisor has already terminated.

  Returns `:ok`. An exit raised by `Supervisor.stop/3` for any other reason, such as
  a timeout, propagates to the caller.
  """
  def stop_isolated_test(supervisor, test_context \\ nil) do
    stop_supervisor(supervisor)

    # No separate ETS sweep: an ETS table is deleted automatically when its owning
    # process terminates, so tables owned by the supervised children go with them.
    cleanup_telemetry_handlers(telemetry_prefix(test_context))

    :ok
  end

  # Stops the supervisor, treating "already gone / already going" as success.
  defp stop_supervisor(supervisor) do
    Supervisor.stop(supervisor, :normal, @stop_timeout)
    :ok
  catch
    # :noproc   - the supervisor no longer exists.
    # :shutdown - it was already terminating when the stop request arrived, for
    #             example because the linked process that started it has exited.
    :exit, {reason, _details} when reason in [:noproc, :shutdown] -> :ok
  end

  # Extracts the telemetry prefix from a test context, or nil when none was given.
  defp telemetry_prefix(%{telemetry_prefix: prefix}), do: prefix
  defp telemetry_prefix(_no_context), do: nil

  # Detaches telemetry handlers that belong to tests; a no-op when :telemetry is
  # not available in the running system.
  defp cleanup_telemetry_handlers(prefix) do
    if Code.ensure_loaded?(:telemetry) do
      for %{id: id} <- :telemetry.list_handlers([]), test_handler?(id, prefix) do
        :telemetry.detach(id)
      end
    end

    :ok
  end

  # Handler ids may be any term; only string ids can carry a test prefix.
  defp test_handler?(id, nil) when is_binary(id), do: String.starts_with?(id, "test_")

  defp test_handler?(id, prefix) when is_binary(id) and is_binary(prefix),
    do: id == prefix or String.starts_with?(id, prefix <> "_")

  defp test_handler?(_id, _prefix), do: false
end
Pattern 2: Test-Scoped Process Names
# ❌ WRONG: Global names cause contamination
defmodule MyService do
  @moduledoc """
  Anti-pattern example: the process is always registered under the module name,
  so only one instance can exist at a time.
  """
  use GenServer

  @doc """
  Starts the service registered under the fixed name `MyService`.

  Because the name is hard-coded, a second call made while the first process is
  still alive returns `{:error, {:already_started, pid}}`.
  """
  def start_link(opts) do
    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
  end

  # Minimal callback so the example actually starts; the options become the state.
  @impl true
  def init(opts), do: {:ok, opts}
end
# ✅ RIGHT: Test-scoped names
defmodule MyService do
  @moduledoc """
  Example service whose registered name can be overridden per test.
  """
  use GenServer

  @doc """
  Starts the service.

  ## Options

    * `:name` - the name to register the process under. Defaults to `MyService`.

  The full option list (including `:name`) is passed to `init/1`.
  """
  def start_link(opts) do
    name = Keyword.get(opts, :name, __MODULE__)
    GenServer.start_link(__MODULE__, opts, name: name)
  end

  # Minimal callback so the example actually starts; the options become the state.
  @impl true
  def init(opts), do: {:ok, opts}
end
# Usage in tests: derive a process name that no other test can be using.
suffix = :erlang.unique_integer([:positive])
test_name = :"my_service_#{suffix}"
{:ok, pid} = MyService.start_link(name: test_name)
Pattern 3: Telemetry Handler Isolation
defmodule Foundation.TestTelemetry do
  @moduledoc """
  Attaches and removes telemetry handlers whose ids are scoped to a single test.

  Handler ids created here have the form `"test_<test_id>_<unique integer>"`.
  """

  @doc """
  Attaches a test-scoped telemetry handler and returns a cleanup function.

  `handler_fun` is attached to `event` under a freshly generated id, with `nil` as
  the handler config. The returned zero-arity function detaches that handler and is
  safe to call more than once.
  """
  def attach_test_handler(test_id, event, handler_fun) do
    handler_id = "#{id_base(test_id)}_#{:erlang.unique_integer([:positive])}"
    :telemetry.attach(handler_id, event, handler_fun, nil)

    # Return cleanup function
    fn -> safe_detach(handler_id) end
  end

  @doc """
  Detaches every telemetry handler that belongs to `test_id`.

  A handler belongs to the test when its id is the string `"test_<test_id>"` or
  starts with `"test_<test_id>_"`. Matching on the delimited prefix keeps test `1`
  from removing the handlers of test `12`, and handlers with non-string ids
  (attached by other code) are skipped instead of raising.

  Returns `:ok`.
  """
  def cleanup_test_handlers(test_id) do
    base = id_base(test_id)

    for %{id: id} <- :telemetry.list_handlers([]), owned_by_test?(id, base) do
      safe_detach(id)
    end

    :ok
  end

  # Common id stem shared by all handlers of one test.
  defp id_base(test_id), do: "test_#{test_id}"

  # Telemetry handler ids may be any term; only string ids can match a test stem.
  defp owned_by_test?(id, base) when is_binary(id),
    do: id == base or String.starts_with?(id, base <> "_")

  defp owned_by_test?(_id, _base), do: false

  # Best-effort detach: cleanup must never fail the test that triggered it.
  defp safe_detach(handler_id) do
    :telemetry.detach(handler_id)
  catch
    _kind, _reason -> :ok
  end
end
Pattern 4: ETS Table Isolation
# ❌ WRONG: Named ETS tables shared across tests
# The name `:my_table` is node-wide: while the table exists, another `:ets.new/2`
# call with the same name raises ArgumentError, and every test that touches
# `:my_table` reads and writes the same rows.
:ets.new(:my_table, [:named_table, :public])
# ✅ RIGHT: Test-scoped ETS tables
defmodule TestETSManager do
  @moduledoc """
  Creates named ETS tables whose names are unique to a single test.
  """

  @doc """
  Creates a public named ETS table called `:"<name>_<test_id>"`, owned by the
  calling process.

  Returns a zero-arity cleanup function. Calling it deletes the table and returns
  `:ok`; it also returns `:ok` when the table no longer exists (for example because
  its owner has already terminated), so it can be called more than once.

  Raises `ArgumentError` if a table with the same name already exists.
  """
  def create_test_table(test_id, name) do
    table_name = :"#{name}_#{test_id}"
    :ets.new(table_name, [:named_table, :public])

    # Return cleanup function
    fn ->
      try do
        :ets.delete(table_name)
      rescue
        # :ets.delete/1 fails with badarg (ArgumentError) when the table is gone.
        # Nothing else is rescued, so unrelated failures stay visible.
        ArgumentError -> :ok
      end

      :ok
    end
  end
end
Pattern 5: Registry Isolation
# Test setup with isolated registry
setup do
  test_id = :erlang.unique_integer([:positive])
  registry_name = :"test_registry_#{test_id}"

  # start_supervised!/1 runs the registry under ExUnit's per-test supervisor, which
  # shuts it down when the test finishes. A registry started with
  # Registry.start_link/1 is linked to the test process instead, and on_exit
  # callbacks only run after that process has exited, so a manual
  # `whereis` + `GenServer.stop` there races with the registry's own shutdown.
  start_supervised!({Registry, keys: :unique, name: registry_name})

  %{registry: registry_name, test_id: test_id}
end
Standard Test Configuration
Base Test Module
defmodule Foundation.TestConfig do
  @moduledoc """
  Base configuration for tests that need an isolated supervision tree.

      use Foundation.TestConfig, :isolated
      use Foundation.TestConfig, :signal_routing

  Both forms set up `ExUnit.Case` with `async: false`, alias
  `Foundation.TestIsolation`, import this module, and register a `setup` callback
  that merges the corresponding setup map into the test context.
  """

  # Needed by the setup functions below; the alias inside `__using__/1` only
  # applies to the module that calls `use`.
  alias Foundation.TestIsolation

  @doc false
  defmacro __using__(type) do
    setup_block = apply_test_type(type)

    quote do
      use ExUnit.Case, async: false # Start with serial, optimize later
      alias Foundation.TestIsolation
      import Foundation.TestConfig
      unquote(setup_block)
    end
  end

  # Builds the quoted `setup` callback for the requested test type.
  defp apply_test_type(:isolated) do
    quote do
      setup do
        isolated_foundation_setup()
      end
    end
  end

  defp apply_test_type(:signal_routing) do
    quote do
      setup do
        signal_routing_setup()
      end
    end
  end

  defp apply_test_type(other) do
    raise ArgumentError,
          "unsupported test type #{inspect(other)} for `use Foundation.TestConfig`; " <>
            "expected :isolated or :signal_routing"
  end

  @doc """
  Starts an isolated supervision tree and schedules its shutdown for test exit.

  `opts` is forwarded to `Foundation.TestIsolation.start_isolated_test/1`. Must be
  called from the test process (for example inside `setup`), because it registers
  an `on_exit` callback.

  Returns `%{test_context: test_context, supervisor: supervisor}`.
  """
  def isolated_foundation_setup(opts \\ []) do
    {:ok, supervisor, test_context} = TestIsolation.start_isolated_test(opts)

    # Fully qualified: this module does not `use ExUnit.Case`, so `on_exit` is not
    # imported here.
    ExUnit.Callbacks.on_exit(fn ->
      TestIsolation.stop_isolated_test(supervisor)
    end)

    %{test_context: test_context, supervisor: supervisor}
  end

  @doc """
  Runs `isolated_foundation_setup/1` and adds the test-scoped signal router.

  Returns the base setup map extended with `:router_pid` and `:signal_bus_name`.
  """
  def signal_routing_setup(opts \\ []) do
    base_setup = isolated_foundation_setup(opts)
    test_context = base_setup.test_context

    # The router is already a child of the isolated supervisor, so it is looked up
    # rather than started again, and it needs no on_exit of its own: stopping the
    # supervisor stops it.
    {:ok, router_pid} = fetch_test_signal_router(test_context)

    Map.merge(base_setup, %{
      router_pid: router_pid,
      signal_bus_name: test_context.signal_bus_name
    })
  end

  # Resolves the pid of the router started under the test supervisor.
  # NOTE(review): assumes the router registers itself locally under the `name`
  # option it is started with - confirm against the router implementation.
  defp fetch_test_signal_router(%{router_name: router_name}) do
    case Process.whereis(router_name) do
      pid when is_pid(pid) -> {:ok, pid}
      nil -> {:error, {:router_not_registered, router_name}}
    end
  end
end
Usage in Tests
defmodule MyFeatureTest do
  use Foundation.TestConfig, :signal_routing

  test "feature works correctly", %{test_context: ctx, signal_bus_name: bus} do
    # Every service comes from this test's own supervision tree, so nothing here
    # can be affected by another test.
    ctx
    |> start_test_agent()
    |> Bridge.emit_signal(%{type: "test"}, bus: bus)

    assert_receive {:signal_received, "test"}
  end
end
Migration Strategy
Phase 1: Infrastructure Setup
- Create `TestIsolation` module
- Create `TestConfig` base module
- Create isolated versions of core services
- Set up cleanup helpers
Phase 2: Critical Path Migration
- Identify most problematic test files (flaky tests)
- Convert to isolated pattern
- Verify reliability improvement
- Document patterns for team
Phase 3: Gradual Migration
- Convert test files one by one
- Update CI to catch contamination
- Add linting rules for global state usage
- Training and documentation
Phase 4: Optimization
- Re-enable `async: true` for truly isolated tests
- Performance profiling
- Resource usage optimization
- Advanced patterns (test pooling, etc.)
Troubleshooting Guide
Common Issues
Symptom | Root Cause | Solution |
---|---|---|
Tests pass individually, fail in suite | Shared process names | Use test-scoped names |
“Process not alive” errors | Process cleanup race conditions | Use defensive cleanup with try/catch |
Timeout assertions | Missing telemetry handlers | Use isolated telemetry setup |
ETS table conflicts | Named ETS tables | Use test-scoped ETS table names |
Registry conflicts | Shared registry processes | Create test-specific registries |
Debugging Test Contamination
# Add to failing tests to identify contamination
setup do
  # Snapshot the globally visible state before the test body runs.
  processes_at_start = Process.registered()
  handlers_at_start = :telemetry.list_handlers([])

  on_exit(fn ->
    # Take both "after" snapshots first, then report whatever the test left behind.
    processes_at_end = Process.registered()
    handlers_at_end = :telemetry.list_handlers([])

    case processes_at_end -- processes_at_start do
      [] -> nil
      leaked_processes -> IO.puts("LEAKED PROCESSES: #{inspect(leaked_processes)}")
    end

    case handlers_at_end -- handlers_at_start do
      [] -> nil
      leaked_handlers -> IO.puts("LEAKED HANDLERS: #{inspect(leaked_handlers)}")
    end
  end)

  :ok
end
Production Examples
Phoenix Applications
# Phoenix uses this pattern for isolated channel tests
use MyAppWeb.ChannelCase # Provides isolated PubSub, Endpoint
Ecto Applications
# Ecto uses this for isolated database tests
use MyApp.DataCase # Provides isolated database connection
OTP Applications
# Standard OTP pattern for service tests
use MyApp.ServiceCase # Provides isolated supervision tree
Best Practices Summary
✅ DO
- Use test supervision trees for complete isolation
- Generate unique names for all test processes
- Clean up all resources in `on_exit` callbacks
- Start with `async: false`, optimize later
- Use defensive cleanup with try/catch
- Monitor test reliability metrics
❌ DON’T
- Share named processes across tests
- Use global state (Application config, ETS, etc.)
- Rely on test ordering
- Use `Process.sleep()` for coordination
- Leave resources uncleaned
- Ignore intermittent test failures
🔧 TOOLS
- `Supervisor.start_link/2` - Test supervision trees
- `:erlang.unique_integer/1` - Unique test IDs
- `on_exit/1` - Automatic cleanup
- `ExUnit.Case, async: false` - Serial execution when needed
- `:telemetry.list_handlers/1` - Debug telemetry state
Conclusion
Proper test isolation is essential for reliable Elixir test suites. The patterns in this document eliminate test contamination through:
- Complete Process Isolation - Each test gets own supervision tree
- Zero Shared State - Test-scoped names for all resources
- Automatic Cleanup - OTP supervision handles process lifecycle
- Standard Patterns - Production-proven approaches from major projects
Result: Reliable, fast, maintainable test suites that scale with your application.
Implementation: Start with critical path tests, migrate gradually, measure improvements.