Long-Term Test Environment Improvements for DSPex
Overview
This document outlines a comprehensive strategy for improving the DSPex test environment, with a focus on long-term maintainability and scalability and on resolving the current DSPy integration issues.
Current Issues to Address
1. Language Model Configuration
- Problem: “No LM is loaded” errors in integration tests
- Root Cause: DSPy requires a configured language model (Gemini) before any module can run, but the tests never initialize one
2. Test Isolation
- Problem: Program ID conflicts, shared state between tests
- Root Cause: Tests use hardcoded IDs and share global resources
3. Resource Management
- Problem: orphaned Python processes left behind by test runs, leaked ports, and broken pipes
- Root Cause: no centralized cleanup or resource lifecycle management
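To make the isolation problem concrete, here is a minimal sketch of the fix direction; `DSPex.Adapter.create_program/1` and `delete_program/1` are hypothetical stand-ins for the real adapter API:

```elixir
defmodule DSPex.IsolationSketchTest do
  use ExUnit.Case, async: true

  test "unique program IDs avoid cross-test collisions" do
    # A hardcoded id like "my_program" collides under async: true;
    # derive one per test instead.
    id = "test_program_#{System.unique_integer([:positive])}"

    {:ok, _program} = DSPex.Adapter.create_program(%{id: id})

    # on_exit runs even when the test fails, so nothing leaks
    on_exit(fn -> DSPex.Adapter.delete_program(id) end)
  end
end
```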
Proposed Architecture
1. Test Environment Layers (Enhanced)
```elixir
defmodule DSPex.Test.Environment do
  @moduledoc """
  Centralized test environment configuration and management.
  """

  defstruct [
    :layer,
    :adapter,
    :python_enabled,
    :pooling_enabled,
    :lm_config,
    :isolation_level,
    :resource_manager
  ]

  @layers %{
    unit: %{
      adapter: :mock,
      python_enabled: false,
      pooling_enabled: false,
      lm_config: :mock,
      isolation_level: :full
    },
    integration: %{
      adapter: :bridge_mock,
      python_enabled: true,
      pooling_enabled: false,
      lm_config: :cached,
      isolation_level: :namespace
    },
    e2e: %{
      adapter: :python_pool,
      python_enabled: true,
      pooling_enabled: true,
      lm_config: :real,
      isolation_level: :session
    }
  }

  @doc "Builds the environment struct for a layer (:unit | :integration | :e2e)."
  def setup(layer) do
    @layers
    |> Map.fetch!(layer)
    |> Map.put(:layer, layer)
    |> then(&struct!(__MODULE__, &1))
  end
end
```
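Calling `setup/1` with a layer name yields the struct consumed by `test_helper.exs` (Section 7):

```elixir
iex> DSPex.Test.Environment.setup(:integration)
%DSPex.Test.Environment{
  layer: :integration,
  adapter: :bridge_mock,
  python_enabled: true,
  pooling_enabled: false,
  lm_config: :cached,
  isolation_level: :namespace,
  resource_manager: nil
}
```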
2. Language Model Test Configuration
```elixir
defmodule DSPex.Test.LMConfig do
  @moduledoc """
  Manages language model configuration for different test scenarios.
  """

  def setup_test_lm(mode) do
    case mode do
      :mock ->
        # Deterministic mock LM for unit tests
        setup_mock_lm()

      :cached ->
        # Cached (record/replay) responses for integration tests
        setup_cached_lm()

      :real ->
        # Real Gemini API with test quotas
        setup_real_lm()
    end
  end

  defp setup_mock_lm do
    # Configure DSPy with a mock LM that returns predictable responses
    %{
      type: :mock,
      responses: %{
        "test_input" => "test_output",
        default: "mock_response"
      }
    }
  end

  defp setup_cached_lm do
    # VCR-like response caching for deterministic integration tests
    %{
      type: :cached,
      cache_dir: "test/fixtures/lm_responses",
      fallback: :record  # Record new responses when not cached
    }
  end

  defp setup_real_lm do
    # Real API with rate limiting and cost controls
    %{
      type: :gemini,
      api_key: System.get_env("GEMINI_TEST_API_KEY"),
      rate_limit: 10,    # requests per minute
      cost_limit: 1.00,  # dollars per test run
      model: "gemini-1.5-flash"  # cheaper model for tests
    }
  end
end
```
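The `:cached` mode can be approximated with a tiny record/replay helper. A minimal sketch, assuming Jason for JSON encoding and a flat file-per-prompt layout (both assumptions, not the final design):

```elixir
defmodule DSPex.Test.LMCache do
  @moduledoc "Illustrative record/replay cache for LM responses."

  @cache_dir "test/fixtures/lm_responses"

  @doc "Returns the cached response for `prompt`, recording via `record_fun` on a miss."
  def fetch(prompt, record_fun) do
    path = Path.join(@cache_dir, cache_key(prompt) <> ".json")

    case File.read(path) do
      {:ok, body} ->
        Jason.decode!(body)

      {:error, :enoent} ->
        # Cache miss: call the real LM once, then persist the response
        response = record_fun.(prompt)
        File.mkdir_p!(@cache_dir)
        File.write!(path, Jason.encode!(response))
        response
    end
  end

  # Key responses by a content hash so prompt changes invalidate the cache
  defp cache_key(prompt) do
    :crypto.hash(:sha256, prompt) |> Base.encode16(case: :lower)
  end
end
```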
3. Test Isolation Framework
```elixir
defmodule DSPex.Test.Isolation do
  @moduledoc """
  Provides test isolation mechanisms at different levels.
  """

  defmacro isolated_test(name, opts \\ [], do: block) do
    quote do
      test unquote(name), var!(context) do
        isolation = DSPex.Test.Isolation.setup(unquote(opts))

        try do
          # Create an isolated namespace for this test's resources
          namespace = DSPex.Test.Isolation.create_namespace(isolation)

          # var! makes the enriched context visible inside the user's block
          var!(context) =
            Map.merge(var!(context), %{
              namespace: namespace,
              program_prefix: namespace,
              session_id: "#{namespace}_session",
              adapter: DSPex.Test.Isolation.create_adapter(isolation, namespace)
            })

          unquote(block)
        after
          DSPex.Test.Isolation.cleanup(isolation)
        end
      end
    end
  end

  def setup(opts) do
    %{
      id: generate_test_id(),
      resources: [],
      cleanup_tasks: [],
      level: Keyword.get(opts, :level, :full)
    }
  end

  def create_namespace(isolation), do: "ns_#{isolation.id}"

  def create_adapter(_isolation, namespace) do
    # The concrete adapter comes from the configured layer (Section 1);
    # here it is only scoped to the namespace.
    env = Application.fetch_env!(:dspex, :test_environment)
    {env.adapter, namespace}
  end

  def cleanup(isolation) do
    # Run all cleanup tasks in reverse registration order
    isolation.cleanup_tasks
    |> Enum.reverse()
    |> Enum.each(& &1.())
  end

  defp generate_test_id, do: System.unique_integer([:positive, :monotonic])
end
```
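In a test module the macro reads like a drop-in replacement for `test/3`:

```elixir
defmodule DSPex.ProgramIsolationTest do
  use ExUnit.Case, async: true
  import DSPex.Test.Isolation

  isolated_test "resources stay inside the test namespace" do
    # The macro injects namespace-scoped keys into `context`
    assert String.starts_with?(context.session_id, context.namespace)
  end
end
```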
4. Resource Management
```elixir
defmodule DSPex.Test.ResourceManager do
  @moduledoc """
  Manages the lifecycle of test resources (ports, processes, temp files).
  """

  use GenServer

  def start_link(opts) do
    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
  end

  def init(_opts) do
    # Set up resource tracking
    {:ok,
     %{
       resources: %{
         ports: MapSet.new(),
         processes: MapSet.new(),
         temp_files: MapSet.new()
       },
       cleanup_on_exit: true
     }}
  end

  def track_port(port), do: GenServer.call(__MODULE__, {:track, :ports, port})
  def track_process(pid), do: GenServer.call(__MODULE__, {:track, :processes, pid})
  def cleanup_all, do: GenServer.call(__MODULE__, :cleanup_all)

  def handle_call({:track, type, resource}, _from, state) do
    # Monitor tracked processes so they can be dropped when they exit
    if type == :processes, do: Process.monitor(resource)
    {:reply, :ok, update_in(state, [:resources, type], &MapSet.put(&1, resource))}
  end

  def handle_call(:cleanup_all, _from, state) do
    release_all(state.resources)
    empty = %{ports: MapSet.new(), processes: MapSet.new(), temp_files: MapSet.new()}
    {:reply, :ok, %{state | resources: empty}}
  end

  # Automatic cleanup on tracked process exit
  def handle_info({:DOWN, _ref, :process, pid, _reason}, state) do
    {:noreply, update_in(state, [:resources, :processes], &MapSet.delete(&1, pid))}
  end

  defp release_all(%{ports: ports, processes: processes, temp_files: files}) do
    Enum.each(ports, fn port -> if Port.info(port), do: Port.close(port) end)
    Enum.each(processes, &Process.exit(&1, :kill))
    Enum.each(files, &File.rm/1)
  end
end
```
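Wired into a test's `setup`, tracking looks like this (the bridge script path is illustrative):

```elixir
setup do
  # Spawn the bridge process and register it for cleanup
  port = Port.open({:spawn, "python3 priv/python/dspy_bridge.py"}, [:binary])
  :ok = DSPex.Test.ResourceManager.track_port(port)

  # Belt and braces: force a sweep when this test process exits
  on_exit(fn -> DSPex.Test.ResourceManager.cleanup_all() end)

  {:ok, port: port}
end
```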
5. Python Bridge Test Manager
```elixir
defmodule DSPex.Test.PythonBridgeManager do
  @moduledoc """
  Manages the Python bridge lifecycle for tests, including DSPy/LM initialization.
  """

  def setup_bridge(opts) do
    mode = Keyword.get(opts, :mode, :isolated)

    case mode do
      :mock -> setup_mock_bridge()
      :isolated -> setup_isolated_bridge(opts)
      :shared -> setup_shared_bridge(opts)
    end
  end

  defp setup_mock_bridge do
    # No Python process: tests talk to the in-memory mock adapter instead
    :mock_bridge
  end

  defp setup_shared_bridge(opts) do
    # Placeholder: reuse one bridge per suite run, namespacing sessions per test
    setup_isolated_bridge(Keyword.put(opts, :shared, true))
  end

  defp setup_isolated_bridge(opts) do
    # Each test gets its own Python process
    config = %{
      mode: :pool_worker,
      session_namespace: opts[:namespace],
      lm_config: DSPex.Test.LMConfig.setup_test_lm(opts[:lm_mode] || :mock),
      cleanup_on_exit: true
    }

    # Start the bridge with the test configuration
    {:ok, bridge} = DSPex.PythonBridge.start_link(config)

    # Initialize DSPy with the test LM so "No LM is loaded" cannot occur
    :ok = initialize_test_dspy(bridge, config.lm_config)

    bridge
  end

  defp initialize_test_dspy(bridge, lm_config) do
    # Send the initialization command to the Python bridge
    DSPex.PythonBridge.execute(bridge, :init_test_lm, lm_config)
  end
end
```
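Combined with the isolation context from Section 3, an integration case can request a private bridge in `setup` (option names as defined above):

```elixir
setup %{namespace: namespace} do
  bridge =
    DSPex.Test.PythonBridgeManager.setup_bridge(
      mode: :isolated,
      namespace: namespace,
      lm_mode: :cached
    )

  {:ok, bridge: bridge}
end
```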
6. Test Fixtures and Factories
```elixir
defmodule DSPex.Test.Fixtures do
  @moduledoc """
  Provides test fixtures and factories for consistent test data.
  """

  use ExMachina

  def program_factory do
    %{
      id: sequence(:program_id, &"test_program_#{&1}_#{:rand.uniform(10_000)}"),
      signature: signature_factory(),
      metadata: %{
        test: true,
        created_at: DateTime.utc_now()
      }
    }
  end

  def signature_factory do
    %{
      inputs: [
        %{name: "input", type: "string", description: "Test input"}
      ],
      outputs: [
        %{name: "output", type: "string", description: "Test output"}
      ]
    }
  end

  def session_factory do
    %{
      id: sequence(:session_id, &"test_session_#{&1}_#{System.unique_integer([:positive])}"),
      user_id: sequence(:user_id, &"test_user_#{&1}"),
      started_at: DateTime.utc_now()
    }
  end
end
```
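Tests then build data through ExMachina's standard helpers:

```elixir
import DSPex.Test.Fixtures

program = build(:program)
session = build(:session, user_id: "explicit_user")

assert program.metadata.test
```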
7. Enhanced Test Helper
```elixir
# test/test_helper.exs

# Load the test framework support modules
Code.require_file("support/test_environment.ex", __DIR__)
Code.require_file("support/lm_config.ex", __DIR__)
Code.require_file("support/isolation.ex", __DIR__)
Code.require_file("support/resource_manager.ex", __DIR__)

# Start the resource manager
{:ok, _} = DSPex.Test.ResourceManager.start_link([])

# Configure the test environment based on TEST_MODE (unit | integration | e2e)
test_mode = System.get_env("TEST_MODE", "unit") |> String.to_existing_atom()
test_env = DSPex.Test.Environment.setup(test_mode)

# Configure the application
Application.put_env(:dspex, :test_environment, test_env)
Application.put_env(:dspex, :python_bridge_enabled, test_env.python_enabled)
Application.put_env(:dspex, :pooling_enabled, test_env.pooling_enabled)

# Set up the LM configuration
lm_config = DSPex.Test.LMConfig.setup_test_lm(test_env.lm_config)
Application.put_env(:dspex, :test_lm_config, lm_config)

# Exclude the tags that do not belong to the current mode
exclude_tags =
  case test_mode do
    :unit -> [:integration, :e2e]
    :integration -> [:e2e]
    :e2e -> []
  end

# Give slower layers more headroom (values illustrative)
timeout =
  case test_mode do
    :unit -> 5_000
    :integration -> 30_000
    :e2e -> 120_000
  end

# Configure ExUnit
ExUnit.configure(
  exclude: exclude_tags,
  timeout: timeout,
  max_failures: 1  # Stop on the first failure in CI
)

# Start ExUnit
ExUnit.start()

# Cleanup hook
System.at_exit(fn _exit_code ->
  DSPex.Test.ResourceManager.cleanup_all()
end)
```
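The exclude list assumes each test module tags itself with its layer, using standard ExUnit tags, for example:

```elixir
defmodule DSPex.BridgeIntegrationTest do
  use ExUnit.Case, async: false

  # Picked up by `mix test --only integration` and skipped when
  # TEST_MODE=unit excludes the :integration tag.
  @moduletag :integration

  test "tagged tests are routed by mode" do
    assert true
  end
end
```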
8. Test Configuration Schema
```elixir
# config/test.exs
import Config

config :dspex, :test,
  # Resource limits
  max_python_processes: 4,
  max_pool_size: 2,
  process_timeout: 30_000,

  # LM configuration per layer
  lm_modes: %{
    unit: :mock,
    integration: :cached,
    e2e: :real
  },

  # Cleanup policies
  cleanup_policy: :aggressive,
  retain_on_failure: true,

  # Performance
  parallel: true,
  max_parallel_cases: System.schedulers_online()
```
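At runtime the limits are read back with the standard `Application.get_env/3`:

```elixir
test_config = Application.get_env(:dspex, :test, [])

max_python = Keyword.get(test_config, :max_python_processes, 4)
pool_size = Keyword.get(test_config, :max_pool_size, 2)
```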
9. CI/CD Integration
```yaml
# .github/workflows/test.yml
name: Test Suite

on: [push, pull_request]

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    env:
      TEST_MODE: unit
      MIX_ENV: test
    steps:
      - uses: actions/checkout@v4
      - uses: erlef/setup-beam@v1
        with:
          # Pin to the project's actual Elixir/OTP versions
          elixir-version: "1.16"
          otp-version: "26"
      - name: Install dependencies
        run: mix deps.get
      - name: Run unit tests
        run: mix test --only unit

  integration-tests:
    runs-on: ubuntu-latest
    env:
      TEST_MODE: integration
      MIX_ENV: test
      # Use cached LM responses
      LM_CACHE_DIR: test/fixtures/lm_responses
    steps:
      - uses: actions/checkout@v4
      - uses: erlef/setup-beam@v1
        with:
          elixir-version: "1.16"
          otp-version: "26"
      - name: Install dependencies
        run: mix deps.get
      - name: Cache LM responses
        uses: actions/cache@v4
        with:
          path: test/fixtures/lm_responses
          key: lm-responses-${{ hashFiles('test/**/*.exs') }}
      - name: Run integration tests
        run: mix test --only integration

  e2e-tests:
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    env:
      TEST_MODE: e2e
      MIX_ENV: test
      GEMINI_TEST_API_KEY: ${{ secrets.GEMINI_TEST_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - uses: erlef/setup-beam@v1
        with:
          elixir-version: "1.16"
          otp-version: "26"
      - name: Install dependencies
        run: mix deps.get
      - name: Run E2E tests
        run: mix test --only e2e --max-failures 5
```
Implementation Strategy
Phase 1: Foundation (Weeks 1-2)
- Implement TestEnvironment module
- Create ResourceManager for cleanup
- Add LMConfig for mock/cached/real modes
- Update test_helper.exs
Phase 2: Isolation (Weeks 3-4)
- Implement Isolation framework
- Add namespace support to adapters
- Create test factories
- Update existing tests to use isolation
Phase 3: Python Bridge (Weeks 5-6)
- Implement PythonBridgeManager
- Add DSPy initialization in tests
- Create LM response caching system
- Fix remaining integration tests
Phase 4: CI/CD (Weeks 7-8)
- Set up test matrix in CI
- Add performance benchmarks
- Implement cost monitoring for LM usage
- Create test reporting dashboard
Benefits
- Reliability: Deterministic tests with proper isolation
- Performance: Parallel execution with resource pooling
- Cost Control: LM API usage monitoring and caching
- Debugging: Better error messages and test artifacts
- Scalability: Easy to add new test scenarios
- Maintainability: Clear separation of concerns
Success Metrics
- Test execution time < 5 minutes
- Zero flaky tests
- LM API costs < $10/month
- 100% test coverage for critical paths
- Zero leaked resources (ports, processes, temp files)