Foundation Distributed Tracing Guide
This guide explains how to use Foundation’s distributed tracing capabilities to track operations across services and processes.
Overview
Foundation provides span-based distributed tracing that integrates with OpenTelemetry and other tracing systems. This allows you to:
- Track operations across multiple services
- Understand request flow and latencies
- Debug complex distributed systems
- Monitor performance bottlenecks
Basic Usage
Simple Spans
import Foundation.Telemetry.Span
# Basic span
with_span :database_query do
Database.query("SELECT * FROM users")
end
# Span with metadata
with_span :api_request, %{endpoint: "/users", method: "GET"} do
HTTP.get("/users")
end
Nested Spans
# Traces one request as a parent span with three child spans.
with_span :handle_request, %{request_id: request_id} do
  # Validate request
  with_span :validate_request do
    validate(params)
  end

  # Process business logic.
  # Bind the span's return value: a variable assigned inside the span body
  # is not reliably visible to the sibling span below.
  result =
    with_span :process_business_logic do
      processed = process(params)

      # Add attributes to current span
      add_attributes(%{
        items_processed: length(processed),
        processing_status: "success"
      })

      processed
    end

  # Send response
  with_span :send_response do
    send_response(conn, result)
  end
end
Recording Events
with_span :long_operation do
# Record checkpoints
record_event(:started_processing)
data = fetch_data()
record_event(:data_fetched, %{record_count: length(data)})
result = process_data(data)
record_event(:processing_complete, %{duration_ms: 1234})
result
end
Cross-Process Tracing
Propagating Context
# Parent process
with_span :parent_operation do
context = propagate_context()
# Send to another process
GenServer.call(worker, {:process, data, context})
# Or use with Task
Task.async(fn ->
with_propagated_context(context, fn ->
with_span :async_operation do
do_work()
end
end)
end)
end
# Worker process
# Handles a `{:process, data, context}` call: runs the work under the
# propagated trace `context`, inside a `:worker_processing` span, and
# replies with the processed data.
def handle_call({:process, data, context}, _from, state) do
  traced_reply = fn ->
    with_span :worker_processing do
      {:reply, process_data(data), state}
    end
  end

  with_propagated_context(context, traced_reply)
end
HTTP Request Tracing
defmodule MyApp.HTTPClient do
  @moduledoc """
  Example HTTP client that wraps every request in an `:http_request` span.
  """

  import Foundation.Telemetry.Span

  @doc """
  Performs an HTTP request inside an `:http_request` span.

  The span is tagged with `method` and `url`. When `current_trace_id/0`
  returns a trace id, it is appended to `headers` as an `x-trace-id` header.
  On success the span records the status code and response body size; on
  failure it records the error reason as a string.

  Returns the `{:ok, response}` or `{:error, reason}` tuple from
  `HTTP.request/4` unchanged.
  """
  def request(method, url, body, headers \\ []) do
    with_span :http_request, %{method: method, url: url} do
      # Add trace headers
      headers = headers ++ build_trace_headers()

      # Make request
      case HTTP.request(method, url, body, headers) do
        {:ok, %{status: status} = response} ->
          add_attributes(%{
            status_code: status,
            response_size: byte_size(response.body)
          })

          {:ok, response}

        {:error, reason} = error ->
          # `reason` can be any term (tuple, struct, ...). Store it as a
          # string, matching the `inspect/1` convention this guide uses for
          # error attributes in the batch-processing example.
          add_attributes(%{error: inspect(reason)})
          error
      end
    end
  end

  # Builds the trace header list; empty when `current_trace_id/0` returns nil.
  defp build_trace_headers do
    case current_trace_id() do
      nil -> []
      trace_id -> [{"x-trace-id", trace_id}]
    end
  end
end
Integration with Cache
defmodule MyApp.Cache do
  @moduledoc """
  Example read-through cache whose lookups are traced with spans.
  """

  import Foundation.Telemetry.Span

  alias Foundation.Infrastructure.Cache, as: InfraCache

  @doc """
  Looks up `key` inside a `:cache_get` span.

  The span is tagged with `cache_hit: true | false`. On a miss (the cache
  returns `nil`) the value is fetched from the database and written back to
  the cache before being returned.
  """
  def get(key) do
    with_span :cache_get, %{cache_key: key} do
      cached = InfraCache.get(key)
      add_attributes(%{cache_hit: not is_nil(cached)})

      if is_nil(cached) do
        load_and_store(key)
      else
        cached
      end
    end
  end

  # Fetches `key` from the source inside `:fetch_from_source`, stores it in
  # the cache inside a nested `:cache_put` span, and returns the value.
  defp load_and_store(key) do
    with_span :fetch_from_source do
      fetched = fetch_from_database(key)

      with_span :cache_put do
        InfraCache.put(key, fetched)
      end

      fetched
    end
  end
end
OpenTelemetry Integration
Setup
- Add dependencies:
defp deps do
[
{:opentelemetry, "~> 1.3"},
{:opentelemetry_exporter, "~> 1.3"},
{:opentelemetry_api, "~> 1.2"}
]
end
- Configure OpenTelemetry:
# config/config.exs
config :opentelemetry, :resource, [
service: [
name: "my-service",
version: "1.0.0"
]
]
config :opentelemetry,
span_processor: :batch,
traces_exporter: :otlp
config :opentelemetry_exporter,
otlp_protocol: :grpc,
otlp_endpoint: "http://localhost:4317"
# Enable Foundation bridge
config :foundation, :opentelemetry,
enabled: true,
service_name: "my-service"
- Start the bridge:
children = [
# ... other children ...
Foundation.Telemetry.OpenTelemetryBridge
]
Advanced Patterns
Circuit Breaker with Tracing
defmodule MyApp.ExternalService do
  @moduledoc """
  Example of tracing a call that goes through a circuit breaker.
  """

  import Foundation.Telemetry.Span

  alias Foundation.Services.CircuitBreaker

  @breaker :payment_api
  @timeout_ms 5000

  @doc """
  Posts `params` to `/charge` through the `:payment_api` circuit breaker.

  The whole call runs in an `:external_api_call` span; the HTTP request
  itself runs in a nested `:http_request` span.
  """
  def call_api(params) do
    with_span :external_api_call, %{service: "payment_api"} do
      CircuitBreaker.call(@breaker, fn -> post_charge(params) end, timeout: @timeout_ms)
    end
  end

  # Performs the HTTP POST inside its own `:http_request` span.
  defp post_charge(params) do
    with_span :http_request do
      HTTP.post("/charge", params)
    end
  end
end
Retry with Tracing
defmodule MyApp.DataProcessor do
  @moduledoc """
  Example of tracing an operation that is retried.
  """

  import Foundation.Telemetry.Span

  alias Foundation.Services.RetryService

  @retry_opts [max_attempts: 3, backoff: :exponential]

  @doc """
  Processes `data` with retries inside a `:data_processing` span.

  Each attempt records a `:retry_attempt` event on the current span and runs
  in its own `:process_attempt` span tagged with the attempt number.
  """
  def process_with_retry(data) do
    with_span :data_processing, %{retry_enabled: true} do
      RetryService.retry(fn attempt -> run_attempt(data, attempt) end, @retry_opts)
    end
  end

  # Records the attempt event, then processes `data` in a per-attempt span.
  defp run_attempt(data, attempt) do
    record_event(:retry_attempt, %{attempt: attempt})

    with_span :process_attempt, %{attempt: attempt} do
      process_data(data)
    end
  end
end
Batch Processing
defmodule MyApp.BatchProcessor do
  @moduledoc """
  Example of tracing a batch with one child span per item.
  """

  import Foundation.Telemetry.Span

  @doc """
  Processes every item of `items` inside a `:batch_processing` span.

  Returns a list with one `{:ok, result}` or `{:error, exception}` entry per
  item, in input order.
  """
  def process_batch(items) do
    with_span :batch_processing, %{batch_size: length(items)} do
      for {item, index} <- Enum.with_index(items) do
        traced_item(item, index)
      end
    end
  end

  # Processes one item in its own `:process_item` span. The exception is
  # rescued inside the span, so the outcome is recorded on the span by hand.
  defp traced_item(item, index) do
    with_span :process_item, %{item_index: index} do
      try do
        outcome = process_item(item)
        add_attributes(%{status: "success"})
        {:ok, outcome}
      rescue
        exception ->
          add_attributes(%{status: "error", error: inspect(exception)})
          {:error, exception}
      end
    end
  end
end
Monitoring and Visualization
Jaeger
- Run Jaeger:
docker run -d --name jaeger \
-p 16686:16686 \
-p 4317:4317 \
jaegertracing/all-in-one:latest
- View traces at http://localhost:16686
Grafana Tempo
- Add to docker-compose.yml:
tempo:
image: grafana/tempo:latest
command: [ "-config.file=/etc/tempo.yaml" ]
volumes:
- ./tempo.yaml:/etc/tempo.yaml
- tempo_data:/tmp/tempo
ports:
- "3200:3200" # tempo
- "4317:4317" # otlp grpc
- Configure Grafana data source to use Tempo
Best Practices
1. Span Naming
Use descriptive, consistent names:
- ✅ database.query
- ✅ cache.get
- ✅ http.request
- ❌ operation1
- ❌ do_stuff
2. Attribute Guidelines
Include relevant context:
with_span :api_request, %{
# Good attributes
endpoint: "/users",
method: "GET",
user_type: "premium",
# Avoid high cardinality
# user_id: user_id, # Don't include unique IDs
}
3. Error Handling
Always let spans capture errors:
# Good - the exception propagates out of the span body, so the span captures it
with_span :risky_operation do
  risky_function() # May raise
end

# Bad - the exception is rescued inside the span body, so the span never sees it.
# If the error must be handled, wrap the whole span in try/rescue instead.
with_span :risky_operation do
  try do
    risky_function()
  rescue
    e -> handle_error(e)
  end
end
4. Sampling
For high-volume operations:
# Decides whether the current request is traced: samples 10% of requests.
def should_trace?, do: :rand.uniform() < 0.1

# Runs `do_work/1`, wrapping it in a `:handle_request` span only when the
# request was selected by `should_trace?/0`.
def handle_request(params) do
  case should_trace?() do
    true ->
      with_span :handle_request, %{sampled: true} do
        do_work(params)
      end

    false ->
      do_work(params)
  end
end
Troubleshooting
Missing Spans
- Check if tracing is enabled:
Application.get_env(:foundation, :opentelemetry)[:enabled]
- Verify exporter configuration
- Check for errors in logs
Performance Impact
- Monitor overhead with:
# Logger's macros must be required before use.
require Logger

# Logs the duration reported with every [:foundation, :span, :stop] event.
# NOTE(review): `:telemetry.span/3` reports `duration` in native time units.
# If Foundation's span events follow that convention, convert with
# `System.convert_time_unit(d, :native, :microsecond)` before logging - confirm.
:telemetry.attach("trace-overhead", [:foundation, :span, :stop], fn _, %{duration: d}, _, _ ->
  Logger.debug("Span overhead: #{d}μs")
end, nil)
- Adjust sampling rate if needed
- Use async export for better performance
Context Propagation Issues
- Ensure context is properly passed:
# Debug context
context = propagate_context()
IO.inspect(context, label: "Trace context")
- Check for process boundaries
- Verify trace headers in HTTP requests