JSON Schema Generation for LLM Constraints

Overview

This document details how to generate optimized JSON Schemas specifically for constraining LLM outputs when generating pipeline configurations. Different LLM providers have varying levels of support for structured outputs, and schemas must be tailored to maximize success rates while maintaining expressiveness.

Provider-Specific Schema Support

1. Claude (Anthropic)

Supported Methods:

  • Tool Use (Function Calling) with JSON Schema
  • Direct JSON mode with schema hints
  • System prompts with schema examples

defmodule Pipeline.Schema.Claude do
  @moduledoc """
  Generate Claude-optimized schemas for pipeline generation.
  """
  
  def generate_tool_schema(requirements) do
    %{
      name: "create_pipeline",
      description: "Create a complete pipeline configuration",
      input_schema: build_claude_schema(requirements)
    }
  end
  
  defp build_claude_schema(requirements) do
    base_schema()
    |> apply_claude_optimizations()
    |> add_claude_constraints(requirements)
  end
  
  defp apply_claude_optimizations(schema) do
    schema
    |> use_descriptive_field_names()  # Claude responds well to clear naming
    |> add_detailed_descriptions()     # More context improves accuracy
    |> simplify_nested_structures()    # Flatten where possible
    |> add_examples_to_fields()        # Claude uses examples effectively
  end
  
  def generate_system_prompt_with_schema(schema) do
    """
    You must generate a pipeline configuration that EXACTLY matches this JSON schema:
    
    ```json
    #{Jason.encode!(schema, pretty: true)}
    ```
    
    Important constraints:
    - All required fields must be present
    - Field types must match exactly
    - Enum values must be from the provided list
    - Arrays must contain items matching the item schema
    """
  end
end
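
As a usage sketch (the requirements map is illustrative and the request shape follows the public Anthropic Messages API), the generated tool can be attached to a request and forced via tool_choice, so every response arrives as structured tool input:

# Hypothetical usage sketch; HTTP client wiring is omitted.
requirements = %{allowed_step_types: ["claude", "gemini", "parallel_claude"]}
tool = Pipeline.Schema.Claude.generate_tool_schema(requirements)

request_body = %{
  model: "claude-3-5-sonnet-20241022",
  max_tokens: 4096,
  tools: [tool],
  # Forcing the tool call guarantees output that conforms to input_schema
  tool_choice: %{type: "tool", name: "create_pipeline"},
  messages: [
    %{role: "user", content: "Create a pipeline that summarizes a document"}
  ]
}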

2. Gemini (Google)

Supported Methods:

  • Response Schema with response_mime_type: "application/json"
  • Function declarations with structured outputs
  • Strongly typed generation config

defmodule Pipeline.Schema.Gemini do
  @moduledoc """
  Generate Gemini-optimized schemas for pipeline generation.
  """
  
  def generate_response_schema(requirements) do
    %{
      type: "object",
      properties: build_gemini_properties(requirements),
      required: determine_required_fields(requirements)
    }
  end
  
  defp build_gemini_properties(requirements) do
    # Gemini prefers explicit type definitions
    %{
      "workflow" => %{
        type: "object",
        properties: %{
          "name" => %{type: "string"},
          "steps" => %{
            type: "array",
            items: gemini_step_schema(requirements)
          }
        }
      }
    }
  end
  
  def configure_generation(schema) do
    %{
      generation_config: %{
        response_mime_type: "application/json",
        response_schema: schema,
        temperature: 0.1,  # Lower for structured output
        candidate_count: 1
      }
    }
  end
end
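
A usage sketch for the generateContent request path (this assumes a client library that serializes these snake_case keys to the camelCase names the Gemini REST API expects):

# Hypothetical usage sketch; `requirements` and HTTP wiring are assumed.
requirements = %{allowed_step_types: ["claude", "gemini"]}
schema = Pipeline.Schema.Gemini.generate_response_schema(requirements)

request_body =
  %{contents: [%{role: "user", parts: [%{text: "Create a data processing pipeline"}]}]}
  |> Map.merge(Pipeline.Schema.Gemini.configure_generation(schema))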

3. OpenAI

Supported Methods:

  • Function calling with JSON Schema
  • Response format with JSON mode
  • Structured outputs (beta)

defmodule Pipeline.Schema.OpenAI do
  @moduledoc """
  Generate OpenAI-optimized schemas for pipeline generation.
  """
  
  def generate_function_schema(requirements) do
    %{
      name: "create_pipeline",
      description: "Generate a pipeline configuration",
      # `required` belongs inside the parameters JSON Schema itself,
      # so build_openai_schema/1 must set it there, not on the function object
      parameters: build_openai_schema(requirements)
    }
  end
  
  def generate_response_format(requirements) do
    %{
      type: "json_schema",
      json_schema: %{
        name: "pipeline_config",
        schema: build_openai_schema(requirements),
        # Strict mode also requires "additionalProperties": false on every
        # object and all properties listed in "required"
        strict: true
      }
    }
  end
end
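
A usage sketch for the response-format path (model name and prompts are placeholders; HTTP wiring is omitted):

# Hypothetical Chat Completions request body using structured outputs.
requirements = %{allowed_step_types: ["claude", "gemini"]}

request_body = %{
  model: "gpt-4o",
  response_format: Pipeline.Schema.OpenAI.generate_response_format(requirements),
  messages: [
    %{role: "system", content: "You generate pipeline configurations."},
    %{role: "user", content: "Create a two-step data pipeline"}
  ]
}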

Schema Generation Strategies

1. Progressive Complexity

defmodule Pipeline.Schema.Progressive do
  @moduledoc """
  Generate schemas with progressive complexity based on context.
  """
  
  def generate_schema(complexity_level, base_requirements) do
    case complexity_level do
      :minimal ->
        minimal_schema(base_requirements)
      :standard ->
        standard_schema(base_requirements)
      :advanced ->
        advanced_schema(base_requirements)
      :expert ->
        full_schema(base_requirements)
    end
  end
  
  defp minimal_schema(reqs) do
    %{
      "$schema" => "http://json-schema.org/draft-07/schema#",
      "type" => "object",
      "properties" => %{
        "workflow" => %{
          "type" => "object",
          "properties" => %{
            "name" => %{"type" => "string"},
            "steps" => %{
              "type" => "array",
              "items" => %{
                "type" => "object",
                "properties" => %{
                  "name" => %{"type" => "string"},
                  "type" => %{"type" => "string", "enum" => reqs.allowed_step_types},
                  "prompt" => %{"type" => "string"}
                },
                "required" => ["name", "type", "prompt"]
              }
            }
          },
          "required" => ["name", "steps"]
        }
      },
      "required" => ["workflow"]
    }
  end
  
  defp standard_schema(reqs) do
    minimal_schema(reqs)
    |> add_prompt_structure()
    |> add_step_options()
    |> add_validation_rules()
  end
  
  defp add_prompt_structure(schema) do
    put_in(
      schema,
      ["properties", "workflow", "properties", "steps", "items", "properties", "prompt"],
      %{
        "oneOf" => [
          %{"type" => "string"},
          %{
            "type" => "array",
            "items" => %{
              "type" => "object",
              "properties" => %{
                "type" => %{"type" => "string", "enum" => ["static", "file", "previous_response"]},
                "content" => %{"type" => "string"}
              }
            }
          }
        ]
      }
    )
  end
end
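
A sketch of how these levels can be used in practice: start minimal and escalate only when a simpler schema fails to produce a valid configuration. The generate_fun callback, which calls the LLM and validates the result, is an assumption:

defmodule Pipeline.Schema.Progressive.Escalation do
  @levels [:minimal, :standard, :advanced, :expert]

  # Try each complexity level in order; `generate_fun` is expected to
  # return {:ok, config} or {:error, reason} for a given schema.
  def generate_with_escalation(requirements, generate_fun) do
    Enum.find_value(@levels, {:error, :all_levels_failed}, fn level ->
      schema = Pipeline.Schema.Progressive.generate_schema(level, requirements)

      case generate_fun.(schema) do
        {:ok, config} -> {:ok, config}
        {:error, _reason} -> nil
      end
    end)
  end
end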

2. Context-Aware Schema Generation

defmodule Pipeline.Schema.ContextAware do
  @moduledoc """
  Generate schemas based on user context and requirements.
  """
  
  def generate_from_context(context) do
    base = base_schema_for_domain(context.domain)
    
    base
    |> adjust_for_user_level(context.user_level)
    |> add_domain_constraints(context.domain)
    |> optimize_for_provider(context.llm_provider)
    |> add_security_constraints(context.security_level)
  end
  
  defp base_schema_for_domain(domain) do
    case domain do
      :data_processing ->
        Schemas.DataProcessing.base_schema()
      :content_generation ->
        Schemas.ContentGeneration.base_schema()
      :multi_agent ->
        Schemas.MultiAgent.base_schema()
      _ ->
        Schemas.Generic.base_schema()
    end
  end
  
  defp adjust_for_user_level(schema, level) do
    case level do
      :beginner ->
        schema
        |> hide_advanced_fields()
        |> add_more_examples()
        |> strengthen_validation()
        
      :expert ->
        schema
        |> expose_all_fields()
        |> relax_constraints()
        |> add_experimental_features()
        
      _ ->
        schema
    end
  end
end
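
For illustration, a context carrying the fields read above might look like this (the values are assumptions):

# Hypothetical context; the keys mirror what generate_from_context/1 reads.
context = %{
  domain: :data_processing,
  user_level: :beginner,
  llm_provider: :claude,
  security_level: :standard
}

schema = Pipeline.Schema.ContextAware.generate_from_context(context)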

3. Dynamic Schema Builder

defmodule Pipeline.Schema.DynamicBuilder do
  @moduledoc """
  Build schemas dynamically based on requirements.
  """
  
  def build(requirements) do
    # Builder steps live in the nested SchemaBuilder module, so the
    # calls must be qualified
    SchemaBuilder.new()
    |> SchemaBuilder.add_workflow_structure()
    |> SchemaBuilder.add_step_types(requirements.allowed_steps)
    |> SchemaBuilder.add_constraints(requirements.constraints)
    |> SchemaBuilder.add_conditional_fields(requirements.conditionals)
    |> SchemaBuilder.optimize_for_llm(requirements.llm_provider)
    |> SchemaBuilder.finalize()
  end
  
  defmodule SchemaBuilder do
    defstruct schema: %{}, 
              definitions: %{}, 
              constraints: [],
              metadata: %{}
    
    def new do
      %__MODULE__{
        schema: %{
          "$schema" => "http://json-schema.org/draft-07/schema#",
          "type" => "object"
        }
      }
    end
    
    def add_workflow_structure(builder) do
      update_in(builder.schema["properties"], fn _ ->
        %{
          "workflow" => %{
            "type" => "object",
            "properties" => %{},
            "required" => []
          }
        }
      end)
    end
    
    def add_step_types(builder, allowed_types) do
      # Dynamically build step schemas based on allowed types
      step_schemas = Enum.map(allowed_types, fn type ->
        {type, generate_step_schema_for_type(type)}
      end)
      
      # Add to definitions
      update_in(builder.definitions, fn defs ->
        Map.merge(defs, Map.new(step_schemas))
      end)
    end
    
    def finalize(builder) do
      builder.schema
      |> Map.put("definitions", builder.definitions)
      |> apply_global_constraints(builder.constraints)
    end
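
    # Sketches of the helpers referenced above (assumptions; a real
    # implementation would derive step fields from the step registry)
    defp generate_step_schema_for_type(type) do
      %{
        "type" => "object",
        "properties" => %{
          "name" => %{"type" => "string"},
          "type" => %{"type" => "string", "const" => to_string(type)},
          "prompt" => %{"type" => "string"}
        },
        "required" => ["name", "type", "prompt"]
      }
    end

    defp apply_global_constraints(schema, constraints) do
      # Each constraint is assumed to be a fun that rewrites the schema
      Enum.reduce(constraints, schema, fn constraint, acc -> constraint.(acc) end)
    end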
  end
end

Schema Optimization Techniques

1. Token Optimization

defmodule Pipeline.Schema.TokenOptimizer do
  @moduledoc """
  Optimize schemas to reduce token usage in LLM interactions.
  """
  
  def optimize_for_tokens(schema, target_reduction \\ 0.3) do
    schema
    |> shorten_field_names()
    |> remove_redundant_descriptions()
    |> use_references_for_repetition()
    |> compress_enums()
    |> measure_reduction(schema, target_reduction)
  end
  
  defp shorten_field_names(schema) do
    # Map verbose names to concise ones
    name_map = %{
      "workflow" => "wf",
      "steps" => "s",
      "properties" => "props",
      "required" => "req"
    }
    
    transform_keys(schema, name_map)
  end
  
  defp use_references_for_repetition(schema) do
    # Find repeated structures and use $ref
    repeated = find_repeated_structures(schema)
    
    Enum.reduce(repeated, schema, fn {structure, locations}, acc ->
      ref_name = generate_ref_name(structure)
      acc
      |> add_to_definitions(ref_name, structure)
      |> replace_with_refs(locations, ref_name)
    end)
  end
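
  # Sketch of the transform_keys/2 helper used above (an assumption, not an
  # existing library function): recursively rename keys via name_map.
  # NB: shortened names must be mapped back to the canonical field names
  # before validating the LLM's output against the real pipeline schema.
  defp transform_keys(%{} = map, name_map) do
    Map.new(map, fn {key, value} ->
      {Map.get(name_map, key, key), transform_keys(value, name_map)}
    end)
  end

  defp transform_keys(list, name_map) when is_list(list) do
    Enum.map(list, &transform_keys(&1, name_map))
  end

  defp transform_keys(other, _name_map), do: other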
end

2. Constraint Optimization

defmodule Pipeline.Schema.ConstraintOptimizer do
  @moduledoc """
  Optimize constraints for better LLM compliance.
  """
  
  def optimize_constraints(schema, llm_provider) do
    schema
    |> adjust_constraint_strictness(llm_provider)
    |> add_provider_hints(llm_provider)
    |> reorder_for_importance()
  end
  
  defp adjust_constraint_strictness(schema, :claude) do
    # Claude handles complex constraints well
    schema
  end
  
  defp adjust_constraint_strictness(schema, :gemini) do
    # Gemini prefers simpler constraints
    schema
    |> simplify_complex_patterns()
    |> flatten_deep_nesting()
  end
  
  defp add_provider_hints(schema, :claude) do
    # Add Claude-specific hints
    schema
    |> add_to_descriptions("Use exactly this format: ")
    |> add_inline_examples()
  end
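
  # Sketch of simplify_complex_patterns/1 (an assumption): demote regex
  # pattern constraints into description hints, which models tend to
  # follow more reliably than they satisfy raw regexes
  defp simplify_complex_patterns(%{"pattern" => pattern} = schema) do
    hint = "Value must match: #{pattern}"

    schema
    |> Map.delete("pattern")
    |> Map.update("description", hint, &(&1 <> " " <> hint))
    |> simplify_complex_patterns()
  end

  defp simplify_complex_patterns(%{} = schema) do
    Map.new(schema, fn {key, value} -> {key, simplify_complex_patterns(value)} end)
  end

  defp simplify_complex_patterns(list) when is_list(list) do
    Enum.map(list, &simplify_complex_patterns/1)
  end

  defp simplify_complex_patterns(value), do: value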
end

3. Validation Feedback Integration

defmodule Pipeline.Schema.FeedbackIntegration do
  @moduledoc """
  Integrate validation feedback to improve schemas.
  """
  
  def improve_schema_from_feedback(schema, validation_history) do
    common_errors = analyze_common_errors(validation_history)
    
    schema
    |> strengthen_weak_points(common_errors)
    |> add_clarifications(common_errors)
    |> adjust_constraints(common_errors)
  end
  
  defp strengthen_weak_points(schema, errors) do
    # Identify fields that frequently have errors
    problem_fields = identify_problem_fields(errors)
    
    Enum.reduce(problem_fields, schema, fn {field_path, error_rate}, acc ->
      if error_rate > 0.3 do
        strengthen_field_validation(acc, field_path)
      else
        acc
      end
    end)
  end
  
  defp add_clarifications(schema, errors) do
    # Add better descriptions where confusion occurs
    confusion_points = find_confusion_patterns(errors)
    
    Enum.reduce(confusion_points, schema, fn {field, clarification}, acc ->
      update_field_description(acc, field, clarification)
    end)
  end
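
  # Sketch of analyze_common_errors/1 (the shape of validation_history is
  # an assumption): compute a per-field-path error rate across runs
  defp analyze_common_errors(validation_history) do
    total = max(length(validation_history), 1)

    validation_history
    |> Enum.flat_map(fn run -> run.errors end)
    |> Enum.frequencies_by(fn error -> error.field_path end)
    |> Map.new(fn {path, count} -> {path, count / total} end)
  end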
end

Schema Testing and Validation

1. Schema Test Suite

defmodule Pipeline.Schema.Testing do
  @moduledoc """
  Test generated schemas for effectiveness.
  """
  
  def test_schema_with_llm(schema, test_cases, llm_provider) do
    results = Enum.map(test_cases, fn test_case ->
      result = generate_with_schema(llm_provider, schema, test_case.prompt)
      
      %{
        test_case: test_case,
        generated: result,
        valid: validate_against_schema(result, schema),
        matches_intent: matches_test_intent?(result, test_case)
      }
    end)
    
    %{
      success_rate: calculate_success_rate(results),
      common_failures: analyze_failures(results),
      recommendations: generate_recommendations(results)
    }
  end
  
  def generate_test_cases(_domain) do
    [
      %{
        name: "minimal_pipeline",
        prompt: "Create a simple pipeline that processes data",
        expected_steps: ["load", "process", "save"]
      },
      %{
        name: "complex_pipeline",
        prompt: "Create a pipeline with conditional logic and error handling",
        expected_features: [:conditionals, :error_handling]
      }
    ]
  end
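
  # Sketch of validate_against_schema/2; assumes the ex_json_schema
  # package is available as a dependency
  defp validate_against_schema(result, schema) do
    resolved = ExJsonSchema.Schema.resolve(schema)

    case ExJsonSchema.Validator.validate(resolved, result) do
      :ok -> true
      {:error, _errors} -> false
    end
  end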
end

2. Schema Evolution

defmodule Pipeline.Schema.Evolution do
  @moduledoc """
  Evolve schemas based on usage patterns.
  """
  
  def evolve_schema(current_schema, usage_data) do
    %{
      version: increment_version(current_schema),
      schema: apply_evolution(current_schema, usage_data),
      changelog: generate_changelog(current_schema, usage_data),
      migration: generate_migration_guide(current_schema)
    }
  end
  
  defp apply_evolution(schema, usage_data) do
    schema
    |> add_frequently_used_patterns(usage_data)
    |> remove_unused_features(usage_data)
    |> optimize_based_on_errors(usage_data)
    |> modernize_syntax()
  end
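
  # Sketch of increment_version/1 (assumes the schema carries a semver
  # string under "version"; bumps the minor component)
  defp increment_version(%{"version" => version}) do
    [major, minor, _patch] =
      version |> String.split(".") |> Enum.map(&String.to_integer/1)

    "#{major}.#{minor + 1}.0"
  end

  defp increment_version(_schema), do: "1.0.0"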
end

Best Practices

1. Schema Design Principles

  • Start simple, add complexity gradually
  • Use clear, unambiguous field names
  • Provide examples for complex structures
  • Test with multiple LLM providers

2. Provider-Specific Tips

  • Claude: Use detailed descriptions and examples
  • Gemini: Keep schemas flat and simple
  • OpenAI: Leverage function calling for best results

3. Maintenance Guidelines

  • Version schemas for backward compatibility
  • Track validation success rates
  • Update based on LLM provider updates
  • Document schema changes clearly

Conclusion

Effective JSON Schema generation for LLM constraints requires understanding both the capabilities of different LLM providers and the specific requirements of pipeline generation. By using progressive complexity, context-aware generation, and continuous optimization based on feedback, we can create schemas that consistently produce valid, high-quality pipeline configurations from LLM outputs.