The Computing Series

How It Works

State Machine Implementation

// State and configuration of one circuit breaker guarding one dependency.
struct CircuitBreaker:
    state: "CLOSED" | "OPEN" | "HALF_OPEN"
    failure_count: int            // failures counted while CLOSED; reset to 0 on success
    failure_threshold: int        // N failures → trip to OPEN
    success_count: int            // successes counted since entering HALF_OPEN
    success_threshold: int        // N successes in HALF_OPEN → CLOSED
    open_at: timestamp            // when did circuit open?
    reset_timeout: duration       // how long before trying HALF_OPEN?
    name: string                  // dependency name; the transition functions put it in logs and metrics

// Run dependency_fn under the protection of circuit breaker cb.
//
// Returns whatever dependency_fn returns.
// Raises CircuitOpenError, without calling dependency_fn, while the breaker is
// OPEN and reset_timeout has not yet elapsed since cb.open_at.
// Any exception thrown by dependency_fn is recorded on the breaker and then
// re-raised unchanged.
function call_through(cb, dependency_fn):
    if cb.state == "OPEN":
        // Check if it's time to try HALF_OPEN
        if current_time() - cb.open_at >= cb.reset_timeout:
            transition_to_half_open(cb)
        else:
            raise CircuitOpenError("dependency unavailable — fast-fail")

    if cb.state == "HALF_OPEN":
        // Probe call: its outcome moves the breaker towards CLOSED or sends it
        // straight back to OPEN.
        // NOTE(review): nothing here limits HALF_OPEN to a single in-flight
        // probe — every caller that arrives while the state is HALF_OPEN is
        // let through. Add a guard if callers can run concurrently.
        try:
            result = dependency_fn()
            on_half_open_success(cb)
            return result
        except Exception:
            on_half_open_failure(cb)
            raise

    // State: CLOSED — pass through normally
    try:
        result = dependency_fn()
        on_success(cb)
        return result
    except Exception:
        on_failure(cb)
        raise

// Record one failed call made while the breaker is CLOSED and trip the
// breaker once the failure count reaches the configured threshold.
function on_failure(cb):
    cb.failure_count = cb.failure_count + 1
    if cb.failure_count < cb.failure_threshold:
        return    // still below the limit: stay CLOSED
    transition_to_open(cb)

// Record a successful call made while the breaker is CLOSED.
// A success wipes the failure counter, so only failures that are not
// interrupted by a success accumulate towards the threshold.
function on_success(cb):
    cb.failure_count = 0

// Trip the breaker: mark it OPEN, remember when that happened (call_through
// compares this timestamp against reset_timeout), then log and emit a metric
// tagged with the dependency name.
function transition_to_open(cb):
    cb.state = "OPEN"
    tripped_at = current_time()
    cb.open_at = tripped_at

    dependency = cb.name
    log("Circuit breaker OPENED for dependency: " + dependency)
    emit_metric("circuit_breaker.opened", dependency)

// Move the breaker into HALF_OPEN so probe calls can test the dependency.
// The probe success counter restarts from zero on every entry.
function transition_to_half_open(cb):
    cb.success_count = 0
    cb.state = "HALF_OPEN"
    message = "Circuit breaker transitioning to HALF_OPEN: " + cb.name
    log(message)

// Record a successful probe made in HALF_OPEN. Once enough probes have
// succeeded, close the breaker, clear the failure counter, and report the
// recovery through the log and a metric.
function on_half_open_success(cb):
    cb.success_count = cb.success_count + 1
    if cb.success_count < cb.success_threshold:
        return    // more successful probes are needed before closing

    cb.failure_count = 0
    cb.state = "CLOSED"
    log("Circuit breaker CLOSED: dependency recovered")
    emit_metric("circuit_breaker.closed", cb.name)

// Record a failed probe made in HALF_OPEN: a single failure sends the breaker
// back to OPEN and restarts the reset_timeout wait from the current time.
function on_half_open_failure(cb):
    cb.state = "OPEN"
    reopened_at = current_time()
    cb.open_at = reopened_at
    log("Circuit breaker returned to OPEN: probe failed")

Sliding Window Failure Rate

// Count-based threshold is naive: 10 failures in 10 requests = 100% failure rate
// But 10 failures in 10,000 requests = 0.1% failure rate
// A count threshold does not distinguish between these cases

// Better: use a sliding window to measure failure rate, not count

// Circuit breaker that trips on the failure rate over the most recent calls
// instead of on a raw failure count.
struct SlidingWindowCB:
    window: RingBuffer[bool]   // true = success, false = failure
    window_size: int           // e.g., 100 most recent calls
    failure_rate_threshold: float   // e.g., 0.5 = 50% failure rate
    minimum_calls: int         // don't trip on first 5 calls of a cold start
    // Fields below are read or written by on_call_result / transition_to_open
    // when they operate on this struct.
    state: "CLOSED" | "OPEN" | "HALF_OPEN"
    open_at: timestamp         // set when the breaker trips to OPEN
    name: string               // dependency name used in logs and metrics

// Return the fraction of recorded calls in cb.window that failed, as a value
// between 0.0 and 1.0.
//
// Returns 0.0 when the window holds fewer than cb.minimum_calls entries
// (insufficient data) and also when the window is empty, so a minimum_calls
// of 0 can never cause a division by zero.
function measure_failure_rate(cb):
    total = len(cb.window)
    if total == 0 or total < cb.minimum_calls:
        return 0.0    // insufficient data
    failures = count(cb.window, where=false)
    return failures / total    // real-valued division, not integer division

// Record the outcome of one call and trip the breaker when the failure rate
// over the sliding window reaches cb.failure_rate_threshold.
//
// success: true if the call succeeded, false if it failed.
function on_call_result(cb, success: bool):
    cb.window.push(success)  // replaces oldest entry if full

    if cb.state != "CLOSED":
        return    // only a CLOSED breaker can trip

    if measure_failure_rate(cb) >= cb.failure_rate_threshold:
        transition_to_open(cb)
        // Drop the samples that caused the trip. Otherwise they are still in
        // the window when the breaker closes again, and the first call after
        // recovery would trip it immediately.
        // NOTE(review): assumes RingBuffer offers a clear() operation — confirm.
        cb.window.clear()

Circuit Breaker per Dependency

// One circuit breaker per dependency — not one global breaker
// A slow payments service should not affect the recommendations service

// Client-side registry that keeps a separate circuit breaker for every
// dependency it calls.
struct ServiceClient:
    // Keyed by dependency name; call_dependency looks entries up here and
    // creates them on first use.
    circuit_breakers: HashMap[dependency_name -> CircuitBreaker]

// Call fn through the circuit breaker that belongs to dependency_name,
// creating that breaker on first use.
//
// Returns the result of fn. Raises whatever call_through raises, including
// CircuitOpenError while the breaker for this dependency is open.
function call_dependency(client, dependency_name, fn):
    cb = client.circuit_breakers.get_or_create(
        dependency_name,
        // The breaker carries the dependency name because the transition
        // functions read cb.name for their log lines and metrics.
        // NOTE(review): success_threshold and the initial state are not set
        // here and rely on the struct's defaults — confirm they are defined.
        CircuitBreaker(name=dependency_name, failure_threshold=5, reset_timeout=30s)
    )
    return call_through(cb, fn)

// Example: payment service failing should not affect user profile reads
// Example: payment service failing should not affect user profile reads.
//
// Calls three dependencies, each behind its own circuit breaker. An open
// breaker on the user or payment service aborts checkout with CheckoutError;
// any failure of the recommendation service only drops the upsells.
//
// NOTE(review): call_dependency is declared as (client, dependency_name, fn),
// but the calls below omit the client argument — presumably an ambient client
// is intended; confirm against the real call site.
function checkout(user_id, cart):
    // These calls have independent circuit breakers
    try:
        user_profile = call_dependency("user_service", () -> user_service.get(user_id))
    except CircuitOpenError:
        raise CheckoutError("user service unavailable")

    try:
        payment = call_dependency("payment_service", () -> payment_service.charge(cart.total))
    except CircuitOpenError:
        raise CheckoutError("payment service unavailable")

    // Recommendations failure should not block checkout
    try:
        upsells = call_dependency("recommendation_service",
                                  () -> recommendation_service.get(user_id))
    except CircuitOpenError:
        upsells = []   // graceful degradation: skip upsells if circuit is open
    except Exception:
        // The breaker may still be CLOSED while individual calls fail. The
        // payment has already been charged at this point, so degrade the same
        // way instead of aborting the checkout.
        upsells = []

Bulkhead Complement

// Circuit breaker: detects failure and stops sending requests (time-based isolation)
// Bulkhead: limits the resources any one dependency can consume (resource isolation)
// Together: comprehensive protection against dependency failures

// Circuit breaker alone: a slow dependency keeps threads waiting until circuit opens
// During the N failures needed to trip the breaker, threads are held
// Bulkhead limits how many threads are held simultaneously

// Pairs a circuit breaker with a concurrency limit for the same dependency.
struct BulkheadedCircuitBreaker:
    // Decides whether calls are attempted at all.
    circuit_breaker: CircuitBreaker
    // Bulkhead: limits concurrent calls (e.g., max 10 concurrent).
    semaphore: Semaphore

// Call fn behind both a bulkhead (concurrency limit) and a circuit breaker.
//
// Returns the result of fn.
// Raises CircuitOpenError while the breaker is OPEN and its reset_timeout has
// not elapsed, BulkheadFullError when no bulkhead slot frees up within 10ms,
// and otherwise whatever call_through raises.
function call_with_bulkhead_and_cb(bcb, fn):
    cb = bcb.circuit_breaker

    // First check circuit breaker (cheap — no thread hold).
    // Fast-fail only while the reset timeout is still running. Once it has
    // elapsed the call must reach call_through, which is the only place that
    // moves the breaker from OPEN to HALF_OPEN; an unconditional OPEN check
    // here would keep the breaker open forever.
    if cb.state == "OPEN" and current_time() - cb.open_at < cb.reset_timeout:
        raise CircuitOpenError()

    // Try to acquire a slot in the bulkhead
    if not bcb.semaphore.try_acquire(timeout=10ms):
        raise BulkheadFullError("too many concurrent requests")

    try:
        // call_through is a free function taking the breaker as its first argument
        return call_through(cb, fn)
    finally:
        // Always give the slot back, whether fn succeeded or raised
        bcb.semaphore.release()

Read in the book →