How It Works
State Machine Implementation
// State for one circuit breaker guarding one dependency.
// CLOSED: calls pass through and failures are counted.
// OPEN: calls fast-fail until reset_timeout has elapsed since open_at.
// HALF_OPEN: calls pass through as probes; successes are counted toward closing.
struct CircuitBreaker:
    name: string               // dependency identifier; read by the transition log/metric calls
    state: "CLOSED" | "OPEN" | "HALF_OPEN"
    failure_count: int         // failures seen in CLOSED since the last success
    failure_threshold: int     // N failures → trip to OPEN
    success_count: int         // successes seen since entering HALF_OPEN
    success_threshold: int     // N successes in HALF_OPEN → CLOSED
    open_at: timestamp         // when did circuit open?
    reset_timeout: duration    // how long before trying HALF_OPEN?
// Invoke dependency_fn under the protection of circuit breaker cb.
//   OPEN      → raise CircuitOpenError until reset_timeout has elapsed since
//               cb.open_at, then move to HALF_OPEN and let this call probe.
//   HALF_OPEN → run the call as a probe; report the outcome to the
//               half-open handlers.
//   CLOSED    → run the call; report the outcome to the normal handlers.
// Returns whatever dependency_fn returns; re-raises whatever it raises.
function call_through(cb, dependency_fn):
    if cb.state == "OPEN":
        still_cooling_down = current_time() - cb.open_at < cb.reset_timeout
        if still_cooling_down:
            raise CircuitOpenError("dependency unavailable — fast-fail")
        transition_to_half_open(cb)

    // Decide which pair of handlers applies before making the call.
    // NOTE(review): nothing here limits HALF_OPEN to a single in-flight
    // probe; every caller arriving while HALF_OPEN passes through.
    probing = (cb.state == "HALF_OPEN")

    try:
        outcome = dependency_fn()
        if probing:
            on_half_open_success(cb)
        else:
            on_success(cb)
        return outcome
    except Exception:
        if probing:
            on_half_open_failure(cb)
        else:
            on_failure(cb)
        raise
// Record one failed call while CLOSED; trip the breaker once the
// failure count reaches the configured threshold.
function on_failure(cb):
    cb.failure_count = cb.failure_count + 1
    threshold_reached = cb.failure_count >= cb.failure_threshold
    if threshold_reached:
        transition_to_open(cb)
// Record one successful call while CLOSED.
// Any success clears the failure counter.
function on_success(cb):
    cb.failure_count = 0
// Trip the breaker: mark it OPEN, stamp the time it opened, then log the
// transition and emit the "opened" metric tagged with the dependency name.
function transition_to_open(cb):
    cb.state = "OPEN"
    opened_at = current_time()
    cb.open_at = opened_at
    message = "Circuit breaker OPENED for dependency: " + cb.name
    log(message)
    emit_metric("circuit_breaker.opened", cb.name)
// Enter the probing state: mark the breaker HALF_OPEN and restart the
// success counter so recovery is measured from this point.
function transition_to_half_open(cb):
    cb.state = "HALF_OPEN"
    cb.success_count = 0
    message = "Circuit breaker transitioning to HALF_OPEN: " + cb.name
    log(message)
// Record one successful probe while HALF_OPEN. Once enough probes have
// succeeded, close the breaker, clear the failure counter, and report it.
function on_half_open_success(cb):
    cb.success_count = cb.success_count + 1

    // Not enough successful probes yet — stay HALF_OPEN.
    if cb.success_count < cb.success_threshold:
        return

    cb.state = "CLOSED"
    cb.failure_count = 0
    log("Circuit breaker CLOSED: dependency recovered")
    emit_metric("circuit_breaker.closed", cb.name)
// A probe failed while HALF_OPEN: go straight back to OPEN and restart
// the reset timer from now.
function on_half_open_failure(cb):
    cb.state = "OPEN"
    reopened_at = current_time()
    cb.open_at = reopened_at
    log("Circuit breaker returned to OPEN: probe failed")
Sliding Window Failure Rate
// Count-based threshold is naive: 10 failures in 10 requests = 100% failure rate
// But 10 failures in 10,000 requests = 0.1% failure rate
// A count threshold does not distinguish between these cases
// Better: use a sliding window to measure failure rate, not count
// Circuit breaker that trips on failure *rate* over the most recent calls
// rather than on a raw failure count.
struct SlidingWindowCB:
    name: string                     // dependency identifier; read when the trip is logged
    state: "CLOSED" | "OPEN" | "HALF_OPEN"   // read by on_call_result before tripping
    open_at: timestamp               // written when the breaker trips to OPEN
    window: RingBuffer[bool]         // true = success, false = failure
    window_size: int                 // e.g., 100 most recent calls
    failure_rate_threshold: float    // e.g., 0.5 = 50% failure rate
    minimum_calls: int               // don't trip on first 5 calls of a cold start
// Fraction of recorded calls in cb.window that failed, in [0.0, 1.0].
// Returns 0.0 when there is not enough data to judge: fewer than
// cb.minimum_calls samples, or no samples at all.
function measure_failure_rate(cb):
    total = len(cb.window)

    // The explicit empty check keeps the division below safe even when
    // minimum_calls is configured as 0 (or negative).
    if total == 0 or total < cb.minimum_calls:
        return 0.0 // insufficient data

    failures = count(cb.window, where=false)

    // Force floating-point division so the rate is not truncated to 0 or 1
    // in languages where int / int is integer division.
    return float(failures) / total
// Record the outcome of one call in the sliding window and trip the
// breaker if the failure rate has reached the threshold while CLOSED.
function on_call_result(cb, success: bool):
    cb.window.push(success) // replaces oldest entry if full

    rate_exceeded = measure_failure_rate(cb) >= cb.failure_rate_threshold
    if rate_exceeded and cb.state == "CLOSED":
        transition_to_open(cb)
Circuit Breaker per Dependency
// One circuit breaker per dependency — not one global breaker
// A slow payments service should not affect the recommendations service
// Client-side registry of circuit breakers, keyed by dependency name, so
// each dependency has its own breaker.
struct ServiceClient:
    circuit_breakers: HashMap[dependency_name -> CircuitBreaker]
// Run fn through the circuit breaker that belongs to dependency_name,
// creating that breaker on first use.
// Returns fn's result; raises CircuitOpenError when the breaker is
// fast-failing, or whatever fn itself raises.
function call_dependency(client, dependency_name, fn):
    cb = client.circuit_breakers.get_or_create(
        dependency_name,
        CircuitBreaker(
            name=dependency_name,   // transition logs and metrics read cb.name
            failure_threshold=5,
            success_threshold=1,    // compared against success_count in HALF_OPEN; one good probe closes
            reset_timeout=30s
        )
    )
    return call_through(cb, fn)
// Example: payment service failing should not affect user profile reads
// Example checkout flow: user profile and payment are required steps,
// recommendations are optional and degrade to an empty list.
// Raises CheckoutError when a required dependency's circuit is open.
// NOTE(review): call_dependency is declared as (client, dependency_name, fn)
// but is invoked here without a client — confirm where the ServiceClient
// instance is meant to come from.
function checkout(user_id, cart):
    // These calls have independent circuit breakers
    try:
        user_profile = call_dependency("user_service", () -> user_service.get(user_id))
    except CircuitOpenError:
        raise CheckoutError("user service unavailable")

    try:
        payment = call_dependency("payment_service", () -> payment_service.charge(cart.total))
    except CircuitOpenError:
        raise CheckoutError("payment service unavailable")

    // Recommendations failure should not block checkout. This runs after
    // the payment has been charged, so *any* failure — not only an open
    // circuit — must degrade gracefully instead of aborting the checkout.
    try:
        upsells = call_dependency("recommendation_service",
            () -> recommendation_service.get(user_id))
    except CircuitOpenError:
        upsells = [] // graceful degradation: skip upsells if circuit is open
    except Exception:
        upsells = [] // dependency failed while the circuit was still closed
Bulkhead Complement
// Circuit breaker: detects failure and stops sending requests (time-based isolation)
// Bulkhead: limits the resources any one dependency can consume (resource isolation)
// Together: comprehensive protection against dependency failures
// Circuit breaker alone: a slow dependency keeps threads waiting until circuit opens
// During the N failures needed to trip the breaker, threads are held
// Bulkhead limits how many threads are held simultaneously
// Pairs a circuit breaker with a concurrency limit for one dependency.
struct BulkheadedCircuitBreaker:
    circuit_breaker: CircuitBreaker   // failure detection and fast-fail
    semaphore: Semaphore              // limits concurrent calls (e.g., max 10 concurrent)
// Run fn behind both protections: the circuit breaker (stop calling a
// failing dependency) and the bulkhead semaphore (cap concurrent calls).
// Raises CircuitOpenError while the breaker is open and still cooling down,
// BulkheadFullError when no slot frees up within the acquire timeout,
// or whatever call_through / fn raises.
function call_with_bulkhead_and_cb(bcb, fn):
    cb = bcb.circuit_breaker

    // First check circuit breaker (cheap — no thread hold).
    // Fast-fail only while the reset timeout has NOT yet elapsed. Once it
    // has, the call must reach call_through, which is where the breaker
    // moves OPEN → HALF_OPEN; rejecting every OPEN call here would leave
    // the breaker open forever.
    if cb.state == "OPEN" and current_time() - cb.open_at < cb.reset_timeout:
        raise CircuitOpenError()

    // Try to acquire a slot in the bulkhead
    if not bcb.semaphore.try_acquire(timeout=10ms):
        raise BulkheadFullError("too many concurrent requests")

    try:
        // call_through is a free function taking the breaker as its first
        // argument, not a method on CircuitBreaker.
        return call_through(cb, fn)
    finally:
        // Always give the slot back, on success or on failure.
        bcb.semaphore.release()