# Cost Optimization

## Introduction to Cost Management

Managing costs is critical for sustainable agent systems. This section covers strategies to optimize spending while maintaining performance.
## Cost Drivers

**API Costs:**
- LLM API calls (tokens)
- Embedding generation
- Image generation
- Audio processing

**Infrastructure:**
- Compute resources
- Storage
- Network bandwidth
- Database operations

**Third-Party Services:**
- Search APIs
- Data providers
- Monitoring tools
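To reason about these drivers together, it can help to roll them into a single per-request cost model. Below is a minimal sketch; the categories follow the list above, but all rates are illustrative placeholders, not real prices.

```python
from dataclasses import dataclass

@dataclass
class RequestCostModel:
    """Toy per-request cost model combining the driver categories above.
    All rates are illustrative assumptions -- substitute your own."""
    api_cost_per_1k_tokens: float = 0.01        # LLM tokens (assumed blended rate)
    embedding_cost_per_1k: float = 0.0001       # embedding generation
    infra_cost_per_request: float = 0.0005      # compute/storage/network, amortized
    third_party_cost_per_request: float = 0.001 # search APIs, data providers

    def cost(self, tokens: int, embeddings: int = 0) -> float:
        return (
            (tokens / 1000) * self.api_cost_per_1k_tokens
            + (embeddings / 1000) * self.embedding_cost_per_1k
            + self.infra_cost_per_request
            + self.third_party_cost_per_request
        )

model = RequestCostModel()
print(f"Estimated cost per request: ${model.cost(tokens=1500, embeddings=512):.4f}")
```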
## Token Usage Optimization

### Token Counting and Budgeting
```python
import tiktoken
from typing import Dict, List

class TokenOptimizer:
    """Optimize token usage"""

    def __init__(self, model: str = "gpt-4"):
        self.encoding = tiktoken.encoding_for_model(model)
        self.model = model
        # Illustrative per-1K-token rates; check your provider's current pricing
        self.token_costs = {
            "gpt-4": {"input": 0.03, "output": 0.06},
            "gpt-4-turbo": {"input": 0.01, "output": 0.03},
            "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015}
        }

    def count_tokens(self, text: str) -> int:
        """Count tokens in text"""
        return len(self.encoding.encode(text))

    def estimate_cost(self, input_text: str, output_tokens: int) -> float:
        """Estimate API call cost"""
        input_tokens = self.count_tokens(input_text)
        costs = self.token_costs.get(self.model, self.token_costs["gpt-4"])
        input_cost = (input_tokens / 1000) * costs["input"]
        output_cost = (output_tokens / 1000) * costs["output"]
        return input_cost + output_cost

    def optimize_prompt(self, prompt: str, max_tokens: int) -> str:
        """Truncate a prompt to fit a token budget"""
        tokens = self.count_tokens(prompt)
        if tokens <= max_tokens:
            return prompt
        # Drop trailing words until the prompt fits the budget
        words = prompt.split()
        while tokens > max_tokens and words:
            words.pop()
            prompt = " ".join(words)
            tokens = self.count_tokens(prompt)
        return prompt

    def compress_context(self, messages: List[Dict], max_tokens: int) -> List[Dict]:
        """Compress conversation context to fit a token budget"""
        total_tokens = sum(self.count_tokens(m["content"]) for m in messages)
        if total_tokens <= max_tokens:
            return messages
        # Always keep the system message, then add the most recent
        # messages while they still fit the remaining budget
        compressed = [messages[0]]  # System message
        budget = max_tokens - self.count_tokens(messages[0]["content"])
        for msg in reversed(messages[1:]):
            msg_tokens = self.count_tokens(msg["content"])
            if msg_tokens <= budget:
                compressed.insert(1, msg)
                budget -= msg_tokens
            else:
                break
        return compressed

# Usage
optimizer = TokenOptimizer("gpt-4")
prompt = "This is a long prompt..." * 100
tokens = optimizer.count_tokens(prompt)
cost = optimizer.estimate_cost(prompt, 500)
print(f"Tokens: {tokens}, Estimated cost: ${cost:.4f}")

# Optimize
optimized = optimizer.optimize_prompt(prompt, max_tokens=1000)
```
## Caching Strategies
```python
import hashlib
from typing import Dict, Optional

class ResponseCache:
    """Cache LLM responses"""

    def __init__(self, max_size: int = 1000):
        self.cache = {}
        self.max_size = max_size
        self.hits = 0
        self.misses = 0

    def get_cache_key(self, prompt: str, model: str, temperature: float) -> str:
        """Generate cache key"""
        key_data = f"{prompt}:{model}:{temperature}"
        return hashlib.md5(key_data.encode()).hexdigest()

    def get(self, prompt: str, model: str, temperature: float) -> Optional[str]:
        """Get cached response"""
        key = self.get_cache_key(prompt, model, temperature)
        if key in self.cache:
            self.hits += 1
            return self.cache[key]
        self.misses += 1
        return None

    def set(self, prompt: str, model: str, temperature: float, response: str):
        """Cache response"""
        key = self.get_cache_key(prompt, model, temperature)
        # Evict the oldest entry (FIFO) if full; dicts preserve insertion order
        if len(self.cache) >= self.max_size:
            oldest_key = next(iter(self.cache))
            del self.cache[oldest_key]
        self.cache[key] = response

    def get_stats(self) -> Dict:
        """Get cache statistics"""
        total = self.hits + self.misses
        hit_rate = self.hits / total if total > 0 else 0
        return {
            "hits": self.hits,
            "misses": self.misses,
            "hit_rate": hit_rate,
            "size": len(self.cache)
        }
```
```python
# Cached Agent
import openai

class CachedAgent:
    """Agent with response caching"""

    def __init__(self):
        self.client = openai.OpenAI()
        self.cache = ResponseCache()

    def generate(self, prompt: str, model: str = "gpt-4", temperature: float = 0.7) -> str:
        """Generate with caching"""
        # Check cache first
        cached = self.cache.get(prompt, model, temperature)
        if cached is not None:
            print("✓ Cache hit")
            return cached
        # Cache miss: call the API
        print("✗ Cache miss - calling API")
        response = self.client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature
        )
        result = response.choices[0].message.content
        # Cache the result for future identical requests
        self.cache.set(prompt, model, temperature, result)
        return result

# Usage
agent = CachedAgent()

# First call - cache miss
response1 = agent.generate("What is AI?")

# Second call - cache hit
response2 = agent.generate("What is AI?")

# Stats
stats = agent.cache.get_stats()
print(f"Cache hit rate: {stats['hit_rate']:.1%}")
```
## Model Selection

### Cost-Performance Trade-offs
```python
class ModelSelector:
    """Select optimal model based on requirements"""

    def __init__(self):
        # Illustrative costs and 1-10 quality/speed ratings; tune for your use case
        self.models = {
            "gpt-4": {
                "cost_per_1k": 0.03,
                "quality": 10,
                "speed": 5
            },
            "gpt-4-turbo": {
                "cost_per_1k": 0.01,
                "quality": 9,
                "speed": 8
            },
            "gpt-3.5-turbo": {
                "cost_per_1k": 0.0005,
                "quality": 7,
                "speed": 10
            }
        }

    def select_model(self,
                     priority: str = "balanced",
                     complexity: str = "medium") -> str:
        """Select the best model for the given priority and task complexity"""
        if priority == "cost":
            return "gpt-3.5-turbo"
        elif priority == "quality":
            return "gpt-4"
        elif priority == "speed":
            return "gpt-3.5-turbo"
        else:  # balanced
            if complexity == "high":
                return "gpt-4-turbo"
            else:
                return "gpt-3.5-turbo"

    def estimate_monthly_cost(self,
                              requests_per_day: int,
                              avg_tokens: int,
                              model: str) -> float:
        """Estimate monthly cost"""
        cost_per_1k = self.models[model]["cost_per_1k"]
        daily_cost = (requests_per_day * avg_tokens / 1000) * cost_per_1k
        monthly_cost = daily_cost * 30
        return monthly_cost

# Usage
selector = ModelSelector()

# Select for simple task
model = selector.select_model(priority="cost", complexity="low")
print(f"Selected: {model}")

# Estimate costs
monthly = selector.estimate_monthly_cost(
    requests_per_day=10000,
    avg_tokens=500,
    model="gpt-3.5-turbo"
)
print(f"Estimated monthly cost: ${monthly:.2f}")
```
## Batch Processing

### Batch API Usage
```python
import openai
from typing import List

class BatchProcessor:
    """Process requests in batches"""

    def __init__(self, batch_size: int = 10):
        self.batch_size = batch_size
        self.client = openai.OpenAI()

    def process_batch(self, requests: List[str]) -> List[str]:
        """Process multiple requests efficiently"""
        results = []
        # Process in batches
        for i in range(0, len(requests), self.batch_size):
            batch = requests[i:i + self.batch_size]
            batch_results = self.process_single_batch(batch)
            results.extend(batch_results)
        return results

    def process_single_batch(self, batch: List[str]) -> List[str]:
        """Process a single batch"""
        # Combine into one prompt to amortize per-call overhead
        combined_prompt = "Process these requests:\n\n"
        for i, req in enumerate(batch, 1):
            combined_prompt += f"{i}. {req}\n"
        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": combined_prompt}]
        )
        # Parse results; naive line splitting is fragile, so consider
        # requesting structured (e.g. JSON) output in production
        result_text = response.choices[0].message.content
        results = result_text.split('\n')
        return results[:len(batch)]

# Usage
processor = BatchProcessor(batch_size=5)
requests = [f"Summarize topic {i}" for i in range(20)]
results = processor.process_batch(requests)
```
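For workloads that can tolerate delayed results, OpenAI also offers a dedicated Batch API that processes a JSONL file of requests asynchronously, typically at a discount over synchronous calls. A rough sketch of the submission flow is below; verify the file format and parameters against the current OpenAI documentation before relying on it.

```python
import json
import openai

client = openai.OpenAI()

# Write one JSONL line per request (format per OpenAI's Batch API docs)
with open("batch_input.jsonl", "w") as f:
    for i, prompt in enumerate(["Summarize topic 1", "Summarize topic 2"]):
        f.write(json.dumps({
            "custom_id": f"request-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-3.5-turbo",
                "messages": [{"role": "user", "content": prompt}]
            }
        }) + "\n")

# Upload the file and submit the batch job
batch_file = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)
print(batch.id, batch.status)  # poll until the batch completes, then download results
```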
## Resource Optimization

### Compute Optimization
```python
import psutil
from typing import Dict

class ResourceOptimizer:
    """Optimize compute resources"""

    def __init__(self):
        self.metrics = {
            "cpu_usage": [],
            "memory_usage": [],
            "response_times": []
        }

    def monitor_resources(self) -> Dict:
        """Sample current resource usage"""
        cpu = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory().percent
        self.metrics["cpu_usage"].append(cpu)
        self.metrics["memory_usage"].append(memory)
        return {"cpu": cpu, "memory": memory}

    def should_scale(self) -> Dict:
        """Determine if scaling is needed based on recent samples"""
        if not self.metrics["cpu_usage"]:
            return {"scale": False}
        # Average over the last 10 samples
        avg_cpu = sum(self.metrics["cpu_usage"][-10:]) / min(10, len(self.metrics["cpu_usage"]))
        avg_memory = sum(self.metrics["memory_usage"][-10:]) / min(10, len(self.metrics["memory_usage"]))
        scale_up = avg_cpu > 80 or avg_memory > 80
        scale_down = avg_cpu < 20 and avg_memory < 20
        return {
            "scale": scale_up or scale_down,
            "direction": "up" if scale_up else "down",
            "cpu": avg_cpu,
            "memory": avg_memory
        }

# Usage
optimizer = ResourceOptimizer()
resources = optimizer.monitor_resources()
scaling = optimizer.should_scale()
if scaling["scale"]:
    print(f"Scale {scaling['direction']}: CPU={scaling['cpu']:.1f}%, Memory={scaling['memory']:.1f}%")
```
## Cost Monitoring

### Real-Time Cost Tracking
```python
import time
from typing import Dict

class CostMonitor:
    """Monitor and track costs"""

    def __init__(self, budget: float = 1000.0):
        self.budget = budget
        self.costs = []
        self.alerts = []

    def record_cost(self, amount: float, service: str, metadata: Dict = None):
        """Record a cost entry and check it against the budget"""
        cost_entry = {
            "amount": amount,
            "service": service,
            "timestamp": time.time(),
            "metadata": metadata or {}
        }
        self.costs.append(cost_entry)
        # Check budget (critical first, so only one alert fires per call)
        total = self.get_total_cost()
        if total > self.budget:
            self.add_alert("critical", f"Budget exceeded: ${total:.2f}")
        elif total > self.budget * 0.8:
            self.add_alert("warning", f"80% of budget used: ${total:.2f}")

    def get_total_cost(self) -> float:
        """Get total cost"""
        return sum(c["amount"] for c in self.costs)

    def get_cost_by_service(self) -> Dict:
        """Get costs grouped by service"""
        by_service = {}
        for cost in self.costs:
            service = cost["service"]
            by_service[service] = by_service.get(service, 0) + cost["amount"]
        return by_service

    def add_alert(self, level: str, message: str):
        """Add cost alert"""
        alert = {
            "level": level,
            "message": message,
            "timestamp": time.time()
        }
        self.alerts.append(alert)
        print(f"🚨 {level.upper()}: {message}")

    def get_report(self) -> Dict:
        """Generate cost report"""
        total = self.get_total_cost()
        by_service = self.get_cost_by_service()
        return {
            "total_cost": total,
            "budget": self.budget,
            "remaining": self.budget - total,
            "utilization": (total / self.budget) * 100,
            "by_service": by_service,
            "alerts": self.alerts
        }

# Usage
monitor = CostMonitor(budget=100.0)

# Record costs
monitor.record_cost(15.50, "openai", {"model": "gpt-4"})
monitor.record_cost(2.30, "pinecone", {"operation": "query"})

# Get report
report = monitor.get_report()
print(f"Total: ${report['total_cost']:.2f}")
print(f"Budget utilization: {report['utilization']:.1f}%")
```
## Best Practices

- **Monitor costs**: Track spending in real time
- **Set budgets**: Enforce spending limits with alerts
- **Cache responses**: Avoid paying twice for identical requests
- **Optimize prompts**: Trim instructions and context to minimize token usage
- **Choose the right model**: Balance cost against quality for each task
- **Batch requests**: Process multiple items together to amortize overhead
- **Use cheaper models for simple tasks**: Reserve premium models for complex work
- **Implement rate limiting**: Prevent runaway costs (see the sketch below)
- **Audit regularly**: Review spending patterns and optimize
- **Alert on anomalies**: Detect unusual spending early
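As an example of the rate-limiting point above, here is a minimal sketch of a cost-aware limiter that rejects calls once estimated spend in a rolling window hits a ceiling; the threshold and window size are illustrative assumptions.

```python
import time

class SpendLimiter:
    """Reject calls once estimated spend in the current rolling window
    exceeds a ceiling. Threshold and window are illustrative."""

    def __init__(self, max_spend_per_hour: float = 10.0):
        self.max_spend = max_spend_per_hour
        self.window = 3600.0  # seconds
        self.entries = []     # (timestamp, cost) pairs

    def allow(self, estimated_cost: float) -> bool:
        now = time.time()
        # Drop entries that have aged out of the window
        self.entries = [(t, c) for t, c in self.entries if now - t < self.window]
        spent = sum(c for _, c in self.entries)
        if spent + estimated_cost > self.max_spend:
            return False  # caller should queue, downgrade the model, or reject
        self.entries.append((now, estimated_cost))
        return True

# Usage
limiter = SpendLimiter(max_spend_per_hour=5.0)
if limiter.allow(estimated_cost=0.02):
    pass  # proceed with the API call
```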
## Next Steps

Chapter 8 (Enterprise & Scale) is complete! You now understand architecture patterns, security & compliance, and cost optimization for production agent systems. With 8 of 10 modules done, Chapters 9 and 10 are next.