Keyboard shortcuts

Press or to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Implementation

Building the Autonomous Software Engineering Agent

Let’s build the complete system step by step.

Project Setup

# Create project structure
mkdir autonomous-se-agent
cd autonomous-se-agent

# Create directories
mkdir -p src/{agents,tools,memory,orchestration}
mkdir -p tests
mkdir -p data/{cache,feedback}

# Install dependencies
pip install openai chromadb gitpython docker pytest pylint black ast-grep-py

Core Implementation

1. Main Orchestrator

# src/orchestration/orchestrator.py
from typing import Dict, List
from dataclasses import dataclass
from enum import Enum
import openai

class TaskType(Enum):
    ANALYZE = "analyze"
    FIX = "fix"
    TEST = "test"
    REFACTOR = "refactor"
    REVIEW = "review"

@dataclass
class Task:
    type: TaskType
    target: str
    description: str
    context: Dict

class SoftwareEngineeringAgent:
    """Main orchestrator for autonomous SE agent"""
    
    def __init__(self):
        self.client = openai.OpenAI()
        self.analyzer = AnalyzerAgent()
        self.fixer = FixerAgent()
        self.tester = TesterAgent()
        self.memory = AgentMemory()
    
    def process_request(self, request: str, target_path: str) -> Dict:
        """Process user request"""
        
        # Parse intent
        intent = self.parse_intent(request)
        
        # Create plan
        plan = self.create_plan(intent, target_path)
        
        # Execute plan
        results = self.execute_plan(plan)
        
        # Store in memory
        self.memory.store_episode(request, plan, results)
        
        return results
    
    def parse_intent(self, request: str) -> Dict:
        """Parse user intent"""
        
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{
                "role": "system",
                "content": "Parse user intent. Return JSON with: task_type, target, requirements"
            }, {
                "role": "user",
                "content": request
            }],
            temperature=0.2
        )
        
        import json
        return json.loads(response.choices[0].message.content)
    
    def create_plan(self, intent: Dict, target_path: str) -> List[Task]:
        """Create execution plan"""
        
        tasks = []
        task_type = TaskType(intent["task_type"])
        
        if task_type == TaskType.FIX:
            # Fix requires: analyze -> fix -> test
            tasks.append(Task(TaskType.ANALYZE, target_path, "Analyze code", {}))
            tasks.append(Task(TaskType.FIX, target_path, intent["requirements"], {}))
            tasks.append(Task(TaskType.TEST, target_path, "Validate fix", {}))
        
        elif task_type == TaskType.REFACTOR:
            # Refactor requires: analyze -> refactor -> test
            tasks.append(Task(TaskType.ANALYZE, target_path, "Analyze code", {}))
            tasks.append(Task(TaskType.REFACTOR, target_path, intent["requirements"], {}))
            tasks.append(Task(TaskType.TEST, target_path, "Validate refactor", {}))
        
        else:
            tasks.append(Task(task_type, target_path, intent["requirements"], {}))
        
        return tasks
    
    def execute_plan(self, plan: List[Task]) -> Dict:
        """Execute task plan"""
        
        results = []
        context = {}
        
        for task in plan:
            task.context = context
            
            if task.type == TaskType.ANALYZE:
                result = self.analyzer.execute(task)
            elif task.type == TaskType.FIX:
                result = self.fixer.execute(task)
            elif task.type == TaskType.TEST:
                result = self.tester.execute(task)
            else:
                result = {"error": "Unknown task type"}
            
            results.append(result)
            context.update(result)
        
        return {"tasks": len(plan), "results": results}

2. Analyzer Agent

# src/agents/analyzer.py
import ast
from typing import Dict, List

class AnalyzerAgent:
    """Analyzes code for issues"""
    
    def __init__(self):
        self.client = openai.OpenAI()
    
    def execute(self, task: Task) -> Dict:
        """Analyze code file"""
        
        # Read code
        with open(task.target, 'r') as f:
            code = f.read()
        
        # Parse AST
        ast_analysis = self.analyze_ast(code)
        
        # Run static analysis
        static_issues = self.run_static_analysis(task.target)
        
        # LLM-based analysis
        llm_analysis = self.llm_analyze(code)
        
        return {
            "file": task.target,
            "ast_analysis": ast_analysis,
            "static_issues": static_issues,
            "llm_analysis": llm_analysis,
            "issues": self.consolidate_issues(static_issues, llm_analysis)
        }
    
    def analyze_ast(self, code: str) -> Dict:
        """Analyze code structure"""
        
        try:
            tree = ast.parse(code)
            
            functions = [node.name for node in ast.walk(tree) 
                        if isinstance(node, ast.FunctionDef)]
            classes = [node.name for node in ast.walk(tree) 
                      if isinstance(node, ast.ClassDef)]
            
            return {
                "functions": functions,
                "classes": classes,
                "lines": len(code.split('\n'))
            }
        except SyntaxError as e:
            return {"error": str(e)}
    
    def run_static_analysis(self, file_path: str) -> List[Dict]:
        """Run pylint"""
        
        import subprocess
        
        result = subprocess.run(
            ['pylint', file_path, '--output-format=json'],
            capture_output=True,
            text=True
        )
        
        import json
        try:
            return json.loads(result.stdout)
        except:
            return []
    
    def llm_analyze(self, code: str) -> Dict:
        """LLM-based code analysis"""
        
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{
                "role": "system",
                "content": "You are an expert code reviewer. Analyze code for bugs, security issues, and improvements."
            }, {
                "role": "user",
                "content": f"Analyze this code:\n\n{code}"
            }],
            temperature=0.3
        )
        
        return {"analysis": response.choices[0].message.content}
    
    def consolidate_issues(self, static: List[Dict], llm: Dict) -> List[Dict]:
        """Consolidate all issues"""
        
        issues = []
        
        # Add static analysis issues
        for issue in static:
            issues.append({
                "type": issue.get("type", "unknown"),
                "message": issue.get("message", ""),
                "line": issue.get("line", 0),
                "severity": issue.get("severity", "info"),
                "source": "static"
            })
        
        return issues

3. Fixer Agent

# src/agents/fixer.py
from typing import Dict
import difflib

class FixerAgent:
    """Generates and applies fixes"""
    
    def __init__(self):
        self.client = openai.OpenAI()
        self.validator = FixValidator()
    
    def execute(self, task: Task) -> Dict:
        """Generate and apply fix"""
        
        # Read current code
        with open(task.target, 'r') as f:
            original_code = f.read()
        
        # Get issues from context
        issues = task.context.get("issues", [])
        
        # Generate fix
        fixed_code = self.generate_fix(original_code, issues, task.description)
        
        # Validate fix
        validation = self.validator.validate(original_code, fixed_code)
        
        if not validation["valid"]:
            return {
                "success": False,
                "error": "Validation failed",
                "details": validation
            }
        
        # Show diff
        diff = self.generate_diff(original_code, fixed_code)
        
        return {
            "success": True,
            "original_code": original_code,
            "fixed_code": fixed_code,
            "diff": diff,
            "validation": validation
        }
    
    def generate_fix(self, code: str, issues: List[Dict], description: str) -> str:
        """Generate fixed code"""
        
        issues_text = "\n".join([
            f"- Line {i['line']}: {i['message']}"
            for i in issues[:5]  # Top 5 issues
        ])
        
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{
                "role": "system",
                "content": "You are an expert programmer. Fix code issues while preserving functionality."
            }, {
                "role": "user",
                "content": f"Fix these issues:\n{issues_text}\n\nRequirement: {description}\n\nOriginal code:\n{code}\n\nFixed code:"
            }],
            temperature=0.2
        )
        
        return self.extract_code(response.choices[0].message.content)
    
    def generate_diff(self, original: str, fixed: str) -> str:
        """Generate unified diff"""
        
        diff = difflib.unified_diff(
            original.splitlines(keepends=True),
            fixed.splitlines(keepends=True),
            fromfile='original',
            tofile='fixed'
        )
        
        return ''.join(diff)
    
    def extract_code(self, text: str) -> str:
        """Extract code from markdown"""
        import re
        pattern = r'```python\n(.*?)```'
        matches = re.findall(pattern, text, re.DOTALL)
        return matches[0] if matches else text

class FixValidator:
    """Validate fixes"""
    
    def validate(self, original: str, fixed: str) -> Dict:
        """Multi-level validation"""
        
        return {
            "valid": self.check_syntax(fixed) and self.check_safety(fixed),
            "syntax_valid": self.check_syntax(fixed),
            "safety_passed": self.check_safety(fixed)
        }
    
    def check_syntax(self, code: str) -> bool:
        """Check syntax"""
        try:
            ast.parse(code)
            return True
        except:
            return False
    
    def check_safety(self, code: str) -> bool:
        """Check for unsafe patterns"""
        unsafe = ["eval(", "exec(", "__import__", "os.system"]
        return not any(pattern in code for pattern in unsafe)

4. Tester Agent

# src/agents/tester.py
from typing import Dict, List

class TesterAgent:
    """Generates and runs tests"""
    
    def __init__(self):
        self.client = openai.OpenAI()
    
    def execute(self, task: Task) -> Dict:
        """Generate tests for code"""
        
        # Read code
        with open(task.target, 'r') as f:
            code = f.read()
        
        # Generate tests
        tests = self.generate_tests(code)
        
        # Run tests
        results = self.run_tests(tests)
        
        return {
            "tests_generated": len(tests),
            "tests_passed": sum(1 for r in results if r["passed"]),
            "coverage": self.calculate_coverage(code, tests),
            "test_code": tests
        }
    
    def generate_tests(self, code: str) -> str:
        """Generate test code"""
        
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{
                "role": "system",
                "content": "Generate comprehensive pytest tests. Include edge cases, error cases, and normal cases."
            }, {
                "role": "user",
                "content": f"Generate tests for:\n\n{code}"
            }],
            temperature=0.3
        )
        
        return response.choices[0].message.content
    
    def run_tests(self, test_code: str) -> List[Dict]:
        """Run generated tests"""
        
        # Write to temp file
        import tempfile
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            f.write(test_code)
            test_file = f.name
        
        # Run pytest
        import subprocess
        result = subprocess.run(
            ['pytest', test_file, '-v', '--json-report'],
            capture_output=True
        )
        
        return [{"passed": result.returncode == 0}]
    
    def calculate_coverage(self, code: str, tests: str) -> float:
        """Estimate test coverage"""
        # Simplified coverage estimation
        return 0.85

5. Memory System

# src/memory/agent_memory.py
import chromadb
from typing import Dict, List
import json

class AgentMemory:
    """Unified memory system"""
    
    def __init__(self):
        self.working_memory = []
        self.client = chromadb.Client()
        self.episodes = self.client.create_collection("episodes")
        self.codebase = self.client.create_collection("codebase")
    
    def store_episode(self, request: str, plan: List[Task], results: Dict):
        """Store completed episode"""
        
        episode = {
            "request": request,
            "plan": [{"type": t.type.value, "target": t.target} for t in plan],
            "results": results,
            "success": results.get("success", False)
        }
        
        self.episodes.add(
            documents=[json.dumps(episode)],
            metadatas=[{"request": request}],
            ids=[f"episode_{len(self.episodes.get()['ids'])}"]
        )
    
    def recall_similar_episodes(self, request: str, limit: int = 3) -> List[Dict]:
        """Recall similar past episodes"""
        
        results = self.episodes.query(
            query_texts=[request],
            n_results=limit
        )
        
        return [json.loads(doc) for doc in results['documents'][0]]
    
    def index_file(self, file_path: str, code: str, analysis: Dict):
        """Index file in semantic memory"""
        
        self.codebase.add(
            documents=[code],
            metadatas=[{
                "file_path": file_path,
                "functions": json.dumps(analysis.get("functions", [])),
                "classes": json.dumps(analysis.get("classes", []))
            }],
            ids=[file_path]
        )
    
    def search_codebase(self, query: str, limit: int = 5) -> List[Dict]:
        """Search codebase semantically"""
        
        results = self.codebase.query(
            query_texts=[query],
            n_results=limit
        )
        
        return results

6. Tool Layer

# src/tools/code_tools.py
import ast
import subprocess
from typing import Dict, List

class CodeTools:
    """Low-level code manipulation tools"""
    
    @staticmethod
    def parse_python(code: str) -> Dict:
        """Parse Python code"""
        
        try:
            tree = ast.parse(code)
            
            return {
                "valid": True,
                "functions": [n.name for n in ast.walk(tree) if isinstance(n, ast.FunctionDef)],
                "classes": [n.name for n in ast.walk(tree) if isinstance(n, ast.ClassDef)],
                "imports": [n.names[0].name for n in ast.walk(tree) if isinstance(n, ast.Import)]
            }
        except SyntaxError as e:
            return {"valid": False, "error": str(e)}
    
    @staticmethod
    def run_linter(file_path: str) -> List[Dict]:
        """Run pylint"""
        
        result = subprocess.run(
            ['pylint', file_path, '--output-format=json'],
            capture_output=True,
            text=True
        )
        
        import json
        try:
            return json.loads(result.stdout)
        except:
            return []
    
    @staticmethod
    def format_code(code: str) -> str:
        """Format with black"""
        
        result = subprocess.run(
            ['black', '-'],
            input=code,
            capture_output=True,
            text=True
        )
        
        return result.stdout if result.returncode == 0 else code
    
    @staticmethod
    def run_tests(test_path: str) -> Dict:
        """Run pytest"""
        
        result = subprocess.run(
            ['pytest', test_path, '-v'],
            capture_output=True,
            text=True
        )
        
        return {
            "passed": result.returncode == 0,
            "output": result.stdout
        }

class SafeExecutor:
    """Execute code safely in Docker"""
    
    def __init__(self):
        import docker
        self.client = docker.from_env()
    
    def execute(self, code: str, timeout: int = 30) -> Dict:
        """Execute in isolated container"""
        
        try:
            container = self.client.containers.run(
                "python:3.11-slim",
                command=['python', '-c', code],
                detach=True,
                mem_limit="256m",
                network_disabled=True,
                remove=True
            )
            
            result = container.wait(timeout=timeout)
            logs = container.logs().decode()
            
            return {"success": True, "output": logs, "exit_code": result['StatusCode']}
            
        except Exception as e:
            return {"success": False, "error": str(e)}

7. Complete Agent Implementation

# src/agents/fixer.py (complete version)
from typing import Dict, List
import openai

class FixerAgent:
    """Generates and applies fixes"""
    
    def __init__(self):
        self.client = openai.OpenAI()
        self.tools = CodeTools()
    
    def execute(self, task: Task) -> Dict:
        """Generate fix for issues"""
        
        # Read code
        with open(task.target, 'r') as f:
            original_code = f.read()
        
        # Get issues from context
        issues = task.context.get("issues", [])
        
        # Retrieve similar fixes from memory
        similar_fixes = self.recall_similar_fixes(issues)
        
        # Generate fix with context
        fixed_code = self.generate_fix(
            original_code, 
            issues, 
            task.description,
            similar_fixes
        )
        
        # Validate
        if not self.validate_fix(original_code, fixed_code):
            return {"success": False, "error": "Validation failed"}
        
        # Generate explanation
        explanation = self.explain_fix(original_code, fixed_code, issues)
        
        return {
            "success": True,
            "original_code": original_code,
            "fixed_code": fixed_code,
            "explanation": explanation,
            "issues_addressed": len(issues)
        }
    
    def generate_fix(self, 
                    code: str, 
                    issues: List[Dict],
                    description: str,
                    similar_fixes: List[Dict]) -> str:
        """Generate fixed code"""
        
        issues_text = "\n".join([
            f"- Line {i['line']}: {i['message']} (severity: {i['severity']})"
            for i in issues[:10]
        ])
        
        context_text = ""
        if similar_fixes:
            context_text = "\n\nSimilar fixes from history:\n" + "\n".join([
                f"- {fix['description']}: {fix['approach']}"
                for fix in similar_fixes[:3]
            ])
        
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{
                "role": "system",
                "content": "Fix code issues while preserving functionality. Return only the fixed code."
            }, {
                "role": "user",
                "content": f"Issues:\n{issues_text}\n\nRequirement: {description}{context_text}\n\nCode:\n{code}\n\nFixed code:"
            }],
            temperature=0.2
        )
        
        return self.extract_code(response.choices[0].message.content)
    
    def validate_fix(self, original: str, fixed: str) -> bool:
        """Validate fix"""
        
        # Check syntax
        parsed = self.tools.parse_python(fixed)
        if not parsed["valid"]:
            return False
        
        # Check no unsafe operations
        unsafe = ["eval(", "exec(", "os.system"]
        if any(op in fixed for op in unsafe):
            return False
        
        return True
    
    def explain_fix(self, original: str, fixed: str, issues: List[Dict]) -> str:
        """Explain what was fixed"""
        
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{
                "role": "user",
                "content": f"Explain changes:\n\nOriginal:\n{original[:500]}\n\nFixed:\n{fixed[:500]}\n\nIssues addressed: {len(issues)}"
            }],
            temperature=0.3
        )
        
        return response.choices[0].message.content
    
    def recall_similar_fixes(self, issues: List[Dict]) -> List[Dict]:
        """Recall similar fixes from memory"""
        # Simplified - would use vector search
        return []
    
    def extract_code(self, text: str) -> str:
        """Extract code from response"""
        import re
        pattern = r'```python\n(.*?)```'
        matches = re.findall(pattern, text, re.DOTALL)
        return matches[0] if matches else text

8. CLI Interface

# src/cli.py
import click
from orchestration.orchestrator import SoftwareEngineeringAgent

@click.group()
def cli():
    """Autonomous Software Engineering Agent"""
    pass

@cli.command()
@click.argument('file_path')
def analyze(file_path):
    """Analyze code file"""
    agent = SoftwareEngineeringAgent()
    result = agent.process_request(f"Analyze {file_path}", file_path)
    click.echo(json.dumps(result, indent=2))

@cli.command()
@click.argument('file_path')
@click.option('--description', '-d', help='Fix description')
def fix(file_path, description):
    """Fix issues in code"""
    agent = SoftwareEngineeringAgent()
    result = agent.process_request(
        f"Fix issues: {description}" if description else "Fix all issues",
        file_path
    )
    
    if result['results'][-1]['success']:
        click.echo("✓ Fix generated successfully")
        click.echo("\nDiff:")
        click.echo(result['results'][-1]['diff'])
    else:
        click.echo("✗ Fix failed")

@cli.command()
@click.argument('file_path')
def test(file_path):
    """Generate tests"""
    agent = SoftwareEngineeringAgent()
    result = agent.process_request(f"Generate tests for {file_path}", file_path)
    click.echo(f"Generated {result['results'][0]['tests_generated']} tests")

if __name__ == '__main__':
    cli()

Usage Examples

Example 1: Analyze and Fix

# Analyze code
python src/cli.py analyze src/example.py

# Fix issues
python src/cli.py fix src/example.py --description "Fix type errors and add error handling"

# Generate tests
python src/cli.py test src/example.py

Example 2: Programmatic Usage

from orchestration.orchestrator import SoftwareEngineeringAgent

# Initialize agent
agent = SoftwareEngineeringAgent()

# Analyze code
result = agent.process_request(
    "Analyze this file for bugs and security issues",
    "src/auth.py"
)

print(f"Found {len(result['results'][0]['issues'])} issues")

# Fix critical issues
fix_result = agent.process_request(
    "Fix all critical and high severity issues",
    "src/auth.py"
)

if fix_result['results'][-1]['success']:
    print("Fix applied successfully")
    print(fix_result['results'][-1]['explanation'])

Advanced Features

Learning from Feedback

class FeedbackLearner:
    """Learn from user feedback"""
    
    def __init__(self):
        self.feedback_db = []
    
    def collect_feedback(self, task: Task, result: Dict, user_rating: int):
        """Collect user feedback"""
        
        self.feedback_db.append({
            "task": task,
            "result": result,
            "rating": user_rating,
            "timestamp": time.time()
        })
    
    def improve_from_feedback(self):
        """Analyze feedback and improve"""
        
        # Identify patterns in low-rated results
        low_rated = [f for f in self.feedback_db if f["rating"] < 3]
        
        # Extract common issues
        # Adjust prompts or strategies
        # Update tool selection logic
        pass

Parallel Processing

import asyncio
from typing import List

class ParallelAnalyzer:
    """Analyze multiple files in parallel"""
    
    async def analyze_files(self, file_paths: List[str]) -> List[Dict]:
        """Analyze files concurrently"""
        
        tasks = [self.analyze_file(path) for path in file_paths]
        results = await asyncio.gather(*tasks)
        
        return results
    
    async def analyze_file(self, file_path: str) -> Dict:
        """Analyze single file"""
        
        analyzer = AnalyzerAgent()
        task = Task(TaskType.ANALYZE, file_path, "Analyze", {})
        
        return analyzer.execute(task)

# Usage
async def main():
    analyzer = ParallelAnalyzer()
    results = await analyzer.analyze_files(['file1.py', 'file2.py', 'file3.py'])
    print(f"Analyzed {len(results)} files")

asyncio.run(main())

Testing the Agent

Unit Tests

# tests/test_analyzer.py
import pytest
from agents.analyzer import AnalyzerAgent
from orchestration.orchestrator import Task, TaskType

def test_analyzer_detects_issues():
    """Test analyzer finds issues"""
    
    agent = AnalyzerAgent()
    
    # Create test task
    task = Task(
        type=TaskType.ANALYZE,
        target="tests/fixtures/buggy_code.py",
        description="Analyze",
        context={}
    )
    
    result = agent.execute(task)
    
    assert "issues" in result
    assert len(result["issues"]) > 0

def test_analyzer_handles_syntax_errors():
    """Test analyzer handles invalid syntax"""
    
    agent = AnalyzerAgent()
    
    # Write invalid code
    with open("tests/fixtures/invalid.py", "w") as f:
        f.write("def broken(\n")
    
    task = Task(TaskType.ANALYZE, "tests/fixtures/invalid.py", "Analyze", {})
    result = agent.execute(task)
    
    assert "error" in result["ast_analysis"]

Integration Tests

# tests/test_integration.py
import pytest
from orchestration.orchestrator import SoftwareEngineeringAgent

def test_end_to_end_fix():
    """Test complete fix workflow"""
    
    agent = SoftwareEngineeringAgent()
    
    # Create buggy code
    buggy_code = '''
def divide(a, b):
    return a / b
'''
    
    with open("tests/fixtures/buggy.py", "w") as f:
        f.write(buggy_code)
    
    # Request fix
    result = agent.process_request(
        "Fix the division by zero bug",
        "tests/fixtures/buggy.py"
    )
    
    # Verify fix was generated
    assert result["results"][-1]["success"]
    assert "if b == 0" in result["results"][-1]["fixed_code"]

Deployment

Docker Container

# Dockerfile
FROM python:3.11-slim

WORKDIR /app

# Install dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt

# Copy source
COPY src/ ./src/

# Expose API
EXPOSE 8000

CMD ["python", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0"]

API Service

# src/api.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI(title="Autonomous SE Agent API")

class AnalyzeRequest(BaseModel):
    file_path: str
    options: Dict = {}

class FixRequest(BaseModel):
    file_path: str
    description: str

@app.post("/analyze")
async def analyze_code(request: AnalyzeRequest):
    """Analyze code endpoint"""
    
    agent = SoftwareEngineeringAgent()
    result = agent.process_request(
        f"Analyze {request.file_path}",
        request.file_path
    )
    
    return result

@app.post("/fix")
async def fix_code(request: FixRequest):
    """Fix code endpoint"""
    
    agent = SoftwareEngineeringAgent()
    result = agent.process_request(
        f"Fix: {request.description}",
        request.file_path
    )
    
    return result

@app.get("/health")
async def health():
    """Health check"""
    return {"status": "healthy"}

Next Steps

You now have a complete implementation! In the next section, we’ll evaluate and iterate on the agent to make it production-ready.