by thetestingacademy
Testing patterns for autonomous AI coding agents like Devin and SWE-Agent including task verification, output validation, sandboxed execution, regression testing for agent behavior, and safety guardrails for autonomous code generation.
npx @qaskills/cli add devin-ai-testing
Auto-detects your AI agent and installs the skill. Works with Claude Code, Cursor, Copilot, and more.
You are an expert in testing autonomous AI coding agents. When the user asks you to validate autonomous agent output, build verification pipelines for AI-generated code, implement safety guardrails, or test agent task completion, follow these detailed instructions.
agent-testing/
tasks/
definitions/
simple-function.task.yaml
api-endpoint.task.yaml
bug-fix.task.yaml
refactoring.task.yaml
test-writing.task.yaml
expected-outputs/
simple-function/
expected-code.ts
expected-tests.ts
api-endpoint/
expected-route.ts
expected-handler.ts
validators/
syntax-validator.ts
type-validator.ts
test-runner-validator.ts
security-scanner.ts
style-validator.ts
output-comparator.ts
sandbox/
docker-sandbox.ts
permission-manager.ts
resource-limiter.ts
network-policy.ts
runners/
task-runner.ts
batch-runner.ts
regression-runner.ts
safety/
guardrails.ts
file-access-policy.ts
command-allowlist.ts
secret-detector.ts
metrics/
task-completion-tracker.ts
quality-scorer.ts
cost-tracker.ts
reports/
agent-report.ts
comparison-report.ts
config/
agent-config.ts
sandbox-config.ts
# tasks/definitions/simple-function.task.yaml
id: task-001
name: "Implement debounce function"
description: |
  Create a TypeScript debounce function that:
  1. Takes a function and delay in milliseconds
  2. Returns a debounced version of the function
  3. Supports cancellation via a cancel() method
  4. Preserves the 'this' context
  5. Passes all arguments to the original function
type: code-generation
language: typescript
difficulty: medium
expected_files:
  - path: src/utils/debounce.ts
    must_contain:
      - "export function debounce"
      - "cancel"
      - "clearTimeout"
  - path: src/utils/debounce.test.ts
    must_contain:
      - "describe"
      - "it("
      - "expect"
constraints:
  max_files_modified: 3
  max_lines_of_code: 100
  no_external_dependencies: true
  must_pass_typecheck: true
  must_pass_tests: true
  must_pass_lint: true
verification:
  syntax: true
  types: true
  tests: true
  security: true
  style: true
timeout_minutes: 10
// validators/output-comparator.ts
import { execSync } from 'child_process';
import { existsSync, readFileSync } from 'fs';
/**
 * Outcome of one validation step run against an agent's output.
 */
export interface ValidationResult {
  /** Identifier of the step that produced this result, e.g. `syntax` or `tests`. */
  step: string;

  /** `true` when the step succeeded. */
  passed: boolean;

  /** Human-readable explanation of the outcome. */
  details: string;

  /** Severity label attached to the step. */
  severity: 'critical' | 'warning' | 'info';
}
/**
 * Everything an agent produced while working on a single task.
 */
export interface AgentOutput {
  /** Identifier of the task this output belongs to. */
  taskId: string;

  /** Files the agent produced, as path/content pairs. */
  files: Array<{ path: string; content: string }>;

  /** Commands recorded for the run. */
  commands: string[];

  /** Run duration. NOTE(review): the unit is not established anywhere in this file — confirm. */
  duration: number;

  /** Version label of the agent that produced the output. */
  agentVersion: string;
}
/**
 * Runs the verification pipeline over the files an autonomous coding agent
 * produced for a task and reports one or more `ValidationResult` per step.
 *
 * Failures of the external tools (`tsc`, `vitest`, `prettier`) are reported as
 * failed results rather than thrown.
 *
 * NOTE(review): the tool-based steps run inside `sandboxDir` and do not write
 * `output.files` there themselves — the caller is presumably expected to have
 * materialised the agent's files in that directory first.
 */
export class OutputValidator {
  /** Matches `name = "value"` style assignments for secret-looking names. */
  private static readonly HARDCODED_SECRET =
    /(?:password|secret|api[_-]?key|token)\s*[:=]\s*['"][^'"]{8,}/i;

  /** Matches a call to `eval(`. */
  private static readonly EVAL_CALL = /eval\s*\(/;

  /**
   * Matches `exec(` / `execSync(`. This is a heuristic: it also matches
   * unrelated methods named `exec`, such as `RegExp.prototype.exec`.
   */
  private static readonly SHELL_EXEC = /exec(?:Sync)?\s*\(/;

  /**
   * Matches file-system calls whose argument list mentions a sensitive path.
   * `fs\.\w+` is required so that `fs.readFile(...)`, `fs.promises` helpers and
   * similar member calls are covered; a bare `fs\.` can never be followed
   * directly by `(` and therefore never matched.
   */
  private static readonly SENSITIVE_FILE_ACCESS =
    /(?:readFileSync|writeFileSync|fs\.\w+)\s*\([^)]*(?:\/etc|\/proc|\/sys|\.env)/;

  /**
   * @param sandboxDir Working directory for the `tsc`, `vitest` and `prettier`
   *   invocations. Defaults to `/tmp/sandbox`.
   */
  constructor(private readonly sandboxDir: string = '/tmp/sandbox') {}

  /**
   * Runs every validation step in a fixed order and collects the results.
   *
   * @param output  The agent output to validate.
   * @param taskDef Parsed task definition (`expected_files`, `constraints`, ...).
   * @returns All step results, in execution order.
   */
  async validateAll(output: AgentOutput, taskDef: any): Promise<ValidationResult[]> {
    const results: ValidationResult[] = [];

    // Step 1: File existence check
    results.push(this.validateFileExistence(output, taskDef));
    // Step 2: Syntax check
    results.push(await this.validateSyntax(output));
    // Step 3: Type check
    results.push(await this.validateTypes(output));
    // Step 4: Content requirements
    results.push(...this.validateContentRequirements(output, taskDef));
    // Step 5: Security scan
    results.push(await this.validateSecurity(output));
    // Step 6: Test execution
    results.push(await this.validateTests(output));
    // Step 7: Constraint compliance
    results.push(...this.validateConstraints(output, taskDef));
    // Step 8: Style check
    results.push(await this.validateStyle(output));

    return results;
  }

  /** Returns `taskDef.expected_files`, or an empty list when it is absent or not an array. */
  private static expectedFiles(taskDef: any): Array<{ path: string; must_contain?: string[] }> {
    const files: unknown = taskDef?.expected_files;
    return Array.isArray(files) ? files : [];
  }

  /** Extracts a printable message from a caught value of unknown type. */
  private static errorMessage(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Returns the `stdout` carried by a failed `execSync` call, if there is one. */
  private static execStdout(error: unknown): string | undefined {
    if (typeof error !== 'object' || error === null || !('stdout' in error)) {
      return undefined;
    }
    const { stdout } = error as { stdout?: unknown };
    return stdout == null ? undefined : String(stdout);
  }

  /**
   * Converts a JSON test report into a `tests` result.
   *
   * @throws SyntaxError when `raw` is not valid JSON.
   */
  private static summarizeTestReport(raw: string): ValidationResult {
    const report: unknown = JSON.parse(raw);
    const counts: { numPassedTests?: unknown; numFailedTests?: unknown } =
      typeof report === 'object' && report !== null ? report : {};
    const passed = typeof counts.numPassedTests === 'number' ? counts.numPassedTests : 0;
    const failed = typeof counts.numFailedTests === 'number' ? counts.numFailedTests : 0;

    return {
      step: 'tests',
      // A run with zero executed tests does not count as a pass.
      passed: failed === 0 && passed > 0,
      details: `${passed} tests passed, ${failed} failed`,
      severity: 'critical',
    };
  }

  /** Checks that every path listed in `expected_files` is present in the output. */
  private validateFileExistence(output: AgentOutput, taskDef: any): ValidationResult {
    const expectedPaths = OutputValidator.expectedFiles(taskDef).map((f) => f.path);
    const generatedPaths = new Set(output.files.map((f) => f.path));
    const missing = expectedPaths.filter((path) => !generatedPaths.has(path));

    return {
      step: 'file-existence',
      passed: missing.length === 0,
      details:
        missing.length === 0
          ? `All ${expectedPaths.length} expected files present`
          : `Missing files: ${missing.join(', ')}`,
      severity: 'critical',
    };
  }

  /**
   * Parses every `.ts` / `.tsx` file with the TypeScript compiler and fails on
   * the first file that has parse diagnostics.
   */
  private async validateSyntax(output: AgentOutput): Promise<ValidationResult> {
    try {
      const tsFiles = output.files.filter(
        (f) => f.path.endsWith('.ts') || f.path.endsWith('.tsx'),
      );

      if (tsFiles.length > 0) {
        // Loaded lazily, and only once, when there is something to parse.
        const ts = await import('typescript');

        for (const file of tsFiles) {
          const sourceFile = ts.createSourceFile(
            file.path,
            file.content,
            ts.ScriptTarget.Latest,
            true,
          );
          // `parseDiagnostics` is populated by the parser but is not part of the
          // public typings, hence the structural widening.
          const diagnostics =
            (sourceFile as typeof sourceFile & { parseDiagnostics?: readonly unknown[] })
              .parseDiagnostics ?? [];

          if (diagnostics.length > 0) {
            return {
              step: 'syntax',
              passed: false,
              details: `Syntax errors in ${file.path}: ${diagnostics.length} error(s)`,
              severity: 'critical',
            };
          }
        }
      }

      return { step: 'syntax', passed: true, details: 'All files pass syntax check', severity: 'critical' };
    } catch (error: unknown) {
      return {
        step: 'syntax',
        passed: false,
        details: OutputValidator.errorMessage(error),
        severity: 'critical',
      };
    }
  }

  /** Runs `tsc --noEmit` in the sandbox directory. */
  private async validateTypes(output: AgentOutput): Promise<ValidationResult> {
    try {
      execSync('npx tsc --noEmit', { cwd: this.sandboxDir, encoding: 'utf-8' });
      return { step: 'types', passed: true, details: 'Type checking passed', severity: 'critical' };
    } catch (error: unknown) {
      // Prefer the compiler output; fall back to the error message when the
      // command could not be run at all and there is no stdout.
      const report = OutputValidator.execStdout(error) || OutputValidator.errorMessage(error);
      return { step: 'types', passed: false, details: `Type errors: ${report}`, severity: 'critical' };
    }
  }

  /** Emits one `content-<path>` result per `must_contain` entry of each expected file that exists. */
  private validateContentRequirements(output: AgentOutput, taskDef: any): ValidationResult[] {
    const results: ValidationResult[] = [];

    for (const expected of OutputValidator.expectedFiles(taskDef)) {
      const file = output.files.find((f) => f.path === expected.path);
      // Missing files are already reported by the file-existence step.
      if (!file) continue;

      for (const requirement of expected.must_contain ?? []) {
        const passed = file.content.includes(requirement);
        results.push({
          step: `content-${expected.path}`,
          passed,
          details: passed
            ? `Found required content: "${requirement}"`
            : `Missing required content: "${requirement}" in ${expected.path}`,
          severity: 'warning',
        });
      }
    }

    return results;
  }

  /** Pattern-based scan for hardcoded secrets, `eval`, shell execution and sensitive file access. */
  private async validateSecurity(output: AgentOutput): Promise<ValidationResult> {
    const issues: string[] = [];

    for (const file of output.files) {
      // Check for hardcoded secrets
      if (OutputValidator.HARDCODED_SECRET.test(file.content)) {
        issues.push(`Potential hardcoded secret in ${file.path}`);
      }
      // Check for dangerous patterns
      if (OutputValidator.EVAL_CALL.test(file.content)) {
        issues.push(`eval() usage in ${file.path}`);
      }
      if (OutputValidator.SHELL_EXEC.test(file.content) && !file.path.includes('test')) {
        issues.push(`Shell execution in non-test file ${file.path}`);
      }
      // Check for unrestricted file access
      if (OutputValidator.SENSITIVE_FILE_ACCESS.test(file.content)) {
        issues.push(`Sensitive file access in ${file.path}`);
      }
    }

    return {
      step: 'security',
      passed: issues.length === 0,
      details: issues.length === 0 ? 'No security issues detected' : issues.join('; '),
      severity: 'critical',
    };
  }

  /** Runs `vitest` with the JSON reporter in the sandbox directory and summarises the counts. */
  private async validateTests(output: AgentOutput): Promise<ValidationResult> {
    try {
      const stdout = execSync('npx vitest run --reporter=json', {
        cwd: this.sandboxDir,
        encoding: 'utf-8',
        timeout: 60000,
      });
      return OutputValidator.summarizeTestReport(stdout);
    } catch (error: unknown) {
      // A non-zero exit makes execSync throw; when that is caused by failing
      // tests the JSON report is still available on the error's stdout.
      const stdout = OutputValidator.execStdout(error);
      if (stdout) {
        try {
          return OutputValidator.summarizeTestReport(stdout);
        } catch {
          // stdout was not a JSON report; report the execution failure below.
        }
      }
      return {
        step: 'tests',
        passed: false,
        details: `Test execution failed: ${OutputValidator.errorMessage(error)}`,
        severity: 'critical',
      };
    }
  }

  /** Checks the numeric and dependency constraints declared in `taskDef.constraints`. */
  private validateConstraints(output: AgentOutput, taskDef: any): ValidationResult[] {
    const results: ValidationResult[] = [];
    const constraints = taskDef?.constraints ?? {};

    // Compare against `number` rather than truthiness so a limit of 0 is enforced.
    if (typeof constraints.max_files_modified === 'number') {
      const passed = output.files.length <= constraints.max_files_modified;
      results.push({
        step: 'constraint-files',
        passed,
        details: `Files: ${output.files.length}/${constraints.max_files_modified}`,
        severity: 'warning',
      });
    }

    if (typeof constraints.max_lines_of_code === 'number') {
      const totalLines = output.files.reduce((sum, f) => sum + f.content.split('\n').length, 0);
      const passed = totalLines <= constraints.max_lines_of_code;
      results.push({
        step: 'constraint-lines',
        passed,
        details: `Lines: ${totalLines}/${constraints.max_lines_of_code}`,
        severity: 'warning',
      });
    }

    if (constraints.no_external_dependencies) {
      // Heuristic: any emitted package.json that has a "dependencies" key counts.
      const hasNewDeps = output.files.some(
        (f) => f.path === 'package.json' && f.content.includes('"dependencies"'),
      );
      results.push({
        step: 'constraint-deps',
        passed: !hasNewDeps,
        details: hasNewDeps ? 'New dependencies added' : 'No new dependencies',
        severity: 'warning',
      });
    }

    return results;
  }

  /** Runs `prettier --check .` in the sandbox directory; any failure is reported as a style violation. */
  private async validateStyle(output: AgentOutput): Promise<ValidationResult> {
    try {
      execSync('npx prettier --check .', { cwd: this.sandboxDir, encoding: 'utf-8' });
      return { step: 'style', passed: true, details: 'Code style passes', severity: 'info' };
    } catch {
      return { step: 'style', passed: false, details: 'Code style violations detected', severity: 'info' };
    }
  }
}
// safety/guardrails.ts
/**
 * Limits applied to what an autonomous agent may read, write and execute.
 */
export interface GuardrailConfig {
  /** File extensions (with leading dot) the agent may touch. */
  allowedFileExtensions: string[];

  /** Path fragments that are off limits. */
  blockedPaths: string[];

  /** Executables the agent may invoke. */
  allowedCommands: string[];

  /** Upper bound on the size of a single file's content. */
  maxFileSize: number;

  /** Upper bound on the number of files. */
  maxTotalFiles: number;

  /** Whether network access is permitted. */
  networkAllowed: boolean;

  /** Domains permitted for network access. */
  allowedDomains: string[];
}
/** Guardrail settings used for every option the caller does not override. */
const DEFAULT_GUARDRAILS: GuardrailConfig = {
  allowedFileExtensions: ['.ts', '.tsx', '.js', '.jsx', '.json', '.yaml', '.yml', '.md', '.css'],
  blockedPaths: ['/etc', '/proc', '/sys', '/root', '.env', '.ssh', 'node_modules'],
  allowedCommands: ['npm', 'npx', 'node', 'tsc', 'vitest', 'eslint', 'prettier'],
  maxFileSize: 1024 * 1024, // 1MB
  maxTotalFiles: 50,
  networkAllowed: false,
  allowedDomains: [],
};

/**
 * Policy checks for file paths, shell commands and file contents produced by
 * an autonomous agent. Every check returns a verdict object; none of them
 * performs the action being checked.
 */
export class Guardrails {
  /** Flags that are rejected wherever they appear in a command string. */
  private static readonly DANGEROUS_FRAGMENTS = ['--force', '-rf', 'sudo'];

  /**
   * Shell operators that chain or substitute commands. Without this check a
   * command such as `npm test; rm -r /` would pass because only the first
   * token is compared against the allowlist.
   */
  private static readonly SHELL_CONTROL = /[;&|`\n\r]|\$\(/;

  /**
   * Secret-looking patterns. Deliberately without the `g` flag: `test()` on a
   * global regex is stateful, and only a yes/no answer is needed.
   */
  private static readonly SECRET_PATTERNS: readonly RegExp[] = [
    /(?:api[_-]?key|secret|password|token)\s*[:=]\s*['"][A-Za-z0-9+/=]{20,}['"]/i,
    /(?:sk|pk)[-_](?:live|test)[-_][A-Za-z0-9]{20,}/,
    /ghp_[A-Za-z0-9]{36}/,
  ];

  private readonly config: GuardrailConfig;

  /**
   * @param config Overrides merged on top of the defaults. Properties that are
   *   explicitly `undefined` are ignored so they cannot erase a default.
   */
  constructor(config: Partial<GuardrailConfig> = {}) {
    const overrides = Object.fromEntries(
      Object.entries(config).filter(([, value]) => value !== undefined),
    ) as Partial<GuardrailConfig>;
    this.config = { ...DEFAULT_GUARDRAILS, ...overrides };
  }

  /**
   * Returns the extension (including the dot) of the last path segment, or an
   * empty string when that segment contains no dot.
   */
  private static extensionOf(path: string): string {
    const lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
    const fileName = path.slice(lastSeparator + 1);
    const dot = fileName.lastIndexOf('.');
    return dot === -1 ? '' : fileName.slice(dot);
  }

  /**
   * Decides whether the agent may access `path`.
   *
   * @param path File path as requested by the agent.
   * @returns `allowed: false` with a `reason` when the path contains a blocked
   *   fragment or its extension is not in the allowlist.
   */
  validateFileAccess(path: string): { allowed: boolean; reason?: string } {
    // Substring match: intentionally broad, so it errs on the side of blocking.
    const blocked = this.config.blockedPaths.find((fragment) => path.includes(fragment));
    if (blocked !== undefined) {
      return { allowed: false, reason: `Access to ${blocked} is blocked` };
    }

    const ext = Guardrails.extensionOf(path);
    if (!this.config.allowedFileExtensions.includes(ext)) {
      return { allowed: false, reason: `File extension ${ext || '(none)'} is not allowed` };
    }

    return { allowed: true };
  }

  /**
   * Decides whether the agent may run `command`.
   *
   * @param command Full command line as a single string.
   * @returns `allowed: false` with a `reason` when the executable is not
   *   allowlisted, a dangerous flag is present, or the command chains or
   *   substitutes further commands.
   */
  validateCommand(command: string): { allowed: boolean; reason?: string } {
    // Split on any whitespace so tabs and leading spaces do not hide the executable.
    const executable = command.trim().split(/\s+/)[0] ?? '';
    if (!this.config.allowedCommands.includes(executable)) {
      return { allowed: false, reason: `Command ${executable} is not in allowlist` };
    }

    // Block dangerous flags
    if (Guardrails.DANGEROUS_FRAGMENTS.some((fragment) => command.includes(fragment))) {
      return { allowed: false, reason: 'Dangerous command flags detected' };
    }

    // Block command chaining / substitution, which would bypass the allowlist.
    if (Guardrails.SHELL_CONTROL.test(command)) {
      return { allowed: false, reason: 'Shell control operators are not allowed' };
    }

    return { allowed: true };
  }

  /**
   * Checks file content against the size limit and the secret patterns.
   *
   * @param content Full text of the file.
   * @returns `allowed` is `true` only when `issues` is empty.
   */
  validateFileContent(content: string): { allowed: boolean; issues: string[] } {
    const issues: string[] = [];

    if (content.length > this.config.maxFileSize) {
      issues.push(`File exceeds maximum size: ${content.length} > ${this.config.maxFileSize}`);
    }

    // Detect potential secrets; one report is enough regardless of how many patterns match.
    if (Guardrails.SECRET_PATTERNS.some((pattern) => pattern.test(content))) {
      issues.push('Potential secret or API key detected in file content');
    }

    return { allowed: issues.length === 0, issues };
  }
}
// runners/regression-runner.ts
import { OutputValidator } from '../validators/output-comparator';
import { readFileSync } from 'fs';
/**
 * Comparison of one task's validation results between two agent versions.
 */
export interface RegressionResult {
  /** Identifier of the task that was compared. */
  taskId: string;

  /** Version label of the agent under test. */
  currentVersion: string;

  /** Version label of the agent used as the baseline. */
  baselineVersion: string;

  /** Change in quality score, current minus baseline. */
  qualityDelta: number;

  /** Change in completion rate, current minus baseline. */
  completionRateDelta: number;

  /** Validation steps that passed for the baseline but fail for the current version. */
  newRegressions: string[];

  /** Validation steps that pass for the current version but did not pass for the baseline. */
  newImprovements: string[];
}
/**
 * Compares the validation results of two agent versions task by task and
 * reports which validation steps regressed or improved.
 */
export class RegressionRunner {
  private readonly validator: OutputValidator;

  /**
   * @param validator Validator used to score each output. Defaults to a new
   *   `OutputValidator`; pass one explicitly to share configuration or to stub
   *   it out.
   */
  constructor(validator: OutputValidator = new OutputValidator()) {
    this.validator = validator;
  }

  /** Fraction of results that passed; 0 for an empty list to avoid dividing by zero. */
  private static passRate(results: ReadonlyArray<{ passed: boolean }>): number {
    if (results.length === 0) return 0;
    return results.filter((r) => r.passed).length / results.length;
  }

  /**
   * Tags each result with `<step>#<n>`, where `n` counts earlier results with
   * the same step name. Step names are not unique (content checks emit one
   * result per requirement under the same name), so the name alone cannot be
   * used to pair a current result with its baseline counterpart.
   */
  private static keyByOccurrence(
    results: ReadonlyArray<{ step: string; passed: boolean }>,
  ): Array<{ key: string; step: string; passed: boolean }> {
    const seen = new Map<string, number>();
    return results.map(({ step, passed }) => {
      const occurrence = seen.get(step) ?? 0;
      seen.set(step, occurrence + 1);
      return { key: `${step}#${occurrence}`, step, passed };
    });
  }

  /**
   * Validates the current and baseline output of every task and diffs the results.
   *
   * Tasks that lack either a current or a baseline output are skipped.
   *
   * @param currentOutputs  Outputs of the agent version under test.
   * @param baselineOutputs Outputs of the reference agent version.
   * @param taskDefs        Task definitions; outputs are matched on `taskId === taskDef.id`.
   * @returns One entry per task that has both outputs. Step names in
   *   `newRegressions` / `newImprovements` are de-duplicated.
   */
  async compareVersions(
    currentOutputs: any[],
    baselineOutputs: any[],
    taskDefs: any[]
  ): Promise<RegressionResult[]> {
    const results: RegressionResult[] = [];

    for (const taskDef of taskDefs) {
      const current = currentOutputs.find((o) => o.taskId === taskDef.id);
      const baseline = baselineOutputs.find((o) => o.taskId === taskDef.id);
      if (!current || !baseline) continue;

      // Awaited one after the other, as in the original ordering.
      const currentResults = await this.validator.validateAll(current, taskDef);
      const baselineResults = await this.validator.validateAll(baseline, taskDef);

      const baselinePassedByKey = new Map<string, boolean>();
      for (const { key, passed } of RegressionRunner.keyByOccurrence(baselineResults)) {
        baselinePassedByKey.set(key, passed);
      }

      const regressions = new Set<string>();
      const improvements = new Set<string>();
      for (const { key, step, passed } of RegressionRunner.keyByOccurrence(currentResults)) {
        const baselinePassed = baselinePassedByKey.get(key) === true;
        if (!passed && baselinePassed) {
          regressions.add(step);
        } else if (passed && !baselinePassed) {
          // Also covers results that have no baseline counterpart at all.
          improvements.add(step);
        }
      }

      const currentScore = RegressionRunner.passRate(currentResults);
      const baselineScore = RegressionRunner.passRate(baselineResults);

      results.push({
        taskId: taskDef.id,
        currentVersion: current.agentVersion,
        baselineVersion: baseline.agentVersion,
        qualityDelta: Math.round((currentScore - baselineScore) * 100),
        completionRateDelta: 0,
        newRegressions: [...regressions],
        newImprovements: [...improvements],
      });
    }

    return results;
  }
}
- name: Install QA Skills
  run: npx @qaskills/cli add devin-ai-testing
12 of 29 agents supported