| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <title>Computer Agent Evaluation Viewer</title> |
<style>
  /* ---------- Page shell ---------- */
  body {
    font-family: Arial, sans-serif;
    margin: 0;
    padding: 20px;
    background-color: #f5f5f5;
  }
  .container {
    max-width: 1200px;
    margin: 0 auto;
    background-color: #fff;
    padding: 20px;
    border-radius: 8px;
    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
  }
  h1, h2, h3 {
    color: #333;
  }

  /* ---------- Form controls ---------- */
  select, input, button {
    padding: 8px 12px;
    margin: 5px 0;
    border: 1px solid #ddd;
    border-radius: 4px;
  }
  button {
    background-color: #4a6cf7;
    color: white;
    cursor: pointer;
    border: none;
  }
  button:hover {
    background-color: #3a5ce5;
  }
  button:disabled {
    background-color: #cccccc;
    cursor: not-allowed;
  }

  /* ---------- Layout helpers ---------- */
  .row {
    display: flex;
    margin-bottom: 20px;
  }
  .col {
    flex: 1;
    padding: 0 10px;
  }
  /* Declared after the component rules that set `display`, so it wins at equal specificity. */
  .hidden {
    display: none;
  }
  /* Used by the screenshot caption; the class was referenced in the markup but had no rule. */
  .text-center {
    text-align: center;
  }

  /* ---------- Screenshot viewer ---------- */
  .image-viewer {
    width: 100%;
    max-height: 500px;
    border: 1px solid #ddd;
    border-radius: 4px;
    overflow: hidden;
    margin-bottom: 10px;
    position: relative;
  }
  .image-viewer img {
    max-width: 100%;
    max-height: 450px;
    display: block;
    margin: 0 auto;
  }
  .image-controls {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-top: 10px;
  }
  /* `.hidden` is declared before this block, so restate it for the controls bar. */
  .image-controls.hidden {
    display: none;
  }
  .nav-buttons {
    display: flex;
    gap: 10px;
  }

  /* ---------- Agent trace steps ---------- */
  .step {
    border: 1px solid #ddd;
    border-radius: 4px;
    margin-bottom: 10px;
    overflow: hidden;
  }
  .step-header {
    background-color: #f0f0f0;
    padding: 10px;
    font-weight: bold;
    cursor: pointer;
    display: flex;
    justify-content: space-between;
  }
  .step-content {
    padding: 15px;
    white-space: pre-wrap;
    font-family: monospace;
    background-color: #f9f9f9;
    max-height: 300px;
    overflow-y: auto;
  }

  /* ---------- Run status ---------- */
  .status-success {
    color: #22c55e;
    font-weight: bold;
  }
  .status-failure {
    color: #ef4444;
    font-weight: bold;
  }

  /* ---------- Tabs ---------- */
  .tabs {
    display: flex;
    border-bottom: 1px solid #ddd;
    margin-bottom: 20px;
  }
  .tab {
    padding: 10px 20px;
    cursor: pointer;
    border-bottom: 2px solid transparent;
  }
  .tab.active {
    border-bottom-color: #4a6cf7;
    font-weight: bold;
  }
  /* Only the panel carrying `.active` is shown. */
  .tab-content {
    display: none;
  }
  .tab-content.active {
    display: block;
  }

  /* ---------- Text blocks and feedback ---------- */
  pre {
    background-color: #f0f0f0;
    padding: 10px;
    border-radius: 4px;
    overflow-x: auto;
    white-space: pre-wrap;
  }
  .error-message {
    background-color: #fee2e2;
    color: #b91c1c;
    padding: 10px;
    border-radius: 4px;
    margin: 10px 0;
  }
  /* Small spinner shown next to "Loading metadata...". */
  .loading {
    display: inline-block;
    width: 20px;
    height: 20px;
    border: 2px solid #f3f3f3;
    border-top: 2px solid #3498db;
    border-radius: 50%;
    animation: spin 1s linear infinite;
    margin-left: 10px;
  }
  @keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
  }
</style>
| </head> |
| <body> |
| <div class="container"> |
| <h1>Computer Agent Evaluation Viewer</h1> |
| |
| |
<!-- Evaluation loader: optional base directory, refresh trigger, evaluation picker and status line. -->
<div style="margin-bottom: 20px; padding: 15px; background-color: #f0f0f0; border-radius: 8px;">
  <h2>Load Evaluation Data</h2>
  <div style="display: flex; gap: 10px; margin-top: 10px; align-items: center;">
    <!-- A placeholder is not a label: give the field a real, visible one. -->
    <label for="base-path">Base directory:</label>
    <input type="text" id="base-path" placeholder="Base directory path (leave empty for default)"
           style="flex-grow: 1; padding: 8px; border: 1px solid #ddd; border-radius: 4px;">
    <button type="button" id="refresh-evals-btn">Refresh</button>
  </div>
  <div style="margin-top: 10px;">
    <label for="eval-select">Select Evaluation:</label>
    <select id="eval-select" style="min-width: 300px;"></select>
  </div>
  <!-- Updated by the script after every request; role="status" announces the changes politely. -->
  <div id="load-status" role="status" style="margin-top: 10px; font-style: italic;"></div>
</div>
| |
| |
<!-- Example and run pickers. Both start disabled; the script enables each one
     once the list it depends on has been loaded. -->
<div class="row">
  <div class="col">
    <label for="example-select">Select Example:</label>
    <select id="example-select" disabled>
      <option value="">-- Select Example --</option>
    </select>
  </div>
  <div class="col">
    <label for="run-select">Select Run:</label>
    <select id="run-select" disabled>
      <option value="">-- Select Run --</option>
    </select>
  </div>
</div>
| |
| |
<!-- Details for the selected run; revealed by the script once a run is chosen. -->
<div id="run-details" class="hidden">
  <div>
    <h2>Task</h2>
    <pre id="task-text"></pre>
  </div>

  <div>
    <h2>Run Status</h2>
    <div id="status-display"></div>
  </div>

  <!-- Tab strip: data-tab="x" selects the panel whose id is "x-tab". -->
  <div class="tabs">
    <div class="tab active" data-tab="screenshots">Screenshots</div>
    <div class="tab" data-tab="agent-trace">Agent Trace</div>
    <div class="tab" data-tab="raw-json">Raw JSON</div>
  </div>

  <!-- Screenshots panel -->
  <div id="screenshots-tab" class="tab-content active">
    <div id="no-images" class="hidden">
      <p>No screenshots available for this run.</p>
    </div>
    <div id="image-container" class="image-viewer hidden">
      <!-- No src attribute here: an empty src is not a valid URL. The script assigns
           the real path before the container is shown. -->
      <img id="current-image" alt="Screenshot">
      <p id="image-caption" class="text-center"></p>
    </div>
    <div class="image-controls hidden" id="image-controls">
      <div class="nav-buttons">
        <button type="button" id="prev-image">Previous</button>
        <span id="image-counter">0 / 0</span>
        <button type="button" id="next-image">Next</button>
      </div>
      <input type="range" id="image-slider" min="0" max="0" value="0" style="width: 100%"
             aria-label="Screenshot position">
    </div>
  </div>

  <!-- Agent trace panel: filled with one collapsible block per step by the script. -->
  <div id="agent-trace-tab" class="tab-content">
    <div id="agent-steps"></div>
  </div>

  <!-- Raw JSON panel: pretty-printed metadata plus loading and error indicators. -->
  <div id="raw-json-tab" class="tab-content">
    <div id="json-loading-indicator" class="hidden">
      <p>Loading metadata... <span class="loading"></span></p>
    </div>
    <div id="json-error" class="error-message hidden"></div>
    <pre id="raw-json"></pre>
  </div>
</div>
| </div> |
|
|
| <script> |
| |
// Single mutable store for the viewer: current selections plus everything
// fetched so far, keyed by example id (and run id where applicable).
const appState = {
    basePath: '',          // value of the base-directory field at the last refresh
    evalId: null,          // currently selected evaluation
    currentExampleId: null,
    currentRunId: null,
    currentImages: [],     // screenshots of the run being displayed
    currentImageIndex: 0,  // position inside currentImages
    loadedData: {
        examples: {},
        runs: {},
        metadata: {},
        screenshots: {}
    }
};

// Shorthand for the many id lookups below.
const byId = (id) => document.getElementById(id);

// --- Loader panel ---
const basePathInput = byId('base-path');
const refreshEvalsBtn = byId('refresh-evals-btn');
const evalSelect = byId('eval-select');
const loadStatusDisplay = byId('load-status');

// --- Example / run pickers and the details container ---
const exampleSelect = byId('example-select');
const runSelect = byId('run-select');
const runDetails = byId('run-details');
const taskText = byId('task-text');
const statusDisplay = byId('status-display');

// --- Screenshot viewer ---
const imageContainer = byId('image-container');
const noImages = byId('no-images');
const imageControls = byId('image-controls');
const currentImage = byId('current-image');
const imageCaption = byId('image-caption');
const imageCounter = byId('image-counter');
const imageSlider = byId('image-slider');
const prevImage = byId('prev-image');
const nextImage = byId('next-image');

// --- Agent trace and raw JSON panels ---
const agentSteps = byId('agent-steps');
const rawJson = byId('raw-json');
const jsonLoadingIndicator = byId('json-loading-indicator');
const jsonError = byId('json-error');

// The Refresh button re-reads the base path and reloads the evaluation list.
refreshEvalsBtn.addEventListener('click', loadEvaluations);
| |
| |
/**
 * Fetch the evaluation ids available under the base path typed by the user,
 * rebuild the evaluation dropdown and select its first entry.
 *
 * Reads the base path from the text field into `appState.basePath`, disables
 * the Refresh button while the request is in flight and reports progress and
 * errors through the status line. Errors are logged and displayed, never thrown.
 *
 * @returns {Promise<void>}
 */
async function loadEvaluations() {
    appState.basePath = basePathInput.value.trim();
    loadStatusDisplay.textContent = 'Loading evaluations...';
    refreshEvalsBtn.disabled = true;

    try {
        const response = await fetch(`/api/evals?path=${encodeURIComponent(appState.basePath)}`);
        if (!response.ok) {
            // The error body is not guaranteed to be JSON (e.g. a proxy error page);
            // fall back to the generic message instead of surfacing a parse error.
            const errorData = await response.json().catch(() => null);
            throw new Error(errorData?.error || 'Failed to load evaluations');
        }

        const payload = await response.json();
        if (!Array.isArray(payload)) {
            throw new Error('Unexpected response while loading evaluations');
        }

        // Sort in descending order BEFORE rendering so the dropdown order matches
        // the preselected first entry (presumably ids are timestamp-like, which
        // would put the most recent evaluation first; verify against the server).
        const evals = [...payload].sort().reverse();

        evalSelect.innerHTML = '<option value="">-- Select Evaluation --</option>';
        evals.forEach(evalId => {
            const option = document.createElement('option');
            option.value = evalId;
            option.textContent = evalId;
            evalSelect.appendChild(option);
        });

        loadStatusDisplay.textContent = `Loaded ${evals.length} evaluations`;

        // Select the first evaluation, or the placeholder when the list is empty.
        // The change event is always dispatched so that an empty list also clears
        // example/run dropdowns left over from a previous base path.
        evalSelect.value = evals.length > 0 ? evals[0] : '';
        evalSelect.dispatchEvent(new Event('change'));
    } catch (err) {
        console.error('Error loading evaluations:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        refreshEvalsBtn.disabled = false;
    }
}
| |
| |
// When the evaluation changes: reset everything that depends on it, then load
// the examples of the newly selected evaluation and auto-select the first one.
evalSelect.addEventListener('change', async () => {
    appState.evalId = evalSelect.value;

    // Reset dependent controls up front. Doing this before the request (rather
    // than only on success) guarantees that a failed load cannot leave examples
    // or run details from the previously selected evaluation on screen.
    exampleSelect.innerHTML = '<option value="">-- Select Example --</option>';
    exampleSelect.disabled = true;
    runSelect.innerHTML = '<option value="">-- Select Run --</option>';
    runSelect.disabled = true;
    runDetails.classList.add('hidden');

    if (!appState.evalId) {
        return;
    }

    try {
        loadStatusDisplay.textContent = 'Loading examples...';
        evalSelect.disabled = true;

        // Ids are used as URL path segments, so they must be percent-encoded.
        const evalSegment = encodeURIComponent(appState.evalId);
        const response = await fetch(`/api/eval/${evalSegment}/examples?path=${encodeURIComponent(appState.basePath)}`);
        if (!response.ok) {
            // Tolerate non-JSON error bodies and fall back to the generic message.
            const errorData = await response.json().catch(() => null);
            throw new Error(errorData?.error || 'Failed to load examples');
        }

        // The handler treats the payload as an object mapping example id -> task text.
        const examples = await response.json();
        appState.loadedData.examples = examples;

        const exampleIds = Object.keys(examples);
        for (const exampleId of exampleIds) {
            const option = document.createElement('option');
            option.value = exampleId;
            option.textContent = exampleId;
            option.title = examples[exampleId]; // task text as a hover tooltip
            exampleSelect.appendChild(option);
        }

        exampleSelect.disabled = false;
        loadStatusDisplay.textContent = `Loaded ${exampleIds.length} examples`;

        // Auto-select the first example and let its change handler load the runs.
        if (exampleIds.length > 0) {
            exampleSelect.value = exampleIds[0];
            exampleSelect.dispatchEvent(new Event('change'));
        }
    } catch (err) {
        console.error('Error loading examples:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        evalSelect.disabled = false;
    }
});
| |
| |
// When the example changes: clear the run picker, load the runs of the selected
// example, list them in natural order and auto-select the first one.
exampleSelect.addEventListener('change', async () => {
    appState.currentExampleId = exampleSelect.value;

    // Reset the run picker and hide the details immediately, so a failed load
    // cannot leave the previous example's run on screen.
    runSelect.innerHTML = '<option value="">-- Select Run --</option>';
    runSelect.disabled = true;
    runDetails.classList.add('hidden');

    if (!appState.currentExampleId) {
        return;
    }

    try {
        loadStatusDisplay.textContent = 'Loading runs...';
        exampleSelect.disabled = true;

        // Ids are used as URL path segments, so they must be percent-encoded.
        const evalSegment = encodeURIComponent(appState.evalId);
        const exampleSegment = encodeURIComponent(appState.currentExampleId);
        const response = await fetch(`/api/eval/${evalSegment}/example/${exampleSegment}/runs?path=${encodeURIComponent(appState.basePath)}`);
        if (!response.ok) {
            // Tolerate non-JSON error bodies and fall back to the generic message.
            const errorData = await response.json().catch(() => null);
            throw new Error(errorData?.error || 'Failed to load runs');
        }

        // Each run is read as an object with `id` and `status` fields.
        const runs = await response.json();
        appState.loadedData.runs[appState.currentExampleId] = runs;

        // Natural sort ("run_2" before "run_10"). String() keeps this working
        // should the server ever send numeric ids.
        runs.sort((a, b) => String(a.id).localeCompare(String(b.id), undefined, {numeric: true}));

        runs.forEach(run => {
            const option = document.createElement('option');
            option.value = run.id;
            option.textContent = `${run.id} (${run.status})`;
            option.dataset.status = run.status;
            runSelect.appendChild(option);
        });

        runSelect.disabled = false;
        loadStatusDisplay.textContent = `Loaded ${runs.length} runs`;

        // Auto-select the first run and let its change handler load the details.
        if (runs.length > 0) {
            runSelect.value = runs[0].id;
            runSelect.dispatchEvent(new Event('change'));
        }
    } catch (err) {
        console.error('Error loading runs:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
    } finally {
        exampleSelect.disabled = false;
    }
});
| |
| |
// Picking a run starts loading its data and reveals the details panel;
// picking the placeholder (or having no example) hides the panel.
runSelect.addEventListener('change', () => {
    const chosenRun = runSelect.value;
    appState.currentRunId = chosenRun;

    const hasSelection = Boolean(chosenRun && appState.currentExampleId);
    if (!hasSelection) {
        runDetails.classList.add('hidden');
        return;
    }

    // Kick off the (async) load first, then show the panel while it fills in.
    loadRunData(appState.currentExampleId, chosenRun);
    runDetails.classList.remove('hidden');
});
| |
| |
/**
 * Load and display everything known about one run: task text, status,
 * screenshots, agent trace and raw metadata JSON.
 *
 * Metadata and the screenshot list are fetched from the server and cached in
 * `appState.loadedData`. A failed metadata request is reported in the Raw JSON
 * panel and the remaining sections are still rendered. Any other failure is
 * logged and shown in the status line; nothing is thrown to the caller.
 *
 * @param {string} exampleId - Example the run belongs to.
 * @param {string} runId - Run to display.
 * @returns {Promise<void>}
 */
async function loadRunData(exampleId, runId) {
    loadStatusDisplay.textContent = 'Loading run data...';
    runSelect.disabled = true;
    jsonLoadingIndicator.classList.remove('hidden');
    jsonError.classList.add('hidden');

    // Ids are used as URL path segments, so they must be percent-encoded.
    const runUrl = `/api/eval/${encodeURIComponent(appState.evalId)}`
        + `/example/${encodeURIComponent(exampleId)}`
        + `/run/${encodeURIComponent(runId)}`;
    const pathQuery = `path=${encodeURIComponent(appState.basePath)}`;

    // Append one <p> to the status area. Text goes in via textContent so that
    // server-provided strings can never be interpreted as markup.
    const addStatusLine = (text, spanClass) => {
        const paragraph = document.createElement('p');
        if (spanClass) {
            const span = document.createElement('span');
            span.className = spanClass;
            span.textContent = text;
            paragraph.appendChild(span);
        } else {
            paragraph.textContent = text;
        }
        statusDisplay.appendChild(paragraph);
    };

    try {
        // --- Metadata ---
        const metadataResponse = await fetch(`${runUrl}/metadata?${pathQuery}`);
        let metadata = null;

        if (metadataResponse.ok) {
            metadata = await metadataResponse.json();
        } else {
            // Tolerate non-JSON error bodies.
            const errorData = await metadataResponse.json().catch(() => null);
            console.error('Error loading metadata:', errorData);
            jsonError.textContent = `Error loading metadata: ${errorData?.error || 'Unknown error'}`;
            jsonError.classList.remove('hidden');
        }

        appState.loadedData.metadata[exampleId] = appState.loadedData.metadata[exampleId] || {};
        appState.loadedData.metadata[exampleId][runId] = metadata;

        // --- Task text ---
        const task = appState.loadedData.examples[exampleId];
        taskText.textContent = task || "No task available";

        // --- Status (built from DOM nodes, not an HTML string) ---
        statusDisplay.innerHTML = '';
        if (metadata) {
            if (metadata.status === 'completed') {
                addStatusLine('✓ Completed successfully', 'status-success');
            } else {
                addStatusLine('✗ Failed', 'status-failure');
                if (metadata.error_message) {
                    addStatusLine(`Error: ${metadata.error_message}`);
                }
            }
        } else {
            addStatusLine('Status information not available');
        }

        // --- Screenshots ---
        // Only accept a successful response carrying an array. An error payload
        // would otherwise be treated as a non-empty image list and crash the viewer.
        const screenshotsResponse = await fetch(`${runUrl}/screenshots?${pathQuery}`);
        let screenshots = [];
        if (screenshotsResponse.ok) {
            const payload = await screenshotsResponse.json();
            if (Array.isArray(payload)) {
                screenshots = payload;
            } else {
                console.error('Unexpected screenshots payload:', payload);
            }
        } else {
            console.error('Error loading screenshots: HTTP', screenshotsResponse.status);
        }

        appState.loadedData.screenshots[exampleId] = appState.loadedData.screenshots[exampleId] || {};
        appState.loadedData.screenshots[exampleId][runId] = screenshots;

        loadScreenshots(exampleId, runId);

        // --- Agent trace and raw JSON ---
        renderAgentTrace(metadata);

        if (metadata) {
            rawJson.textContent = JSON.stringify(metadata, null, 2);
        } else {
            rawJson.textContent = "No metadata available";
        }

        // Every freshly loaded run opens on the Screenshots tab.
        document.querySelector('.tab[data-tab="screenshots"]').click();

        loadStatusDisplay.textContent = 'Run data loaded successfully';
    } catch (err) {
        console.error('Error loading run data:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
        jsonError.textContent = `Error loading data: ${err.message}`;
        jsonError.classList.remove('hidden');
    } finally {
        jsonLoadingIndicator.classList.add('hidden');
        runSelect.disabled = false;
    }
}
| |
| |
/**
 * Point the viewer at the cached screenshots of the given run and show either
 * the image viewer (starting at the first image) or the "no screenshots" note.
 *
 * @param {string} exampleId - Example the run belongs to.
 * @param {string} runId - Run whose cached screenshots should be shown.
 */
function loadScreenshots(exampleId, runId) {
    const images = appState.loadedData.screenshots[exampleId]?.[runId] || [];
    appState.currentImages = images;

    // Exactly one of "viewer + controls" and the empty-state note is visible.
    const hasImages = images.length !== 0;
    noImages.classList.toggle('hidden', hasImages);
    imageContainer.classList.toggle('hidden', !hasImages);
    imageControls.classList.toggle('hidden', !hasImages);

    if (!hasImages) {
        return;
    }

    // Slider spans the valid indices; min/max are set before value on purpose.
    imageSlider.min = 0;
    imageSlider.max = images.length - 1;
    imageSlider.value = 0;

    // Start on the first screenshot.
    appState.currentImageIndex = 0;
    updateImageDisplay();
}
| |
| |
/**
 * Sync the viewer with `appState.currentImageIndex`: image source, caption,
 * "n / total" counter, slider position and the enabled state of the
 * Previous/Next buttons. Does nothing when there are no images.
 */
function updateImageDisplay() {
    const { currentImages: images } = appState;
    const total = images.length;
    if (total === 0) {
        return;
    }

    const position = appState.currentImageIndex;
    const { path, name } = images[position];

    currentImage.src = path;
    imageCaption.textContent = name;
    imageCounter.textContent = `${position + 1} / ${total}`;
    imageSlider.value = position;

    // No wrap-around: the buttons are disabled at either end of the list.
    prevImage.disabled = position === 0;
    nextImage.disabled = position === total - 1;
}
| |
| |
// Previous: step back one screenshot unless already on the first.
prevImage.addEventListener('click', () => {
    const canGoBack = appState.currentImageIndex > 0;
    if (!canGoBack) {
        return;
    }
    appState.currentImageIndex -= 1;
    updateImageDisplay();
});

// Next: step forward one screenshot unless already on the last.
nextImage.addEventListener('click', () => {
    const lastIndex = appState.currentImages.length - 1;
    const canGoForward = appState.currentImageIndex < lastIndex;
    if (!canGoForward) {
        return;
    }
    appState.currentImageIndex += 1;
    updateImageDisplay();
});

// Slider: jump straight to the chosen index while dragging.
imageSlider.addEventListener('input', () => {
    const chosen = parseInt(imageSlider.value);
    appState.currentImageIndex = chosen;
    updateImageDisplay();
});
| |
| |
// Tab switching. Each `.tab` carries data-tab="x" and controls the panel with
// id "x-tab". The tabs are plain <div>s, which are neither focusable nor
// operable from the keyboard by default, so they are exposed as (toggle)
// buttons here and react to Enter/Space in addition to clicks.
document.querySelectorAll('.tab').forEach(tab => {
    const activate = () => {
        // Exactly one tab is marked active (and pressed) at a time.
        document.querySelectorAll('.tab').forEach(t => {
            t.classList.remove('active');
            t.setAttribute('aria-pressed', 'false');
        });
        tab.classList.add('active');
        tab.setAttribute('aria-pressed', 'true');

        // Show only the panel that belongs to this tab.
        const tabId = tab.getAttribute('data-tab');
        document.querySelectorAll('.tab-content').forEach(content => {
            content.classList.remove('active');
        });
        document.getElementById(`${tabId}-tab`).classList.add('active');
    };

    // Make the <div> reachable and understandable for keyboard and AT users.
    tab.setAttribute('role', 'button');
    tab.tabIndex = 0;
    tab.setAttribute('aria-pressed', tab.classList.contains('active') ? 'true' : 'false');

    tab.addEventListener('click', activate);
    tab.addEventListener('keydown', (e) => {
        if (e.key === 'Enter' || e.key === ' ') {
            e.preventDefault(); // keep Space from scrolling the page
            activate();
        }
    });
});
| |
| |
/**
 * Render the agent trace found in `metadata.summary` as a list of collapsible
 * steps inside the Agent Trace panel.
 *
 * Each step gets a header ("Task", "Planning", "Action N", "Error" or
 * "Step N") and a body listing whichever of these fields the step carries:
 * task, model output and plan, tool calls with arguments, model reasoning,
 * observations, action output and error details. Labels are real <strong>
 * elements and all values are inserted as text nodes, so trace content is
 * never interpreted as markup.
 *
 * @param {?Object} metadata - Run metadata; may be null when it failed to load.
 */
function renderAgentTrace(metadata) {
    agentSteps.innerHTML = '';

    const steps = metadata && metadata.summary;
    if (!Array.isArray(steps) || steps.length === 0) {
        agentSteps.innerHTML = '<p>No agent trace data available</p>';
        return;
    }

    // Strings are shown as-is; anything else (e.g. tool arguments delivered as
    // an object) is pretty-printed instead of collapsing to "[object Object]".
    const asText = (value) => (typeof value === 'string' ? value : JSON.stringify(value, null, 2));

    // Append "<strong>label</strong>" followed by the value. Block fields put
    // the value on its own line followed by a blank line; inline fields keep
    // it on the label's line. The body uses `white-space: pre-wrap`, so the
    // newlines are rendered.
    const appendField = (parent, label, value, inline = false, trailing = '\n\n') => {
        const strong = document.createElement('strong');
        strong.textContent = label;
        parent.appendChild(strong);
        parent.appendChild(document.createTextNode(`${inline ? ' ' : '\n'}${asText(value)}${trailing}`));
    };

    steps.forEach((step, index) => {
        const stepDiv = document.createElement('div');
        stepDiv.className = 'step';

        // --- Header: title on the left, expand/collapse indicator on the right ---
        let headerText = `Step ${index}`;
        if (index === 0 && step.task) {
            headerText = 'Task';
        } else if (step.model_output_message) {
            headerText = 'Planning';
        } else if (step.tool_calls) {
            headerText = `Action ${index}`;
        } else if (step.error) {
            headerText = 'Error';
        }

        const headerDiv = document.createElement('div');
        headerDiv.className = 'step-header';
        const titleSpan = document.createElement('span');
        titleSpan.textContent = headerText;
        const indicatorSpan = document.createElement('span');
        indicatorSpan.textContent = '▲';
        headerDiv.appendChild(titleSpan);
        headerDiv.appendChild(indicatorSpan);
        stepDiv.appendChild(headerDiv);

        // --- Body (expanded by default) ---
        const contentDiv = document.createElement('div');
        contentDiv.className = 'step-content';
        contentDiv.style.display = 'block';

        // The first step may carry the task statement itself.
        if (index === 0 && step.task) {
            contentDiv.appendChild(document.createTextNode(`${asText(step.task)}\n\n`));
        }

        // Planning output; the plan is only shown together with a model message.
        if (step.model_output_message && step.model_output_message.content) {
            appendField(contentDiv, 'Model Output:', step.model_output_message.content);
            if (step.plan) {
                appendField(contentDiv, 'Plan:', step.plan);
            }
        }

        if (Array.isArray(step.tool_calls)) {
            step.tool_calls.forEach(toolCall => {
                if (toolCall && toolCall.function) {
                    appendField(contentDiv, 'Tool Call:', toolCall.function.name, true, '\n');
                    if (toolCall.function.arguments) {
                        appendField(contentDiv, 'Arguments:', toolCall.function.arguments);
                    }
                }
            });
        }

        if (step.model_output) {
            appendField(contentDiv, 'Model Reasoning:', step.model_output);
        }

        if (step.observations) {
            appendField(contentDiv, 'Observations:', step.observations);
        }

        if (step.action_output) {
            appendField(contentDiv, 'Action Output:', step.action_output);
        }

        if (step.error) {
            appendField(contentDiv, 'Error Type:', step.error.type || 'Unknown', true, '\n');
            if (step.error.message) {
                appendField(contentDiv, 'Error Message:', step.error.message, true, '\n');
            }
        }

        if (!contentDiv.hasChildNodes()) {
            contentDiv.textContent = "No content available for this step";
        }
        stepDiv.appendChild(contentDiv);

        // Clicking the header collapses/expands the body and flips the indicator.
        headerDiv.addEventListener('click', () => {
            const isHidden = contentDiv.style.display === 'none';
            contentDiv.style.display = isHidden ? 'block' : 'none';
            indicatorSpan.textContent = isHidden ? '▲' : '▼';
        });

        agentSteps.appendChild(stepDiv);
    });
}
| |
| |
// Keyboard navigation: Left/Right arrows step through the screenshots while
// the Screenshots tab is the active one.
document.addEventListener('keydown', (e) => {
    if (!appState.currentImages || appState.currentImages.length === 0) return;

    // Leave modified arrows alone (e.g. Alt+Left is "back" in many browsers).
    if (e.altKey || e.ctrlKey || e.metaKey) return;

    // Arrow keys already mean something inside form controls: they move the
    // caret in the text field, change the option in a <select> and move the
    // range slider. Handling them here as well would flip screenshots as a side
    // effect and make the slider advance two images per key press.
    const target = e.target;
    if (target instanceof Element && target.closest('input, select, textarea')) return;

    const screenshotsTab = document.getElementById('screenshots-tab');
    if (!screenshotsTab.classList.contains('active')) return;

    if (e.key === 'ArrowLeft' && appState.currentImageIndex > 0) {
        appState.currentImageIndex--;
        updateImageDisplay();
    } else if (e.key === 'ArrowRight' && appState.currentImageIndex < appState.currentImages.length - 1) {
        appState.currentImageIndex++;
        updateImageDisplay();
    }
});
| |
| |
// Load the evaluation list as soon as the document has been parsed.
document.addEventListener('DOMContentLoaded', (event) => loadEvaluations(event));
| </script> |
| </body> |
| </html> |