# mcp_servers.py (Corrected for GOOGLE_API_KEY)
import asyncio
import os
import httpx
import json
import google.generativeai as genai
import anthropic
import openai
from personas import PERSONAS_DATA
# --- 1. Load API Keys from Blaxel Secrets ---
# --- THIS IS THE FIX ---
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY") # Use the secret name from your screenshot
# ---
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
SAMBANOVA_API_KEY = os.getenv("SAMBANOVA_API_KEY")
SAMBANOVA_BASE_URL = os.getenv("SAMBANOVA_BASE_URL", "https://api.sambanova.ai/v1")
# --- 2. Configure API Clients ---
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-1.5-pro-latest')
anthropic_client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
sambanova_client = openai.AsyncOpenAI(
    api_key=SAMBANOVA_API_KEY,
    base_url=SAMBANOVA_BASE_URL
)
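# Optional early sanity check (a minimal sketch, assuming these three secrets are
# the only ones required): warn at startup if a key is missing instead of failing
# on the first live API call.
for _key_name, _key_value in [
    ("GOOGLE_API_KEY", GEMINI_API_KEY),
    ("ANTHROPIC_API_KEY", ANTHROPIC_API_KEY),
    ("SAMBANOVA_API_KEY", SAMBANOVA_API_KEY),
]:
    if not _key_value:
        print(f"WARNING: environment variable {_key_name} is not set.")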
# This is the prompt from your 'LLM judges prompt v3.0.docx'.
# NOTE: literal braces in the JSON example below are doubled ({{ }}) so that
# str.format() only substitutes the {problem} and {solution_text} placeholders.
EVALUATION_PROMPT_TEMPLATE = """
You are an impartial and objective AI evaluator specializing in assessing business solutions.
Your task is to critically analyze a proposed solution to a given business problem.
You will evaluate the solution across five specific dimensions: Novelty, Usefulness/Feasibility, Flexibility, Elaboration, and Cultural Appropriateness/Sensitivity.
**Evaluation Criteria:**
Assign a score from 1 to 5 for each criterion (1=Very Low, 5=Very High).
You MUST provide a brief, specific justification (1-3 sentences) for each score.
**Definitions:**
1. **Novelty:** How original, unexpected, or non-obvious is the solution?
2. **Usefulness/Feasibility:** Is the solution practical, implementable, and likely to be effective?
3. **Flexibility:** Does the solution offer diverse approaches or adaptable ideas?
4. **Elaboration:** Is the solution well-explained, clear, and sufficiently detailed?
5. **Cultural Appropriateness/Sensitivity:** How well does the solution consider and align with potential cultural factors?
**Business Problem:**
{problem}
**Proposed Solution:**
{solution_text}
**Output Format:**
You MUST return *only* a valid JSON object in the following format:
{{
    "Novelty": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Usefulness_Feasibility": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Flexibility": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Elaboration": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Cultural_Appropriateness": {{"score": <score_int>, "justification": "<justification_str>"}}
}}
"""
class BusinessSolutionEvaluator:
    """Implements the "LLM-as-a-Judge" pattern with a live call to Gemini."""

    async def evaluate(self, problem: str, solution_text: str) -> dict:
        print(f"Evaluating solution (live): {solution_text[:50]}...")
        prompt = EVALUATION_PROMPT_TEMPLATE.format(problem=problem, solution_text=solution_text)
        try:
            response = await gemini_model.generate_content_async(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    response_mime_type="application/json"
                )
            )
            # Strip any markdown code fences the model may wrap around the JSON.
            json_text = response.text.strip().replace("```json", "").replace("```", "")
            v_fitness = json.loads(json_text)
            print(f"Evaluation complete (live): {v_fitness}")
            return v_fitness
        except Exception as e:
            print(f"ERROR: BusinessSolutionEvaluator failed: {e}")
            # Fall back to minimum scores so the calling code can keep going.
            return {
                "Novelty": {"score": 1, "justification": "Error during evaluation."},
                "Usefulness_Feasibility": {"score": 1, "justification": "Error during evaluation."},
                "Flexibility": {"score": 1, "justification": "Error during evaluation."},
                "Elaboration": {"score": 1, "justification": "Error during evaluation."},
                "Cultural_Appropriateness": {"score": 1, "justification": "Error during evaluation."}
            }
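# Example usage (a minimal sketch; the problem and solution strings are placeholders
# and the call hits the live Gemini API):
#
#   evaluator = BusinessSolutionEvaluator()
#   scores = asyncio.run(evaluator.evaluate(
#       "How can a small retailer expand into a new regional market?",
#       "Partner with local delivery co-ops to pilot a subscription box service.",
#   ))
#   print(scores["Novelty"]["score"], scores["Novelty"]["justification"])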
# --- 3. Unified API Call Function ---
async def get_llm_response(client_name: str, system_prompt: str, user_prompt: str) -> str:
    """A single function to handle calling any of the three sponsor LLMs."""
    try:
        if client_name == "Gemini":
            # Gemini takes the persona as the opening turn of a short chat history
            # rather than as a dedicated system parameter.
            chat = gemini_model.start_chat(history=[
                {'role': 'user', 'parts': [system_prompt]},
                {'role': 'model', 'parts': ["Understood. I will act as this persona."]}
            ])
            response = await chat.send_message_async(user_prompt)
            return response.text
        elif client_name == "Anthropic":
            response = await anthropic_client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=2048,
                system=system_prompt,
                messages=[{"role": "user", "content": user_prompt}]
            )
            return response.content[0].text
        elif client_name == "SambaNova":
            completion = await sambanova_client.chat.completions.create(
                model="Meta-Llama-3.1-8B-Instruct",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
            )
            return completion.choices[0].message.content
        else:
            # Unknown client names fall through to the same error string the
            # callers already check for.
            return f"Error generating response from {client_name}: unknown client."
    except Exception as e:
        print(f"ERROR: API call to {client_name} failed: {e}")
        return f"Error generating response from {client_name}."
class AgentCalibrator:
    """Tests the sponsor LLMs with live API calls."""

    def __init__(self, evaluator: BusinessSolutionEvaluator):
        self.evaluator = evaluator
        self.sponsor_llms = ["Gemini", "Anthropic", "SambaNova"]

    async def calibrate_team(self, problem: str) -> dict:
        print("Running LIVE calibration test for specialist team...")
        roles_to_test = {
            "Plant": PERSONAS_DATA["Culture_5"]["description"],
            "Implementer": PERSONAS_DATA["Culture_Expert"]["description"],
            "Monitor": PERSONAS_DATA["Culture_11"]["description"]
        }
        test_problem = f"For the business problem '{problem}', generate a single, brief, one-paragraph concept-level solution."

        # Run every role/LLM combination concurrently.
        tasks = []
        for role, persona in roles_to_test.items():
            for llm in self.sponsor_llms:
                tasks.append(self.run_calibration_test(problem, role, llm, persona, test_problem))
        results = await asyncio.gather(*tasks)

        # Select the best LLM for each role, judged on the metric that role cares about.
        best_llms = {}
        role_metrics = {
            "Plant": "Novelty",
            "Implementer": "Usefulness_Feasibility",
            "Monitor": "Cultural_Appropriateness"
        }
        for role in roles_to_test.keys():
            best_score = -1
            best_llm = "None"
            for res in results:
                if res["role"] == role:
                    metric = role_metrics[role]
                    score = res.get("score", {}).get(metric, {}).get("score", 0)
                    if score > best_score:
                        best_score = score
                        best_llm = res["llm"]
            best_llms[role] = best_llm

        team_plan = {
            "Plant": {"persona": "Culture_5", "llm": best_llms["Plant"]},
            "Implementer": {"persona": "Culture_Expert", "llm": best_llms["Implementer"]},
            "Monitor": {"persona": "Culture_11", "llm": best_llms["Monitor"]}
        }
        print(f"Calibration complete (live). Team plan: {team_plan}")
        return team_plan
    async def run_calibration_test(self, problem, role, llm, persona, test_problem):
        """Helper to run a single test and evaluation."""
        print(f"...Calibrating {role} on {llm}...")
        solution = await get_llm_response(llm, persona, test_problem)
        if "Error generating response" in solution:
            # The API call failed; return zero scores so this LLM is never selected.
            return {"role": role, "llm": llm, "score": {
                "Novelty": {"score": 0},
                "Usefulness_Feasibility": {"score": 0},
                "Cultural_Appropriateness": {"score": 0}
            }}
        score = await self.evaluator.evaluate(problem, solution)
        return {"role": role, "llm": llm, "score": score}