# mcp_servers.py: API client configuration, LLM-as-a-Judge evaluation, and agent calibration for MudabbirAI
import asyncio
import os
import json
import google.generativeai as genai
import anthropic
import openai
from personas import PERSONAS_DATA
# --- 1. Load API Keys from Blaxel Secrets ---
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY")  # The Gemini key is stored under the GOOGLE_API_KEY secret name
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
SAMBANOVA_API_KEY = os.getenv("SAMBANOVA_API_KEY")
SAMBANOVA_BASE_URL = os.getenv("SAMBANOVA_BASE_URL", "https://api.sambanova.ai/v1")
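# Optional sanity check (a sketch): warn early if an expected secret is missing,
# rather than failing later inside an API call.
_missing_keys = [name for name, value in {
    "GOOGLE_API_KEY": GEMINI_API_KEY,
    "ANTHROPIC_API_KEY": ANTHROPIC_API_KEY,
    "SAMBANOVA_API_KEY": SAMBANOVA_API_KEY,
}.items() if not value]
if _missing_keys:
    print(f"WARNING: missing API keys: {', '.join(_missing_keys)}")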
# --- 2. Configure API Clients ---
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-1.5-pro-latest')
anthropic_client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
sambanova_client = openai.AsyncOpenAI(
api_key=SAMBANOVA_API_KEY,
base_url=SAMBANOVA_BASE_URL
)
# Evaluation prompt adapted from the 'LLM judges prompt v3.0.docx' specification
EVALUATION_PROMPT_TEMPLATE = """
You are an impartial and objective AI evaluator specializing in assessing business solutions.
Your task is to critically analyze a proposed solution to a given business problem.
You will evaluate the solution across five specific dimensions: Novelty, Usefulness/Feasibility, Flexibility, Elaboration, and Cultural Appropriateness/Sensitivity.
**Evaluation Criteria:**
Assign a score from 1 to 5 for each criterion (1=Very Low, 5=Very High).
You MUST provide a brief, specific justification (1-3 sentences) for each score.
**Definitions:**
1. **Novelty:** How original, unexpected, or non-obvious is the solution?
2. **Usefulness/Feasibility:** Is the solution practical, implementable, and likely to be effective?
3. **Flexibility:** Does the solution offer diverse approaches or adaptable ideas?
4. **Elaboration:** Is the solution well-explained, clear, and sufficiently detailed?
5. **Cultural Appropriateness/Sensitivity:** How well does the solution consider and align with potential cultural factors?
**Business Problem:**
{problem}
**Proposed Solution:**
{solution_text}
**Output Format:**
You MUST return *only* a valid JSON object in the following format:
{{
"Novelty": {{"score": <score_int>, "justification": "<justification_str>"}},
"Usefulness_Feasibility": {{"score": <score_int>, "justification": "<justification_str>"}},
"Flexibility": {{"score": <score_int>, "justification": "<justification_str>"}},
"Elaboration": {{"score": <score_int>, "justification": "<justification_str>"}},
"Cultural_Appropriateness": {{"score": <score_int>, "justification": "<justification_str>"}}
}}
"""
class BusinessSolutionEvaluator:
"""Implements the "LLM-as-a-Judge" with a live call to Gemini."""
async def evaluate(self, problem: str, solution_text: str) -> dict:
print(f"Evaluating solution (live): {solution_text[:50]}...")
prompt = EVALUATION_PROMPT_TEMPLATE.format(problem=problem, solution_text=solution_text)
try:
response = await gemini_model.generate_content_async(
prompt,
generation_config=genai.types.GenerationConfig(
response_mime_type="application/json"
)
)
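            # Defensive cleanup: strip any Markdown code fences the model may wrap around the JSON.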
json_text = response.text.strip().replace("```json", "").replace("```", "")
v_fitness = json.loads(json_text)
print(f"Evaluation complete (live): {v_fitness}")
return v_fitness
except Exception as e:
print(f"ERROR: BusinessSolutionEvaluator failed: {e}")
return {
"Novelty": {"score": 1, "justification": "Error during evaluation."},
"Usefulness_Feasibility": {"score": 1, "justification": "Error during evaluation."},
"Flexibility": {"score": 1, "justification": "Error during evaluation."},
"Elaboration": {"score": 1, "justification": "Error during evaluation."},
"Cultural_Appropriateness": {"score": 1, "justification": "Error during evaluation."}
}
# --- 3. Unified API Call Function ---
async def get_llm_response(client_name: str, system_prompt: str, user_prompt: str) -> str:
"""A single function to handle calling any of the three sponsor LLMs."""
try:
if client_name == "Gemini":
chat = gemini_model.start_chat(history=[
{'role': 'user', 'parts': [system_prompt]},
{'role': 'model', 'parts': ["Understood. I will act as this persona."]}
])
response = await chat.send_message_async(user_prompt)
return response.text
elif client_name == "Anthropic":
response = await anthropic_client.messages.create(
model="claude-3-opus-20240229",
max_tokens=2048,
system=system_prompt,
messages=[{"role": "user", "content": user_prompt}]
)
return response.content[0].text
elif client_name == "SambaNova":
completion = await sambanova_client.chat.completions.create(
model="Meta-Llama-3.1-8B-Instruct",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
]
)
            return completion.choices[0].message.content
        else:
            return f"Error generating response from {client_name} (unknown client name)."
except Exception as e:
print(f"ERROR: API call to {client_name} failed: {e}")
return f"Error generating response from {client_name}."
class AgentCalibrator:
"""Tests the sponsor LLMs with live API calls."""
def __init__(self, evaluator: BusinessSolutionEvaluator):
self.evaluator = evaluator
self.sponsor_llms = ["Gemini", "Anthropic", "SambaNova"]
async def calibrate_team(self, problem: str) -> dict:
print("Running LIVE calibration test for specialist team...")
roles_to_test = {
"Plant": PERSONAS_DATA["Culture_5"]["description"],
"Implementer": PERSONAS_DATA["Culture_Expert"]["description"],
"Monitor": PERSONAS_DATA["Culture_11"]["description"]
}
test_problem = f"For the business problem '{problem}', generate a single, brief, one-paragraph concept-level solution."
tasks = []
for role, persona in roles_to_test.items():
for llm in self.sponsor_llms:
tasks.append(self.run_calibration_test(problem, role, llm, persona, test_problem))
results = await asyncio.gather(*tasks)
best_llms = {}
role_metrics = {
"Plant": "Novelty",
"Implementer": "Usefulness_Feasibility",
"Monitor": "Cultural_Appropriateness"
}
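        # For each role, keep whichever sponsor LLM scored highest on that role's key metric.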
for role in roles_to_test.keys():
best_score = -1
best_llm = "None"
for res in results:
if res["role"] == role:
metric = role_metrics[role]
score = res.get("score", {}).get(metric, {}).get("score", 0)
if score > best_score:
best_score = score
best_llm = res["llm"]
best_llms[role] = best_llm
team_plan = {
"Plant": {"persona": "Culture_5", "llm": best_llms["Plant"]},
"Implementer": {"persona": "Culture_Expert", "llm": best_llms["Implementer"]},
"Monitor": {"persona": "Culture_11", "llm": best_llms["Monitor"]}
}
print(f"Calibration complete (live). Team plan: {team_plan}")
return team_plan
async def run_calibration_test(self, problem, role, llm, persona, test_problem):
"""Helper to run a single test and evaluation."""
print(f"...Calibrating {role} on {llm}...")
solution = await get_llm_response(llm, persona, test_problem)
if "Error generating response" in solution:
return {"role": role, "llM": llm, "score": {
"Novelty": {"score": 0},
"Usefulness_Feasibility": {"score": 0},
"Cultural_Appropriateness": {"score": 0}
}}
score = await self.evaluator.evaluate(problem, solution)
return {"role": role, "llm": llm, "score": score}