# mcp_servers.py (Corrected for GOOGLE_API_KEY)
import asyncio
import os
import httpx
import json
import google.generativeai as genai
import anthropic
import openai
from personas import PERSONAS_DATA

# --- 1. Load API Keys from Blaxel Secrets ---
# --- THIS IS THE FIX ---
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY")  # Use the secret name from your screenshot
# ---
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
SAMBANOVA_API_KEY = os.getenv("SAMBANOVA_API_KEY")
SAMBANOVA_BASE_URL = os.getenv("SAMBANOVA_BASE_URL", "https://api.sambanova.ai/v1")

# --- 2. Configure API Clients ---
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-1.5-pro-latest')
anthropic_client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
sambanova_client = openai.AsyncOpenAI(
    api_key=SAMBANOVA_API_KEY,
    base_url=SAMBANOVA_BASE_URL
)
# This is the prompt from your 'LLM judges prompt v3.0.docx'
# NOTE: literal braces in the JSON example are doubled ({{ }}) so that str.format()
# only substitutes the {problem} and {solution_text} fields.
EVALUATION_PROMPT_TEMPLATE = """
You are an impartial and objective AI evaluator specializing in assessing business solutions.
Your task is to critically analyze a proposed solution to a given business problem.
You will evaluate the solution across five specific dimensions: Novelty, Usefulness/Feasibility, Flexibility, Elaboration, and Cultural Appropriateness/Sensitivity.

**Evaluation Criteria:**
Assign a score from 1 to 5 for each criterion (1=Very Low, 5=Very High).
You MUST provide a brief, specific justification (1-3 sentences) for each score.

**Definitions:**
1. **Novelty:** How original, unexpected, or non-obvious is the solution?
2. **Usefulness/Feasibility:** Is the solution practical, implementable, and likely to be effective?
3. **Flexibility:** Does the solution offer diverse approaches or adaptable ideas?
4. **Elaboration:** Is the solution well-explained, clear, and sufficiently detailed?
5. **Cultural Appropriateness/Sensitivity:** How well does the solution consider and align with potential cultural factors?

**Business Problem:**
{problem}

**Proposed Solution:**
{solution_text}

**Output Format:**
You MUST return *only* a valid JSON object in the following format:
{{
    "Novelty": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Usefulness_Feasibility": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Flexibility": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Elaboration": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Cultural_Appropriateness": {{"score": <score_int>, "justification": "<justification_str>"}}
}}
"""
class BusinessSolutionEvaluator:
    """Implements the "LLM-as-a-Judge" with a live call to Gemini."""

    async def evaluate(self, problem: str, solution_text: str) -> dict:
        print(f"Evaluating solution (live): {solution_text[:50]}...")
        prompt = EVALUATION_PROMPT_TEMPLATE.format(problem=problem, solution_text=solution_text)
        try:
            response = await gemini_model.generate_content_async(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    response_mime_type="application/json"
                )
            )
            # Strip any stray markdown code fences before parsing the JSON payload.
            json_text = response.text.strip().replace("```json", "").replace("```", "")
            v_fitness = json.loads(json_text)
            print(f"Evaluation complete (live): {v_fitness}")
            return v_fitness
        except Exception as e:
            print(f"ERROR: BusinessSolutionEvaluator failed: {e}")
            # Fall back to minimum scores so the calibration loop can continue.
            return {
                "Novelty": {"score": 1, "justification": "Error during evaluation."},
                "Usefulness_Feasibility": {"score": 1, "justification": "Error during evaluation."},
                "Flexibility": {"score": 1, "justification": "Error during evaluation."},
                "Elaboration": {"score": 1, "justification": "Error during evaluation."},
                "Cultural_Appropriateness": {"score": 1, "justification": "Error during evaluation."}
            }
# --- 3. Unified API Call Function ---
async def get_llm_response(client_name: str, system_prompt: str, user_prompt: str) -> str:
    """A single function to handle calling any of the three sponsor LLMs."""
    try:
        if client_name == "Gemini":
            # Gemini has no dedicated system role, so the persona is seeded as chat history.
            chat = gemini_model.start_chat(history=[
                {'role': 'user', 'parts': [system_prompt]},
                {'role': 'model', 'parts': ["Understood. I will act as this persona."]}
            ])
            response = await chat.send_message_async(user_prompt)
            return response.text
        elif client_name == "Anthropic":
            response = await anthropic_client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=2048,
                system=system_prompt,
                messages=[{"role": "user", "content": user_prompt}]
            )
            return response.content[0].text
        elif client_name == "SambaNova":
            completion = await sambanova_client.chat.completions.create(
                model="Meta-Llama-3.1-8B-Instruct",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
            )
            return completion.choices[0].message.content
        else:
            return f"Error generating response from {client_name}: unknown client."
    except Exception as e:
        print(f"ERROR: API call to {client_name} failed: {e}")
        return f"Error generating response from {client_name}."
class AgentCalibrator:
    """Tests the sponsor LLMs with live API calls."""

    def __init__(self, evaluator: BusinessSolutionEvaluator):
        self.evaluator = evaluator
        self.sponsor_llms = ["Gemini", "Anthropic", "SambaNova"]

    async def calibrate_team(self, problem: str) -> dict:
        print("Running LIVE calibration test for specialist team...")
        roles_to_test = {
            "Plant": PERSONAS_DATA["Culture_5"]["description"],
            "Implementer": PERSONAS_DATA["Culture_Expert"]["description"],
            "Monitor": PERSONAS_DATA["Culture_11"]["description"]
        }
        test_problem = f"For the business problem '{problem}', generate a single, brief, one-paragraph concept-level solution."
        tasks = []
        for role, persona in roles_to_test.items():
            for llm in self.sponsor_llms:
                tasks.append(self.run_calibration_test(problem, role, llm, persona, test_problem))
        results = await asyncio.gather(*tasks)

        best_llms = {}
        role_metrics = {
            "Plant": "Novelty",
            "Implementer": "Usefulness_Feasibility",
            "Monitor": "Cultural_Appropriateness"
        }
        for role in roles_to_test.keys():
            best_score = -1
            best_llm = "None"
            for res in results:
                if res["role"] == role:
                    metric = role_metrics[role]
                    score = res.get("score", {}).get(metric, {}).get("score", 0)
                    if score > best_score:
                        best_score = score
                        best_llm = res["llm"]
            best_llms[role] = best_llm

        team_plan = {
            "Plant": {"persona": "Culture_5", "llm": best_llms["Plant"]},
            "Implementer": {"persona": "Culture_Expert", "llm": best_llms["Implementer"]},
            "Monitor": {"persona": "Culture_11", "llm": best_llms["Monitor"]}
        }
        print(f"Calibration complete (live). Team plan: {team_plan}")
        return team_plan
    async def run_calibration_test(self, problem, role, llm, persona, test_problem):
        """Helper to run a single test and evaluation."""
        print(f"...Calibrating {role} on {llm}...")
        solution = await get_llm_response(llm, persona, test_problem)
        if "Error generating response" in solution:
            # The API call failed, so score this candidate at zero rather than judging an error message.
            return {"role": role, "llm": llm, "score": {
                "Novelty": {"score": 0},
                "Usefulness_Feasibility": {"score": 0},
                "Cultural_Appropriateness": {"score": 0}
            }}
        score = await self.evaluator.evaluate(problem, solution)
        return {"role": role, "llm": llm, "score": score}