# mcp_servers.py (Corrected for GOOGLE_API_KEY)
import asyncio
import os
import httpx
import json
import google.generativeai as genai
import anthropic
import openai
from personas import PERSONAS_DATA

# --- 1. Load API Keys from Blaxel Secrets ---
# --- THIS IS THE FIX ---
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY")  # Use the secret name from your screenshot
# ---
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
SAMBANOVA_API_KEY = os.getenv("SAMBANOVA_API_KEY")
SAMBANOVA_BASE_URL = os.getenv("SAMBANOVA_BASE_URL", "https://api.sambanova.ai/v1")

# --- 2. Configure API Clients ---
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-1.5-pro-latest')
anthropic_client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
sambanova_client = openai.AsyncOpenAI(
    api_key=SAMBANOVA_API_KEY,
    base_url=SAMBANOVA_BASE_URL
)
# This is the prompt from your 'LLM judges prompt v3.0.docx'
# NOTE: literal braces in the JSON example are doubled ({{ }}) so that str.format()
# only substitutes the {problem} and {solution_text} fields.
EVALUATION_PROMPT_TEMPLATE = """
You are an impartial and objective AI evaluator specializing in assessing business solutions.
Your task is to critically analyze a proposed solution to a given business problem.
You will evaluate the solution across five specific dimensions: Novelty, Usefulness/Feasibility, Flexibility, Elaboration, and Cultural Appropriateness/Sensitivity.

**Evaluation Criteria:**
Assign a score from 1 to 5 for each criterion (1=Very Low, 5=Very High).
You MUST provide a brief, specific justification (1-3 sentences) for each score.

**Definitions:**
1. **Novelty:** How original, unexpected, or non-obvious is the solution?
2. **Usefulness/Feasibility:** Is the solution practical, implementable, and likely to be effective?
3. **Flexibility:** Does the solution offer diverse approaches or adaptable ideas?
4. **Elaboration:** Is the solution well-explained, clear, and sufficiently detailed?
5. **Cultural Appropriateness/Sensitivity:** How well does the solution consider and align with potential cultural factors?

**Business Problem:**
{problem}

**Proposed Solution:**
{solution_text}

**Output Format:**
You MUST return *only* a valid JSON object in the following format:
{{
    "Novelty": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Usefulness_Feasibility": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Flexibility": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Elaboration": {{"score": <score_int>, "justification": "<justification_str>"}},
    "Cultural_Appropriateness": {{"score": <score_int>, "justification": "<justification_str>"}}
}}
"""
class BusinessSolutionEvaluator:
    """Implements the "LLM-as-a-Judge" with a live call to Gemini."""

    async def evaluate(self, problem: str, solution_text: str) -> dict:
        print(f"Evaluating solution (live): {solution_text[:50]}...")
        prompt = EVALUATION_PROMPT_TEMPLATE.format(problem=problem, solution_text=solution_text)
        try:
            response = await gemini_model.generate_content_async(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    response_mime_type="application/json"
                )
            )
            # Strip any stray markdown code fences before parsing the JSON payload.
            json_text = response.text.strip().replace("```json", "").replace("```", "")
            v_fitness = json.loads(json_text)
            print(f"Evaluation complete (live): {v_fitness}")
            return v_fitness
        except Exception as e:
            print(f"ERROR: BusinessSolutionEvaluator failed: {e}")
            # Fall back to minimum scores so the calibration loop can continue.
            return {
                "Novelty": {"score": 1, "justification": "Error during evaluation."},
                "Usefulness_Feasibility": {"score": 1, "justification": "Error during evaluation."},
                "Flexibility": {"score": 1, "justification": "Error during evaluation."},
                "Elaboration": {"score": 1, "justification": "Error during evaluation."},
                "Cultural_Appropriateness": {"score": 1, "justification": "Error during evaluation."}
            }
# --- 3. Unified API Call Function ---
async def get_llm_response(client_name: str, system_prompt: str, user_prompt: str) -> str:
    """A single function to handle calling any of the three sponsor LLMs."""
    try:
        if client_name == "Gemini":
            # Gemini has no dedicated system role, so the persona is seeded as chat history.
            chat = gemini_model.start_chat(history=[
                {'role': 'user', 'parts': [system_prompt]},
                {'role': 'model', 'parts': ["Understood. I will act as this persona."]}
            ])
            response = await chat.send_message_async(user_prompt)
            return response.text
        elif client_name == "Anthropic":
            response = await anthropic_client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=2048,
                system=system_prompt,
                messages=[{"role": "user", "content": user_prompt}]
            )
            return response.content[0].text
        elif client_name == "SambaNova":
            completion = await sambanova_client.chat.completions.create(
                model="Meta-Llama-3.1-8B-Instruct",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
            )
            return completion.choices[0].message.content
        else:
            return f"Error generating response from {client_name}: unknown client."
    except Exception as e:
        print(f"ERROR: API call to {client_name} failed: {e}")
        return f"Error generating response from {client_name}."
class AgentCalibrator:
    """Tests the sponsor LLMs with live API calls."""

    def __init__(self, evaluator: BusinessSolutionEvaluator):
        self.evaluator = evaluator
        self.sponsor_llms = ["Gemini", "Anthropic", "SambaNova"]

    async def calibrate_team(self, problem: str) -> dict:
        print("Running LIVE calibration test for specialist team...")
        roles_to_test = {
            "Plant": PERSONAS_DATA["Culture_5"]["description"],
            "Implementer": PERSONAS_DATA["Culture_Expert"]["description"],
            "Monitor": PERSONAS_DATA["Culture_11"]["description"]
        }
        test_problem = f"For the business problem '{problem}', generate a single, brief, one-paragraph concept-level solution."
        tasks = []
        for role, persona in roles_to_test.items():
            for llm in self.sponsor_llms:
                tasks.append(self.run_calibration_test(problem, role, llm, persona, test_problem))
        results = await asyncio.gather(*tasks)

        best_llms = {}
        role_metrics = {
            "Plant": "Novelty",
            "Implementer": "Usefulness_Feasibility",
            "Monitor": "Cultural_Appropriateness"
        }
        for role in roles_to_test.keys():
            best_score = -1
            best_llm = "None"
            for res in results:
                if res["role"] == role:
                    metric = role_metrics[role]
                    score = res.get("score", {}).get(metric, {}).get("score", 0)
                    if score > best_score:
                        best_score = score
                        best_llm = res["llm"]
            best_llms[role] = best_llm

        team_plan = {
            "Plant": {"persona": "Culture_5", "llm": best_llms["Plant"]},
            "Implementer": {"persona": "Culture_Expert", "llm": best_llms["Implementer"]},
            "Monitor": {"persona": "Culture_11", "llm": best_llms["Monitor"]}
        }
        print(f"Calibration complete (live). Team plan: {team_plan}")
        return team_plan
    async def run_calibration_test(self, problem, role, llm, persona, test_problem):
        """Helper to run a single test and evaluation."""
        print(f"...Calibrating {role} on {llm}...")
        solution = await get_llm_response(llm, persona, test_problem)
        if "Error generating response" in solution:
            # The API call failed, so score this candidate at zero rather than judging an error message.
            return {"role": role, "llm": llm, "score": {
                "Novelty": {"score": 0},
                "Usefulness_Feasibility": {"score": 0},
                "Cultural_Appropriateness": {"score": 0}
            }}
        score = await self.evaluator.evaluate(problem, solution)
        return {"role": role, "llm": llm, "score": score}