# mcp_servers.py (reads the Gemini key from the GOOGLE_API_KEY secret)
import asyncio
import os
import httpx
import json
import google.generativeai as genai
import anthropic
import openai
from personas import PERSONAS_DATA

# --- 1. Load API Keys from Blaxel Secrets ---
# The Gemini key is stored under the Blaxel secret name GOOGLE_API_KEY (not GEMINI_API_KEY).
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
SAMBANOVA_API_KEY = os.getenv("SAMBANOVA_API_KEY")
SAMBANOVA_BASE_URL = os.getenv("SAMBANOVA_BASE_URL", "https://api.sambanova.ai/v1")

# --- 2. Configure API Clients ---
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-1.5-pro-latest')
anthropic_client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY)

sambanova_client = openai.AsyncOpenAI(
    api_key=SAMBANOVA_API_KEY,
    base_url=SAMBANOVA_BASE_URL
)

# Evaluation prompt adapted from 'LLM judges prompt v3.0.docx'.
# Literal JSON braces in the output example are doubled ({{ }}) so that str.format()
# below treats them as text rather than as replacement fields.
EVALUATION_PROMPT_TEMPLATE = """
You are an impartial and objective AI evaluator specializing in assessing business solutions.
Your task is to critically analyze a proposed solution to a given business problem.
You will evaluate the solution across five specific dimensions: Novelty, Usefulness/Feasibility, Flexibility, Elaboration, and Cultural Appropriateness/Sensitivity.

**Evaluation Criteria:**
Assign a score from 1 to 5 for each criterion (1=Very Low, 5=Very High).
You MUST provide a brief, specific justification (1-3 sentences) for each score.

**Definitions:**
1.  **Novelty:** How original, unexpected, or non-obvious is the solution?
2.  **Usefulness/Feasibility:** Is the solution practical, implementable, and likely to be effective?
3.  **Flexibility:** Does the solution offer diverse approaches or adaptable ideas?
4.  **Elaboration:** Is the solution well-explained, clear, and sufficiently detailed?
5.  **Cultural Appropriateness/Sensitivity:** How well does the solution consider and align with potential cultural factors?

**Business Problem:**
{problem}

**Proposed Solution:**
{solution_text}

**Output Format:**
You MUST return *only* a valid JSON object in the following format:
{{
  "Novelty": {{"score": <score_int>, "justification": "<justification_str>"}},
  "Usefulness_Feasibility": {{"score": <score_int>, "justification": "<justification_str>"}},
  "Flexibility": {{"score": <score_int>, "justification": "<justification_str>"}},
  "Elaboration": {{"score": <score_int>, "justification": "<justification_str>"}},
  "Cultural_Appropriateness": {{"score": <score_int>, "justification": "<justification_str>"}}
}}
"""

class BusinessSolutionEvaluator:
    """Implements the "LLM-as-a-Judge" with a live call to Gemini."""
    
    async def evaluate(self, problem: str, solution_text: str) -> dict:
        print(f"Evaluating solution (live): {solution_text[:50]}...")
        
        prompt = EVALUATION_PROMPT_TEMPLATE.format(problem=problem, solution_text=solution_text)
        
        try:
            response = await gemini_model.generate_content_async(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    response_mime_type="application/json"
                )
            )
            
            json_text = response.text.strip().replace("```json", "").replace("```", "")
            v_fitness = json.loads(json_text)
            
            print(f"Evaluation complete (live): {v_fitness}")
            return v_fitness
        except Exception as e:
            print(f"ERROR: BusinessSolutionEvaluator failed: {e}")
            return {
                "Novelty": {"score": 1, "justification": "Error during evaluation."},
                "Usefulness_Feasibility": {"score": 1, "justification": "Error during evaluation."},
                "Flexibility": {"score": 1, "justification": "Error during evaluation."},
                "Elaboration": {"score": 1, "justification": "Error during evaluation."},
                "Cultural_Appropriateness": {"score": 1, "justification": "Error during evaluation."}
            }
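
# Illustrative usage sketch (assumption: awaited from an async context with GOOGLE_API_KEY
# configured; the problem and solution strings below are hypothetical):
#
#   evaluator = BusinessSolutionEvaluator()
#   scores = await evaluator.evaluate(
#       problem="A regional coffee chain wants to grow revenue without opening new stores.",
#       solution_text="Launch a subscription program built around local roaster collaborations...",
#   )
#   print(scores["Novelty"]["score"], scores["Novelty"]["justification"])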

# --- 3. Unified API Call Function ---
async def get_llm_response(client_name: str, system_prompt: str, user_prompt: str) -> str:
    """A single function to handle calling any of the three sponsor LLMs."""
    try:
        if client_name == "Gemini":
            chat = gemini_model.start_chat(history=[
                {'role': 'user', 'parts': [system_prompt]},
                {'role': 'model', 'parts': ["Understood. I will act as this persona."]}
            ])
            response = await chat.send_message_async(user_prompt)
            return response.text

        elif client_name == "Anthropic":
            response = await anthropic_client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=2048,
                system=system_prompt,
                messages=[{"role": "user", "content": user_prompt}]
            )
            return response.content[0].text
        
        elif client_name == "SambaNova":
            completion = await sambanova_client.chat.completions.create(
                model="Meta-Llama-3.1-8B-Instruct",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
            )
            return completion.choices[0].message.content

        else:
            # Guard against an unrecognized client_name so callers always get a string back.
            return f"Error generating response from {client_name} (unknown client name)."

    except Exception as e:
        print(f"ERROR: API call to {client_name} failed: {e}")
        return f"Error generating response from {client_name}."


class AgentCalibrator:
    """Tests the sponsor LLMs with live API calls."""
    
    def __init__(self, evaluator: BusinessSolutionEvaluator):
        self.evaluator = evaluator
        self.sponsor_llms = ["Gemini", "Anthropic", "SambaNova"]
        
    async def calibrate_team(self, problem: str) -> dict:
        print("Running LIVE calibration test for specialist team...")
        
        roles_to_test = {
            "Plant": PERSONAS_DATA["Culture_5"]["description"],
            "Implementer": PERSONAS_DATA["Culture_Expert"]["description"],
            "Monitor": PERSONAS_DATA["Culture_11"]["description"]
        }
        
        test_problem = f"For the business problem '{problem}', generate a single, brief, one-paragraph concept-level solution."
        
        tasks = []
        for role, persona in roles_to_test.items():
            for llm in self.sponsor_llms:
                tasks.append(self.run_calibration_test(problem, role, llm, persona, test_problem))
        
        results = await asyncio.gather(*tasks)
        
        best_llms = {}
        role_metrics = {
            "Plant": "Novelty",
            "Implementer": "Usefulness_Feasibility",
            "Monitor": "Cultural_Appropriateness"
        }
        
        for role in roles_to_test.keys():
            best_score = -1
            best_llm = "None"
            for res in results:
                if res["role"] == role:
                    metric = role_metrics[role]
                    score = res.get("score", {}).get(metric, {}).get("score", 0)
                    if score > best_score:
                        best_score = score
                        best_llm = res["llm"]
            best_llms[role] = best_llm
            
        team_plan = {
            "Plant": {"persona": "Culture_5", "llm": best_llms["Plant"]},
            "Implementer": {"persona": "Culture_Expert", "llm": best_llms["Implementer"]},
            "Monitor": {"persona": "Culture_11", "llm": best_llms["Monitor"]}
        }
        
        print(f"Calibration complete (live). Team plan: {team_plan}")
        return team_plan

    async def run_calibration_test(self, problem, role, llm, persona, test_problem):
        """Helper to run a single test and evaluation."""
        print(f"...Calibrating {role} on {llm}...")
        solution = await get_llm_response(llm, persona, test_problem)
        if "Error generating response" in solution:
            return {"role": role, "llM": llm, "score": {
                "Novelty": {"score": 0}, 
                "Usefulness_Feasibility": {"score": 0}, 
                "Cultural_Appropriateness": {"score": 0}
            }}
        
        score = await self.evaluator.evaluate(problem, solution)
        return {"role": role, "llm": llm, "score": score}