"""Mamma AI: a Gradio chat app that forwards text and audio messages to OpenRouter."""

from __future__ import annotations

import base64
import json
import logging
import os
from pathlib import Path
from typing import Any

import dotenv
import gradio as gr
import requests
from gradio.components.chatbot import (
    FileDataDict,
    FileMessageDict,
    NormalizedMessageContent,
    NormalizedMessageDict,
    TextMessageDict,
)
from gradio.components.multimodal_textbox import MultimodalValue

logger = logging.getLogger()

API_URL = "https://openrouter.ai/api/v1/chat/completions"
MODEL = "google/gemini-2.5-flash-lite-preview-09-2025"
SYSTEM_PROMPT = Path("system-prompt.md").read_text().strip()
AUDIO_FORMATS = {"wav", "mp3", "m4a", "flac"}


def chat_fn(user_msg: MultimodalValue, history: list[NormalizedMessageDict], api_key: str | None) -> str:
    logger.info(f"History (newest first):\n{json.dumps(history[::-1], indent=2)}")
    logger.info(f"User message:\n{json.dumps(user_msg, indent=2)}")

    # Determine API key: the secret password unlocks the server-side OpenRouter key
    if api_key is None or len(api_key) == 0:
        return "Boh!"
    if api_key == os.environ["PASSWORD"]:
        api_key = os.environ["OPENROUTER_API_KEY"]

    # Build message history including system prompt, conversation history, and current user message
    user_content: list[NormalizedMessageContent] = []
    if "text" in user_msg and len(text := user_msg["text"].strip()) > 0:
        user_content.append(TextMessageDict(type="text", text=text))
    for path in user_msg.get("files", []):
        user_content.append(FileMessageDict(type="file", file=FileDataDict(path=path)))
    user_msg_dict = NormalizedMessageDict(role="user", content=user_content)
    history = [
        NormalizedMessageDict(role="system", content=[TextMessageDict(type="text", text=SYSTEM_PROMPT)]),
        *history,
        user_msg_dict,
    ]

    # Call the model API
    payload = {
        "model": MODEL,
        "messages": history_to_messages(history),
        "max_tokens": 4096,
        "temperature": 0.2,
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
    logger.info(f"Response:\n{json.dumps(response.json(), indent=2)}")
    response.raise_for_status()
    msg = response.json()["choices"][0]["message"]["content"].strip()
    return msg


def history_to_messages(history: list[NormalizedMessageDict]) -> list[dict[str, Any]]:
    """
    Transform content entries to OpenRouter format.

    From:
        {
            "type": "file",
            "file": {"path": "file.wav"}
        }
    To:
        {
            "type": "input_audio",
            "input_audio": {
                "data": "",
                "format": "wav",
            }
        }
    """

    def transform_content(content: NormalizedMessageContent) -> dict[str, Any]:
        if content["type"] == "file":
            path = Path(content["file"]["path"])
            suffix = path.suffix.lstrip(".").lower()
            if suffix not in AUDIO_FORMATS:
                raise ValueError(f"Unsupported file format: {suffix}")
            return {
                "type": "input_audio",
                "input_audio": {
                    "data": file_to_base64(path),
                    "format": suffix,
                },
            }
        return content  # pyright: ignore[reportReturnType]

    return [
        {
            "role": item["role"],
            "content": [transform_content(c) for c in item["content"]],
        }
        for item in history
    ]
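
# Illustrative sketch (comment only, not executed): for a user turn containing one
# recorded clip, history_to_messages yields roughly
#   {"role": "user", "content": [{"type": "input_audio",
#                                 "input_audio": {"data": "<base64>", "format": "wav"}}]}
# i.e. the file entry is inlined as base64 audio in OpenRouter's chat-completions
# format, while plain text entries pass through unchanged.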

def file_to_base64(path: str | Path) -> str:
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


# Set up logging
logging.basicConfig(level="INFO", format="%(asctime)s %(levelname)s: %(message)s")
logging.captureWarnings(True)

# Load environment variables from .env file
dotenv.load_dotenv()

# Chat (top-level demo variable to allow live reloading)
demo = gr.ChatInterface(
    chat_fn,
    multimodal=True,
    chatbot=gr.Chatbot(placeholder="Ready!"),
    textbox=gr.MultimodalTextbox(
        placeholder="Your message",
        file_count="single",
        file_types=["audio"],
        sources=["microphone"],
    ),
    additional_inputs=[
        gr.Textbox(type="password", label="Openrouter API Key"),
    ],
    additional_inputs_accordion=gr.Accordion("Options", open=True),
    title="Mamma AI",
    description="Parla con la mamma più saggia del mondo! Puoi inviare messaggi di testo o audio.\n\nPrima di usare la chat, inserisci un'API key di [Openrouter](https://openrouter.ai/) (oppure la password segreta).",
    autofocus=True,
    examples=[
        ["È meglio lavare i piatti a mano o in lavastoviglie?", None],
        ["Aiuto! Ho sporcato la camicia di vino!", None],
    ],
)

demo.launch(server_name="0.0.0.0")
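
# Usage note (assumptions about the surrounding repo, not guaranteed by this file):
# the .env file is expected to define OPENROUTER_API_KEY and PASSWORD, and
# system-prompt.md must sit in the working directory. Keeping `demo` at module level
# allows running with Gradio's reload mode (`gradio app.py`, assuming this file is
# named app.py) as an alternative to `python app.py`.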