Files changed (3) hide show
  1. README.md +5 -10
  2. chat_template.jinja +47 -144
  3. tokenizer_config.json +1 -1
README.md CHANGED
@@ -3,8 +3,6 @@ library_name: transformers
3
  license: apache-2.0
4
  license_link: https://ai.google.dev/gemma/docs/gemma_4_license
5
  pipeline_tag: any-to-any
6
- base_model:
7
- - google/gemma-4-E4B
8
  ---
9
 
10
  <div align="center">
@@ -201,13 +199,13 @@ Once the model is loaded, you can start generating output by directly referencin
201
 
202
 
203
  ```python
204
- # Prompt - add audio after text
205
  messages = [
206
  {
207
  "role": "user",
208
  "content": [
 
209
  {"type": "text", "text": "Transcribe the following speech segment in its original language. Follow these specific instructions for formatting the answer:\n* Only output the transcription, with no newlines.\n* When transcribing numbers, write the digits, i.e. write 1.7 and not one point seven, and write 3 instead of three."},
210
- {"type": "audio", "audio": "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/apps/sample-data/journal1.wav"},
211
  ]
212
  }
213
  ]
@@ -264,7 +262,7 @@ Once the model is loaded, you can start generating output by directly referencin
264
  messages = [
265
  {
266
  "role": "user", "content": [
267
- {"type": "image", "url": "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/apps/sample-data/GoldenGate.png"},
268
  {"type": "text", "text": "What is shown in this image?"}
269
  ]
270
  }
@@ -382,10 +380,7 @@ Compared to Gemma 3, the models use standard `system`, `assistant`, and `user` r
382
 
383
  ### 4. Modality order
384
 
385
- For optimal performance with multimodal inputs, place:
386
-
387
- * Image content **before** the text in your prompt.
388
- * Audio content **after** the text in your prompt.
389
 
390
  ### 5. Variable Image Resolution
391
 
@@ -517,4 +512,4 @@ The development of vision-language models (VLMs) raises several ethical concerns
517
 
518
  ### **Benefits**
519
 
520
- At the time of release, this family of models provides high-performance open vision-language model implementations designed from the ground up for responsible AI development compared to similarly sized models.
 
3
  license: apache-2.0
4
  license_link: https://ai.google.dev/gemma/docs/gemma_4_license
5
  pipeline_tag: any-to-any
 
 
6
  ---
7
 
8
  <div align="center">
 
199
 
200
 
201
  ```python
202
+ # Prompt - add audio before text
203
  messages = [
204
  {
205
  "role": "user",
206
  "content": [
207
+ {"type": "audio", "audio": "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/Demos/sample-data/journal1.wav"},
208
  {"type": "text", "text": "Transcribe the following speech segment in its original language. Follow these specific instructions for formatting the answer:\n* Only output the transcription, with no newlines.\n* When transcribing numbers, write the digits, i.e. write 1.7 and not one point seven, and write 3 instead of three."},
 
209
  ]
210
  }
211
  ]
 
262
  messages = [
263
  {
264
  "role": "user", "content": [
265
+ {"type": "image", "url": "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/Demos/sample-data/GoldenGate.png"},
266
  {"type": "text", "text": "What is shown in this image?"}
267
  ]
268
  }
 
380
 
381
  ### 4. Modality order
382
 
383
+ * For optimal performance with multimodal inputs, place image and/or audio content **before** the text in your prompt.
 
 
 
384
 
385
  ### 5. Variable Image Resolution
386
 
 
512
 
513
  ### **Benefits**
514
 
515
+ At the time of release, this family of models provides open vision-language model implementations that are high-performance compared to similarly sized models and are designed from the ground up for responsible AI development.
chat_template.jinja CHANGED
@@ -1,9 +1,9 @@
1
- {%- macro format_parameters(properties, required, filter_keys=false) -%}
2
  {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
  {%- set ns = namespace(found_first=false) -%}
4
  {%- for key, value in properties | dictsort -%}
5
  {%- set add_comma = false -%}
6
- {%- if not filter_keys or key not in standard_keys -%}
7
  {%- if ns.found_first %},{% endif -%}
8
  {%- set ns.found_first = true -%}
9
  {{ key }}:{
@@ -11,15 +11,34 @@
11
  description:<|"|>{{ value['description'] }}<|"|>
12
  {%- set add_comma = true -%}
13
  {%- endif -%}
 
 
 
 
14
  {%- if value['type'] | upper == 'STRING' -%}
15
  {%- if value['enum'] -%}
16
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
17
  enum:{{ format_argument(value['enum']) }}
18
  {%- endif -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  {%- elif value['type'] | upper == 'ARRAY' -%}
20
  {%- if value['items'] is mapping and value['items'] -%}
21
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
22
- items:{
23
  {%- set ns_items = namespace(found_first=false) -%}
24
  {%- for item_key, item_value in value['items'] | dictsort -%}
25
  {%- if item_value is not none -%}
@@ -52,32 +71,6 @@
52
  }
53
  {%- endif -%}
54
  {%- endif -%}
55
- {%- if value['nullable'] %}
56
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
57
- nullable:true
58
- {%- endif -%}
59
- {%- if value['type'] | upper == 'OBJECT' -%}
60
- {%- if value['properties'] is defined and value['properties'] is mapping -%}
61
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
62
- properties:{
63
- {{- format_parameters(value['properties'], value['required'] | default([])) -}}
64
- }
65
- {%- elif value is mapping -%}
66
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
- properties:{
68
- {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
69
- }
70
- {%- endif -%}
71
- {%- if value['required'] -%}
72
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
73
- required:[
74
- {%- for item in value['required'] | default([]) -%}
75
- <|"|>{{- item -}}<|"|>
76
- {%- if not loop.last %},{% endif -%}
77
- {%- endfor -%}
78
- ]
79
- {%- endif -%}
80
- {%- endif -%}
81
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
82
  type:<|"|>{{ value['type'] | upper }}<|"|>}
83
  {%- endif -%}
@@ -157,42 +150,24 @@
157
  {{- ns.result | trim -}}
158
  {%- endmacro -%}
159
 
160
- {%- macro format_tool_response_block(tool_name, response) -%}
161
- {{- '<|tool_response>' -}}
162
- {%- if response is mapping -%}
163
- {{- 'response:' + tool_name + '{' -}}
164
- {%- for key, value in response | dictsort -%}
165
- {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
166
- {%- if not loop.last %},{% endif -%}
167
- {%- endfor -%}
168
- {{- '}' -}}
169
- {%- else -%}
170
- {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
171
- {%- endif -%}
172
- {{- '<tool_response|>' -}}
173
- {%- endmacro -%}
174
-
175
  {%- set ns = namespace(prev_message_type=None) -%}
176
  {%- set loop_messages = messages -%}
177
- {{- bos_token -}}
178
  {#- Handle System/Tool Definitions Block -#}
179
  {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
  {{- '<|turn>system\n' -}}
 
181
  {#- Inject Thinking token at the very top of the FIRST system turn -#}
182
  {%- if enable_thinking is defined and enable_thinking -%}
183
- {{- '<|think|>\n' -}}
184
  {%- set ns.prev_message_type = 'think' -%}
185
  {%- endif -%}
 
186
  {%- if messages[0]['role'] in ['system', 'developer'] -%}
187
- {%- if messages[0]['content'] is string -%}
188
- {{- messages[0]['content'] | trim -}}
189
- {%- elif messages[0]['content'] is sequence -%}
190
- {%- for item in messages[0]['content'] -%}
191
- {{- item['text'] | trim + ' '-}}
192
- {%- endfor -%}
193
- {%- endif -%}
194
  {%- set loop_messages = messages[1:] -%}
195
  {%- endif -%}
 
196
  {%- if tools -%}
197
  {%- for tool in tools %}
198
  {{- '<|tool>' -}}
@@ -201,44 +176,15 @@
201
  {%- endfor %}
202
  {%- set ns.prev_message_type = 'tool' -%}
203
  {%- endif -%}
 
204
  {{- '<turn|>\n' -}}
205
  {%- endif %}
206
 
207
- {#- Pre-scan: find last user message index for reasoning guard -#}
208
- {%- set ns_turn = namespace(last_user_idx=-1) -%}
209
- {%- for i in range(loop_messages | length) -%}
210
- {%- if loop_messages[i]['role'] == 'user' -%}
211
- {%- set ns_turn.last_user_idx = i -%}
212
- {%- endif -%}
213
- {%- endfor -%}
214
-
215
  {#- Loop through messages -#}
216
  {%- for message in loop_messages -%}
217
- {%- if message['role'] != 'tool' -%}
218
  {%- set ns.prev_message_type = None -%}
219
  {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
220
- {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
221
- {%- set prev_nt = namespace(role=None, found=false) -%}
222
- {%- if loop.index0 > 0 -%}
223
- {%- for j in range(loop.index0 - 1, -1, -1) -%}
224
- {%- if not prev_nt.found -%}
225
- {%- if loop_messages[j]['role'] != 'tool' -%}
226
- {%- set prev_nt.role = loop_messages[j]['role'] -%}
227
- {%- set prev_nt.found = true -%}
228
- {%- endif -%}
229
- {%- endif -%}
230
- {%- endfor -%}
231
- {%- endif -%}
232
- {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
233
- {%- if not continue_same_model_turn -%}
234
  {{- '<|turn>' + role + '\n' }}
235
- {%- endif -%}
236
-
237
- {#- Render reasoning/reasoning_content as thinking channel -#}
238
- {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
239
- {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
240
- {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
241
- {%- endif -%}
242
 
243
  {%- if message['tool_calls'] -%}
244
  {%- for tool_call in message['tool_calls'] -%}
@@ -259,61 +205,25 @@
259
  {%- set ns.prev_message_type = 'tool_call' -%}
260
  {%- endif -%}
261
 
262
- {%- set ns_tr_out = namespace(flag=false) -%}
263
- {%- if message.get('tool_responses') -%}
264
- {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
265
  {%- for tool_response in message['tool_responses'] -%}
266
- {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
267
- {%- set ns_tr_out.flag = true -%}
268
- {%- set ns.prev_message_type = 'tool_response' -%}
269
- {%- endfor -%}
270
- {%- elif message.get('tool_calls') -%}
271
- {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
272
- {%- set ns_tool_scan = namespace(stopped=false) -%}
273
- {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
274
- {%- if ns_tool_scan.stopped -%}
275
- {%- elif loop_messages[k]['role'] != 'tool' -%}
276
- {%- set ns_tool_scan.stopped = true -%}
277
- {%- else -%}
278
- {%- set follow = loop_messages[k] -%}
279
- {#- Resolve tool_call_id to function name -#}
280
- {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
281
- {%- for tc in message['tool_calls'] -%}
282
- {%- if tc.get('id') == follow.get('tool_call_id') -%}
283
- {%- set ns_tname.name = tc['function']['name'] -%}
284
- {%- endif -%}
285
  {%- endfor -%}
286
- {#- Handle content as string or content-parts array -#}
287
- {%- set tool_body = follow.get('content') -%}
288
- {%- if tool_body is string -%}
289
- {{- format_tool_response_block(ns_tname.name, tool_body) -}}
290
- {%- elif tool_body is sequence and tool_body is not string -%}
291
- {%- set ns_txt = namespace(s='') -%}
292
- {%- for part in tool_body -%}
293
- {%- if part.get('type') == 'text' -%}
294
- {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
295
- {%- endif -%}
296
- {%- endfor -%}
297
- {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
298
- {%- for part in tool_body -%}
299
- {%- if part.get('type') == 'image' -%}
300
- {{- '<|image|>' -}}
301
- {%- elif part.get('type') == 'audio' -%}
302
- {{- '<|audio|>' -}}
303
- {%- elif part.get('type') == 'video' -%}
304
- {{- '<|video|>' -}}
305
- {%- endif -%}
306
- {%- endfor -%}
307
- {%- else -%}
308
- {{- format_tool_response_block(ns_tname.name, tool_body) -}}
309
- {%- endif -%}
310
- {%- set ns_tr_out.flag = true -%}
311
- {%- set ns.prev_message_type = 'tool_response' -%}
312
  {%- endif -%}
 
313
  {%- endfor -%}
 
314
  {%- endif -%}
315
 
316
- {%- set captured_content -%}
317
  {%- if message['content'] is string -%}
318
  {%- if role == 'model' -%}
319
  {{- strip_thinking(message['content']) -}}
@@ -329,32 +239,25 @@
329
  {{- item['text'] | trim -}}
330
  {%- endif -%}
331
  {%- elif item['type'] == 'image' -%}
332
- {{- '<|image|>' -}}
333
  {%- set ns.prev_message_type = 'image' -%}
334
  {%- elif item['type'] == 'audio' -%}
335
  {{- '<|audio|>' -}}
336
  {%- set ns.prev_message_type = 'audio' -%}
337
  {%- elif item['type'] == 'video' -%}
338
- {{- '<|video|>' -}}
339
  {%- set ns.prev_message_type = 'video' -%}
340
  {%- endif -%}
341
  {%- endfor -%}
342
  {%- endif -%}
343
- {%- endset -%}
344
 
345
- {{- captured_content -}}
346
- {%- set has_content = captured_content | trim | length > 0 -%}
347
-
348
- {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
349
- {{- '<|tool_response>' -}}
350
- {%- elif not (ns_tr_out.flag and not has_content) -%}
351
  {{- '<turn|>\n' -}}
352
  {%- endif -%}
353
- {%- endif -%}
354
  {%- endfor -%}
355
 
356
  {%- if add_generation_prompt -%}
357
- {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
358
  {{- '<|turn>model\n' -}}
359
  {%- endif -%}
360
  {%- endif -%}
 
1
+ {%- macro format_parameters(properties, required) -%}
2
  {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
  {%- set ns = namespace(found_first=false) -%}
4
  {%- for key, value in properties | dictsort -%}
5
  {%- set add_comma = false -%}
6
+ {%- if key not in standard_keys -%}
7
  {%- if ns.found_first %},{% endif -%}
8
  {%- set ns.found_first = true -%}
9
  {{ key }}:{
 
11
  description:<|"|>{{ value['description'] }}<|"|>
12
  {%- set add_comma = true -%}
13
  {%- endif -%}
14
+ {%- if value['nullable'] %}
15
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
16
+ nullable:true
17
+ {%- endif -%}
18
  {%- if value['type'] | upper == 'STRING' -%}
19
  {%- if value['enum'] -%}
20
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
21
  enum:{{ format_argument(value['enum']) }}
22
  {%- endif -%}
23
+ {%- elif value['type'] | upper == 'OBJECT' -%}
24
+ ,properties:{
25
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
26
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
27
+ {%- elif value is mapping -%}
28
+ {{- format_parameters(value, value['required'] | default([])) -}}
29
+ {%- endif -%}
30
+ }
31
+ {%- if value['required'] -%}
32
+ ,required:[
33
+ {%- for item in value['required'] | default([]) -%}
34
+ <|"|>{{- item -}}<|"|>
35
+ {%- if not loop.last %},{% endif -%}
36
+ {%- endfor -%}
37
+ ]
38
+ {%- endif -%}
39
  {%- elif value['type'] | upper == 'ARRAY' -%}
40
  {%- if value['items'] is mapping and value['items'] -%}
41
+ ,items:{
 
42
  {%- set ns_items = namespace(found_first=false) -%}
43
  {%- for item_key, item_value in value['items'] | dictsort -%}
44
  {%- if item_value is not none -%}
 
71
  }
72
  {%- endif -%}
73
  {%- endif -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
75
  type:<|"|>{{ value['type'] | upper }}<|"|>}
76
  {%- endif -%}
 
150
  {{- ns.result | trim -}}
151
  {%- endmacro -%}
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  {%- set ns = namespace(prev_message_type=None) -%}
154
  {%- set loop_messages = messages -%}
155
+ {{ bos_token }}
156
  {#- Handle System/Tool Definitions Block -#}
157
  {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
158
  {{- '<|turn>system\n' -}}
159
+
160
  {#- Inject Thinking token at the very top of the FIRST system turn -#}
161
  {%- if enable_thinking is defined and enable_thinking -%}
162
+ {{- '<|think|>' -}}
163
  {%- set ns.prev_message_type = 'think' -%}
164
  {%- endif -%}
165
+
166
  {%- if messages[0]['role'] in ['system', 'developer'] -%}
167
+ {{- messages[0]['content'] | trim -}}
 
 
 
 
 
 
168
  {%- set loop_messages = messages[1:] -%}
169
  {%- endif -%}
170
+
171
  {%- if tools -%}
172
  {%- for tool in tools %}
173
  {{- '<|tool>' -}}
 
176
  {%- endfor %}
177
  {%- set ns.prev_message_type = 'tool' -%}
178
  {%- endif -%}
179
+
180
  {{- '<turn|>\n' -}}
181
  {%- endif %}
182
 
 
 
 
 
 
 
 
 
183
  {#- Loop through messages -#}
184
  {%- for message in loop_messages -%}
 
185
  {%- set ns.prev_message_type = None -%}
186
  {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  {{- '<|turn>' + role + '\n' }}
 
 
 
 
 
 
 
188
 
189
  {%- if message['tool_calls'] -%}
190
  {%- for tool_call in message['tool_calls'] -%}
 
205
  {%- set ns.prev_message_type = 'tool_call' -%}
206
  {%- endif -%}
207
 
208
+ {%- if message['tool_responses'] -%}
209
+ {#- Tool Response handling -#}
 
210
  {%- for tool_response in message['tool_responses'] -%}
211
+ {{- '<|tool_response>' -}}
212
+ {%- if tool_response['response'] is mapping -%}
213
+ {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
214
+ {%- for key, value in tool_response['response'] | dictsort -%}
215
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
216
+ {%- if not loop.last %},{% endif -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  {%- endfor -%}
218
+ {{- '}' -}}
219
+ {%- else -%}
220
+ {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  {%- endif -%}
222
+ {{- '<tool_response|>' -}}
223
  {%- endfor -%}
224
+ {%- set ns.prev_message_type = 'tool_response' -%}
225
  {%- endif -%}
226
 
 
227
  {%- if message['content'] is string -%}
228
  {%- if role == 'model' -%}
229
  {{- strip_thinking(message['content']) -}}
 
239
  {{- item['text'] | trim -}}
240
  {%- endif -%}
241
  {%- elif item['type'] == 'image' -%}
242
+ {{- '\n\n<|image|>\n\n' -}}
243
  {%- set ns.prev_message_type = 'image' -%}
244
  {%- elif item['type'] == 'audio' -%}
245
  {{- '<|audio|>' -}}
246
  {%- set ns.prev_message_type = 'audio' -%}
247
  {%- elif item['type'] == 'video' -%}
248
+ {{- '\n\n<|video|>\n\n' -}}
249
  {%- set ns.prev_message_type = 'video' -%}
250
  {%- endif -%}
251
  {%- endfor -%}
252
  {%- endif -%}
 
253
 
254
+ {%- if not (message['tool_responses'] and not message['content']) -%}
 
 
 
 
 
255
  {{- '<turn|>\n' -}}
256
  {%- endif -%}
 
257
  {%- endfor -%}
258
 
259
  {%- if add_generation_prompt -%}
260
+ {%- if ns.prev_message_type != 'tool_response' -%}
261
  {{- '<|turn>model\n' -}}
262
  {%- endif -%}
263
  {%- endif -%}
tokenizer_config.json CHANGED
@@ -61,7 +61,7 @@
61
  }
62
  }
63
  },
64
- "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
65
  },
66
  "soc_token": "<|channel>",
67
  "sot_token": "<|turn>",
 
61
  }
62
  }
63
  },
64
+ "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<content>(?:(?!\\<\\|tool_call\\>)(?!\\<turn\\|\\>).)+)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?:\\<turn\\|\\>)?"
65
  },
66
  "soc_token": "<|channel>",
67
  "sot_token": "<|turn>",