Add files using upload-large-folder tool
- crawl_state.json +293 -0
- text_content/docs_batch-mode_214118ee.txt +5 -0
- text_content/docs_caching_85703b6c.txt +5 -0
- text_content/docs_code-execution_857b45c1.txt +5 -0
- text_content/docs_code-execution_d708f71c.txt +5 -0
- text_content/docs_downloads_7592cb44.txt +5 -0
- text_content/docs_embeddings_1c200669.txt +5 -0
- text_content/docs_ephemeral-tokens_14d8195d.txt +5 -0
- text_content/docs_function-calling_065bfaa3.txt +5 -0
- text_content/docs_function-calling_8437d0a9.txt +5 -0
- text_content/docs_image-understanding_09b055f7.txt +5 -0
- text_content/docs_libraries_9ab83d4c.txt +5 -0
- text_content/docs_live-guide_0a3a44d0.txt +5 -0
- text_content/docs_live-guide_a0b9dda4.txt +5 -0
- text_content/docs_live-session_02e3a506.txt +5 -0
- text_content/docs_live-session_ebf143cf.txt +5 -0
- text_content/docs_live_1b6bdc03.txt +5 -0
- text_content/docs_migrate-to-cloud_8062fb32.txt +5 -0
- text_content/docs_migrate_e2b2e25c.txt +5 -0
- text_content/docs_models_1b6f8b5b.txt +5 -0
- text_content/docs_models_36def28a.txt +5 -0
- text_content/docs_models_988b7c3a.txt +5 -0
- text_content/docs_openai_30a57b37.txt +5 -0
- text_content/docs_openai_49a96628.txt +5 -0
- text_content/docs_prompting_with_media_e3aaecf9.txt +5 -0
- text_content/docs_sdks_de55873c.txt +5 -0
- text_content/docs_system-instructions_f32b59f7.txt +5 -0
- text_content/docs_text-generation_61cae815.txt +5 -0
- text_content/docs_text-generation_840d1f94.txt +5 -0
- text_content/docs_thinking_46527506.txt +5 -0
- text_content/docs_thinking_693f2dc9.txt +5 -0
- text_content/docs_thinking_88bba5e2.txt +5 -0
- text_content/docs_url-context_36731e11.txt +5 -0
- text_content/docs_url-context_41e99456.txt +5 -0
- text_content/docs_usage-policies_cd8a3783.txt +5 -0
- text_content/docs_video-understanding_d31a96cd.txt +5 -0
- text_content/docs_video_618562e2.txt +5 -0
- text_content/docs_vision_1564825b.txt +5 -0
- text_content/docs_vision_3b2d50f0.txt +5 -0
- text_content/docs_vision_879c35c5.txt +5 -0
- text_content/function-calling_tutorial_491a2277.txt +5 -0
- text_content/models_experimental-models_b7a92f62.txt +5 -0
- text_content/models_experimental-models_ba05a1bd.txt +5 -0
- text_content/models_experimental-models_de2a19a1.txt +5 -0
- text_content/models_experimental-models_df2e8560.txt +5 -0
- text_content/models_gemini_571c1682.txt +5 -0
- text_content/models_gemini_65cc88ec.txt +5 -0
- text_content/models_gemini_6932841a.txt +5 -0
- text_content/models_gemini_77ab48aa.txt +5 -0
- text_content/models_gemini_ff799621.txt +5 -0
crawl_state.json
ADDED
@@ -0,0 +1,293 @@
{
  "visited_urls": [
    "https://ai.google.dev/gemini-api/docs/api-key#main-content",
    "https://ai.google.dev/gemini-api/docs/available-regions",
    "https://ai.google.dev/gemini-api/docs/models#gemini-1.5-flash-8b",
    "https://ai.google.dev/gemini-api/docs/safety-guidance#main-content",
    "https://ai.google.dev/gemini-api/docs/models/gemini#veo-2",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-1.5-pro",
    "https://ai.google.dev/gemini-api/docs/models/gemini#main-content",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-pro",
    "https://ai.google.dev/gemini-api/docs/models/gemini#available-languages",
    "https://ai.google.dev/gemini-api/docs/files#prompt-guide",
    "https://ai.google.dev/gemini-api/docs/live-tools#main-content",
    "https://ai.google.dev/gemini-api/docs/models#gemini-embedding",
    "https://ai.google.dev/gemini-api/docs/vision#capabilities",
    "https://ai.google.dev/gemini-api/docs/embeddings#use-cases",
    "https://ai.google.dev/gemini-api/docs/migrate-to-cloud#main-content",
    "https://ai.google.dev/gemini-api/docs/libraries#install",
    "https://ai.google.dev/gemini-api/docs/audio-generation#voices",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-flash-preview-tts",
    "https://ai.google.dev/gemini-api/docs/live#main-content",
    "https://ai.google.dev/gemini-api/docs/system-instructions",
    "https://ai.google.dev/gemini-api/docs/audio#main-content",
    "https://ai.google.dev/gemini-api/docs/long-context",
    "https://ai.google.dev/gemini-api/docs/function-calling?example=weather#step-4",
    "https://ai.google.dev/gemini-api/docs/ephemeral-tokens",
    "https://ai.google.dev/gemini-api/docs/speech-generation#languages",
    "https://ai.google.dev/gemini-api/docs/semantic_retrieval",
    "https://ai.google.dev/gemini-api/docs/models#gemini-1.5-pro",
    "https://ai.google.dev/gemini-api/docs/embeddings",
    "https://ai.google.dev/gemini-api/docs/rate-limits#usage-tiers",
    "https://ai.google.dev/gemini-api/docs/live-guide#native-audio-output-thinking",
    "https://ai.google.dev/gemini-api/docs/usage-policies#main-content",
    "https://ai.google.dev/gemini-api/docs/migrate#client",
    "https://ai.google.dev/gemini-api/docs/pricing",
    "https://ai.google.dev/gemini-api/docs/safety-guidance",
    "https://ai.google.dev/gemini-api/docs/model-tuning",
    "https://ai.google.dev/gemini-api/docs/live#audio-to-audio",
    "https://ai.google.dev/gemini-api/docs/prompting-intro#prefixes",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-pro-preview-tts",
    "https://ai.google.dev/gemini-api/docs/function-calling/tutorial",
    "https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash-preview-image-generation",
    "https://ai.google.dev/gemini-api/docs/google-search#main-content",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-flash",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#veo-2",
    "https://ai.google.dev/gemini-api/docs/models/gemini#text-embedding",
    "https://ai.google.dev/gemini-api/docs/live-guide#affective-dialog",
    "https://ai.google.dev/gemini-api/docs/models/gemini#model-versions",
    "https://ai.google.dev/gemini-api/docs/function-calling#step-4",
    "https://ai.google.dev/gemini-api/docs/text-generation#text-input",
    "https://ai.google.dev/gemini-api/docs#main-content",
    "https://ai.google.dev/gemini-api/docs/models#rate-limits",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.0-flash",
    "https://ai.google.dev/gemini-api/docs/billing",
    "https://ai.google.dev/gemini-api/docs/pricing#main-content",
    "https://ai.google.dev/gemini-api/docs/models/gemini#imagen-3",
    "https://ai.google.dev/gemini-api/docs/thinking#summaries",
    "https://ai.google.dev/gemini-api/docs/models#gemini-1.5-flash",
    "https://ai.google.dev/gemini-api/docs/quickstart#main-content",
    "https://ai.google.dev/gemini-api/docs/prompting_with_media#specific-instructions",
    "https://ai.google.dev/gemini-api/docs/models/gemini",
    "https://ai.google.dev/gemini-api/docs/files#troubleshooting",
    "https://ai.google.dev/gemini-api/docs/batch-mode#main-content",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#imagen-3",
    "https://ai.google.dev/gemini-api/docs/embeddings#main-content",
    "https://ai.google.dev/gemini-api/docs/thinking",
    "https://ai.google.dev/gemini-api/docs/prompting-strategies",
    "https://ai.google.dev/gemini-api/docs/prompting-strategies#under-the-hood",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models",
    "https://ai.google.dev/gemini-api/docs/migrate-to-cloud",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-pro",
    "https://ai.google.dev/gemini-api/docs/openai#extra-body",
    "https://ai.google.dev/gemini-api/docs/usage-policies#abuse-monitoring",
    "https://ai.google.dev/gemini-api/docs/thinking#signatures",
    "https://ai.google.dev/gemini-api/docs/vision#segmentation",
    "https://ai.google.dev/gemini-api/docs/files",
    "https://ai.google.dev/gemini-api/docs/billing#enable-cloud-billing",
    "https://ai.google.dev/gemini-api/docs/models/gemini#token-size",
    "https://ai.google.dev/gemini-api/docs/structured-output#schemas-in-python",
    "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-pro",
    "https://ai.google.dev/gemini-api/docs/image-understanding",
    "https://ai.google.dev/gemini-api/docs/live-guide",
    "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-pro-preview-tts",
    "https://ai.google.dev/gemini-api/docs/semantic_retrieval#main-content",
    "https://ai.google.dev/gemini-api/docs/downloads#main-content",
    "https://ai.google.dev/gemini-api/docs/models#token-size",
    "https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#main-content",
    "https://ai.google.dev/gemini-api/docs/function-calling?example=weather#rest",
    "https://ai.google.dev/gemini-api/docs/batch-mode#retrieve-batch-results",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.0-flash-lite",
    "https://ai.google.dev/gemini-api/docs/prompting-intro",
    "https://ai.google.dev/gemini-api/docs/video#prompt-guide",
    "https://ai.google.dev/gemini-api/docs/live-guide#main-content",
    "https://ai.google.dev/gemini-api/docs/text-generation#system-instructions",
    "https://ai.google.dev/gemini-api/docs/live",
    "https://ai.google.dev/gemini-api/docs/prompting-strategies#prefixes",
    "https://ai.google.dev/gemini-api/docs/function-calling#function_calling_modes",
    "https://ai.google.dev/gemini-api/docs/api-key#set-api-env-var",
    "https://ai.google.dev/gemini-api/docs/function-calling#parallel_function_calling",
    "https://ai.google.dev/gemini-api/docs/models/gemini#live-api",
    "https://ai.google.dev/gemini-api/docs/video#main-content",
    "https://ai.google.dev/gemini-api/docs/function-calling#main-content",
    "https://ai.google.dev/gemini-api/docs/code-execution",
    "https://ai.google.dev/gemini-api/docs/batch-mode",
    "https://ai.google.dev/gemini-api/docs/video-understanding",
    "https://ai.google.dev/gemini-api/docs/caching#main-content",
    "https://ai.google.dev/gemini-api/docs/speech-generation#controllable",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash-preview-image-generation",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#rate-limits",
    "https://ai.google.dev/gemini-api/docs/system-instructions#system-instructions",
    "https://ai.google.dev/gemini-api/docs/system-instructions#main-content",
    "https://ai.google.dev/gemini-api/docs/ephemeral-tokens#main-content",
    "https://ai.google.dev/gemini-api/docs/safety-settings",
    "https://ai.google.dev/gemini-api/docs/vision#object-detection",
    "https://ai.google.dev/gemini-api/docs/live#response-modalities",
    "https://ai.google.dev/gemini-api/docs/image-understanding#image-input",
    "https://ai.google.dev/gemini-api/docs/document-processing",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash",
    "https://ai.google.dev/gemini-api/docs/function-calling/tutorial#main-content",
    "https://ai.google.dev/gemini-api/docs/grounding#main-content",
    "https://ai.google.dev/gemini-api/docs/models#preview",
    "https://ai.google.dev/gemini-api/docs/code-execution#main-content",
    "https://ai.google.dev/gemini-api/docs/libraries#main-content",
    "https://ai.google.dev/gemini-api/docs/function-calling?example=weather#main-content",
    "https://ai.google.dev/gemini-api/docs/function-calling#thinking",
    "https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#prompting-with-videos",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash-lite",
    "https://ai.google.dev/gemini-api/docs/safety-settings#main-content",
    "https://ai.google.dev/gemini-api/docs/models/gemini#embedding",
    "https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash",
    "https://ai.google.dev/gemini-api/docs/live-session#context-window-compression",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-1.5-flash-8b",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.0-flash-preview-image-generation",
    "https://ai.google.dev/gemini-api/docs/image-understanding#main-content",
    "https://ai.google.dev/gemini-api/docs/image-understanding#upload-image",
    "https://ai.google.dev/gemini-api/docs/live-session",
    "https://ai.google.dev/gemini-api/docs/tokens#main-content",
    "https://ai.google.dev/gemini-api/docs/video#generate-from-text",
    "https://ai.google.dev/gemini-api/docs",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#imagen-4",
    "https://ai.google.dev/gemini-api/docs/models#veo-2",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash-lite",
    "https://ai.google.dev/gemini-api/docs/function-calling#compositional_function_calling",
    "https://ai.google.dev/gemini-api/docs/video#basics",
    "https://ai.google.dev/gemini-api/docs/speech-generation#main-content",
    "https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#troubleshooting",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#available-models",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-1.5-flash",
    "https://ai.google.dev/gemini-api/docs/migrate",
    "https://ai.google.dev/gemini-api/docs/model-tuning#main-content",
    "https://ai.google.dev/gemini-api/docs/structured-output",
    "https://ai.google.dev/gemini-api/docs/models",
    "https://ai.google.dev/gemini-api/docs/batch-mode#inline-requests",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#token-size",
    "https://ai.google.dev/gemini-api/docs/live-tools",
    "https://ai.google.dev/gemini-api/docs/safety-settings#safety-filtering-per-request",
    "https://ai.google.dev/gemini-api/docs/speech-generation#limitations",
    "https://ai.google.dev/gemini-api/docs/text-generation?lang=python#main-content",
    "https://ai.google.dev/gemini-api/docs/audio",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-pro",
    "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-lite",
    "https://ai.google.dev/gemini-api/docs/audio#inline-audio",
    "https://ai.google.dev/gemini-api/docs/live-session#session-resumption",
    "https://ai.google.dev/gemini-api/docs/video#limitations",
    "https://ai.google.dev/gemini-api/docs/structured-output#generating-enums",
    "https://ai.google.dev/gemini-api/docs/grounding",
    "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview-tts",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-embedding",
    "https://ai.google.dev/gemini-api/docs/prompting-intro#main-content",
    "https://ai.google.dev/gemini-api/docs/sdks#main-content",
    "https://ai.google.dev/gemini-api/docs/migrate#main-content",
    "https://ai.google.dev/gemini-api/docs/speech-generation#voices",
    "https://ai.google.dev/gemini-api/docs/models#imagen-4",
    "https://ai.google.dev/gemini-api/docs/live-guide#supported-languages",
    "https://ai.google.dev/gemini-api/docs/function-calling#automatic_function_calling_python_only",
    "https://ai.google.dev/gemini-api/docs/prompting_with_media#main-content",
    "https://ai.google.dev/gemini-api/docs/thinking#set-budget",
    "https://ai.google.dev/gemini-api/docs/text-generation?lang=python#generate-a-text-stream",
    "https://ai.google.dev/gemini-api/docs/models/gemini#live-api-2.0",
    "https://ai.google.dev/gemini-api/docs/prompting-intro#completion",
    "https://ai.google.dev/gemini-api/docs/tokens#media-token",
    "https://ai.google.dev/gemini-api/docs/thinking#tasks",
    "https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash-lite",
    "https://ai.google.dev/gemini-api/docs/models#main-content",
    "https://ai.google.dev/gemini-api/docs/live#implementation-approach",
    "https://ai.google.dev/gemini-api/docs/function-calling",
    "https://ai.google.dev/gemini-api/docs/prompting_with_media#troubleshooting",
    "https://ai.google.dev/gemini-api/docs/libraries#previous-sdks",
    "https://ai.google.dev/gemini-api/docs/live-guide#proactive-audio",
    "https://ai.google.dev/gemini-api/docs/image-understanding#segmentation",
    "https://ai.google.dev/gemini-api/docs/files#main-content",
    "https://ai.google.dev/gemini-api/docs/image-generation#gemini",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash",
    "https://ai.google.dev/gemini-api/docs/api-key",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash",
    "https://ai.google.dev/gemini-api/docs/libraries#new-libraries",
    "https://ai.google.dev/gemini-api/docs/image-understanding#inline-image",
    "https://ai.google.dev/gemini-api/docs/function-calling/tutorial#step-4",
    "https://ai.google.dev/gemini-api/docs/image-generation#imagen-prompt-guide",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-flash-native-audio",
    "https://ai.google.dev/gemini-api/docs/live-tools#tools-overview",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash-8b",
    "https://ai.google.dev/gemini-api/docs/rate-limits",
    "https://ai.google.dev/gemini-api/docs/function-calling#multi-tool-use",
    "https://ai.google.dev/gemini-api/docs/batch-mode#batch-job-status",
    "https://ai.google.dev/gemini-api/docs/image-generation",
    "https://ai.google.dev/gemini-api/docs/tokens",
    "https://ai.google.dev/gemini-api/docs/files#specific-instructions",
    "https://ai.google.dev/gemini-api/docs/live-guide#interruptions",
    "https://ai.google.dev/gemini-api/docs/downloads#install",
    "https://ai.google.dev/gemini-api/docs/models/gemini#imagen-4",
    "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-native-audio",
    "https://ai.google.dev/gemini-api/docs/google-search",
    "https://ai.google.dev/gemini-api/docs/video",
    "https://ai.google.dev/gemini-api/docs/models/gemini#rate-limits",
    "https://ai.google.dev/gemini-api/docs/prompting-strategies#completion",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-embedding",
    "https://ai.google.dev/gemini-api/docs/available-regions#main-content",
    "https://ai.google.dev/gemini-api/docs/text-generation#main-content",
    "https://ai.google.dev/gemini-api/docs/structured-output#generating-json",
    "https://ai.google.dev/gemini-api/docs/long-context#main-content",
    "https://ai.google.dev/gemini-api/docs/image-generation#main-content",
    "https://ai.google.dev/gemini-api/docs/billing#main-content",
    "https://ai.google.dev/gemini-api/docs/audio#upload-audio",
    "https://ai.google.dev/gemini-api/docs/sdks#install",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#live-api-2.0",
    "https://ai.google.dev/gemini-api/docs/rate-limits#main-content",
    "https://ai.google.dev/gemini-api/docs/models#live-api",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-flash-lite",
    "https://ai.google.dev/gemini-api/docs/video-understanding#inline-video",
    "https://ai.google.dev/gemini-api/docs/structured-output#main-content",
    "https://ai.google.dev/gemini-api/docs/speech-generation#supported-models",
    "https://ai.google.dev/gemini-api/docs/models#imagen-3",
    "https://ai.google.dev/gemini-api/docs/prompting-strategies#main-content",
    "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash-preview-tts",
    "https://ai.google.dev/gemini-api/docs/text-generation",
    "https://ai.google.dev/gemini-api/docs/libraries",
    "https://ai.google.dev/gemini-api/docs/openai#main-content",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#main-content",
    "https://ai.google.dev/gemini-api/docs/video#generate-from-images",
    "https://ai.google.dev/gemini-api/docs/live-session#main-content",
    "https://ai.google.dev/gemini-api/docs/image-generation#choose-a-model",
    "https://ai.google.dev/gemini-api/docs/image-generation#imagen",
    "https://ai.google.dev/gemini-api/docs/thinking#supported-models",
    "https://ai.google.dev/gemini-api/docs/billing#request-an-upgrade",
    "https://ai.google.dev/gemini-api/docs/video-understanding#main-content",
    "https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#specific-instructions",
    "https://ai.google.dev/gemini-api/docs/openai",
    "https://ai.google.dev/gemini-api/docs/image-understanding#object-detection",
    "https://ai.google.dev/gemini-api/docs/thinking#main-content",
    "https://ai.google.dev/gemini-api/docs/video-understanding#upload-video",
    "https://ai.google.dev/gemini-api/docs/quickstart",
    "https://ai.google.dev/gemini-api/docs/live-session#goaway-message",
    "https://ai.google.dev/gemini-api/docs/sdks",
    "https://ai.google.dev/gemini-api/docs/downloads",
    "https://ai.google.dev/gemini-api/docs/vision#inline-image",
    "https://ai.google.dev/gemini-api/docs/vision#main-content",
    "https://ai.google.dev/gemini-api/docs/url-context",
    "https://ai.google.dev/gemini-api/docs/batch-mode#input-file",
    "https://ai.google.dev/gemini-api/docs/pricing#veo-2",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#live-api",
    "https://ai.google.dev/gemini-api/docs/image-understanding#capabilities",
    "https://ai.google.dev/gemini-api/docs/code-execution#supported-libraries",
    "https://ai.google.dev/gemini-api/docs/live#audio-generation",
    "https://ai.google.dev/gemini-api/docs/vision#upload-image",
    "https://ai.google.dev/gemini-api/docs/text-generation?lang=python#system-instructions",
    "https://ai.google.dev/gemini-api/docs/openai#thinking",
    "https://ai.google.dev/gemini-api/docs/url-context#main-content",
    "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-pro-preview-tts",
    "https://ai.google.dev/gemini-api/docs/vision",
    "https://ai.google.dev/gemini-api/docs/models#live-api-2.0",
    "https://ai.google.dev/gemini-api/docs/caching",
    "https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash-native-audio",
    "https://ai.google.dev/gemini-api/docs/live-guide#native-audio-output",
    "https://ai.google.dev/gemini-api/docs/prompting-strategies#few-shot",
    "https://ai.google.dev/gemini-api/docs/video-understanding#youtube",
    "https://ai.google.dev/gemini-api/docs/function-calling#step-2",
    "https://ai.google.dev/gemini-api/docs/api-key#provide-api-key-explicitly",
    "https://ai.google.dev/gemini-api/docs/document-processing#main-content",
    "https://ai.google.dev/gemini-api/docs/prompting_with_media"
  ],
  "pending_urls": [],
  "downloaded_files": [],
  "stats": {
    "pages_crawled": 278,
    "documents_downloaded": 0,
    "text_files_saved": 278,
    "total_size": 5907224,
    "file_types": {}
  },
  "base_url": "https://ai.google.dev/gemini-api/docs"
}
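For context, here is a minimal sketch (not one of the uploaded files) of how a crawler might load and update this state file so an interrupted run can resume. The field names mirror crawl_state.json above; the seed URL handling and the load_state/save_state helpers are hypothetical.

import json
from pathlib import Path

STATE_PATH = Path("crawl_state.json")

def load_state():
    # Resume from a previous run if the state file exists.
    if STATE_PATH.exists():
        return json.loads(STATE_PATH.read_text())
    # Otherwise start a fresh crawl seeded with the base URL (assumption).
    return {
        "visited_urls": [],
        "pending_urls": ["https://ai.google.dev/gemini-api/docs"],
        "downloaded_files": [],
        "stats": {"pages_crawled": 0, "documents_downloaded": 0,
                  "text_files_saved": 0, "total_size": 0, "file_types": {}},
        "base_url": "https://ai.google.dev/gemini-api/docs",
    }

def save_state(state):
    # Persist after each page so progress survives interruptions.
    STATE_PATH.write_text(json.dumps(state, indent=2))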
text_content/docs_batch-mode_214118ee.txt
ADDED
@@ -0,0 +1,5 @@
URL: https://ai.google.dev/gemini-api/docs/batch-mode#input-file
Title: Batch Mode | Gemini API | Google AI for Developers
==================================================

Batch Mode | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Batch Mode The Gemini API's Batch Mode is designed to process large volumes of requests asynchronously at 50% of the standard cost . The target turnaround time is 24 hours, but in majority of cases, it is much quicker. Use Batch Mode for large-scale, non-urgent tasks such as data pre-processing or running evaluations where an immediate response is not required. Note: You can use Batch Mode with the Gemini API Python SDK or the REST API. Support for Batch Mode in the Gemini API JavaScript SDK is coming soon. Getting Started This section helps you get started with submitting your first requests in batch mode. Creating a batch job You have two ways to submit your requests in Batch Mode: Inline Requests : A list of GenerateContentRequest objects directly included in your batch creation request. This is suitable for smaller batches that keep the total request size under 20MB. The output returned from the model is a list of inlineResponse objects. Input File : A JSON Lines (JSONL) file where each line contains a complete GenerateContentRequest object. This method is recommended for larger requests. The output returned from the model is a JSONL file where each line is either a GenerateContentResponse or a status object. Inline requests For a small number of requests, you can directly embed the GenerateContentRequest objects within your BatchGenerateContentRequest . The following example calls the BatchGenerateContent method with inline requests: Python from google import genai from google.genai import types client = genai . Client () # A list of dictionaries, where each is a GenerateContentRequest inline_requests = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Tell me a one-sentence joke.' }], 'role' : 'user' }] }, { 'contents' : [{ 'parts' : [{ 'text' : 'Why is the sky blue?' }], 'role' : 'user' }] } ] inline_batch_job = client . batches . create ( model = "models/gemini-2.5-flash" , src = inline_requests , config = { 'display_name' : "inlined-requests-job-1" , }, ) print ( f "Created batch job: { inline_batch_job . name } " ) REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type:application/json" \ -d '{ "batch": { "display_name": "my-batch-requests", "input_config": { "requests": { "requests": [ { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-1" } }, { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-2" } } ] } } } }' You can use any requests you would use in non-batch (or interactive) mode. For example, you could specify the temperature, system instructions or even pass in other modalities. The following example shows some example inline requests that contain a system instruction for one of the requests: inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cloud.' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cat.' 
}]}], 'system_instructions' : { 'parts' : [{ 'text' : 'You are a cat. Your name is Neko.' }]}} ] Similarly can also specify tools to use for a request. The following example shows a request that enables the Google Search tool : inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 1998?' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 2025?' }]}], 'tools' : [{ 'google_search ' : {}}]} ] Input file For larger sets of requests, prepare a JSON Lines (JSONL) file. Each line in this file must be a JSON object containing a user-defined key and a request object, where the request is a valid GenerateContentRequest object. The user-defined key is used in the response to indicate which output is the result of which request. For example, the request with the key defined as request-1 will have its response annotated with the same key name. This file is uploaded using the File API . The maximum allowed file size for an input file is 2GB. The following is an example of a JSONL file. You can save it in a file named my-batch-requests.json : { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}], "generation_config" : { "temperature" : 0.7 }}} { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} Similarly to inline requests, you can specify other parameters like system instructions, tools or other configurations in each request JSON. You can upload this file using the File API as shown in the following example. If you are working with multimodal input, you can reference other uploaded files within your JSONL file. Python from google import genai from google.genai import types client = genai . Client () # Create a sample JSONL file with open ( "my-batch-requests.jsonl" , "w" ) as f : requests = [ { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}]}}, { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} ] for req in requests : f . write ( json . dumps ( req ) + " \n " ) # Upload the file to the File API uploaded_file = client . files . upload ( file = 'my-batch-requests.jsonl' , config = types . UploadFileConfig ( display_name = 'my-batch-requests' , mime_type = 'jsonl' ) ) print ( f "Uploaded file: { uploaded_file . name } " ) REST tmp_batch_input_file = batch_input.tmp echo -e '{"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}], "generationConfig": {"temperature": 0.7}}\n{"contents": [{"parts": [{"text": "What are the main ingredients in a Margherita pizza?"}]}]}' > batch_input.tmp MIME_TYPE = $( file -b --mime-type " ${ tmp_batch_input_file } " ) NUM_BYTES = $( wc -c < " ${ tmp_batch_input_file } " ) DISPLAY_NAME = BatchInput tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl "https://generativelanguage.googleapis.com/upload/v1beta/files \ -D " ${ tmp_header_file } " \ -H " x-goog-api-key: $GEMINI_API_KEY " \ -H " X-Goog-Upload-Protocol: resumable " \ -H " X-Goog-Upload-Command: start " \ -H " X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H " Content-Type: application/jsonl " \ -d " { 'file' : { 'display_name' : '${DISPLAY_NAME}' }} " 2> /dev/null upload_url= $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H " Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Offset: 0 " \ -H " X-Goog-Upload-Command: upload, finalize " \ --data-binary " @ ${ tmp_batch_input_file } " 2> /dev/null > file_info.json file_uri= $( jq ".file.uri" file_info.json ) The following example calls the BatchGenerateContent method with the input file uploaded using File API: Python # Assumes `uploaded_file` is the file object from the previous step file_batch_job = client . batches . create ( model = "gemini-2.5-flash" , src = uploaded_file . name , config = { 'display_name' : "file-upload-job-1" , }, ) print ( f "Created batch job: { file_batch_job . name } " ) REST BATCH_INPUT_FILE = 'files/123456' # File ID curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -X POST \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" \ -d "{ 'batch': { 'display_name': 'my-batch-requests', 'input_config': { 'requests': { 'file_name': ${ BATCH_INPUT_FILE } } } } }" When you create a batch job, you will get a job name returned. Use this name for monitoring the job status as well as retrieving the results once the job completes. The following is an example output that contains a job name: Created batch job from file: batches/123456789 Monitoring job status Use the operation name obtained when creating the batch job to poll its status. The state field of the batch job will indicate its current status. A batch job can be in one of the following states: JOB_STATE_PENDING : The job has been created and is waiting to be processed by the service. JOB_STATE_SUCCEEDED : The job completed successfully. You can now retrieve the results. JOB_STATE_FAILED : The job failed. Check the error details for more information. JOB_STATE_CANCELLED : The job was cancelled by the user. You can poll the job status periodically to check for completion. Python # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" # (e.g. 'batches/your-batch-id') batch_job = client . batches . get ( name = job_name ) completed_states = set ([ 'JOB_STATE_SUCCEEDED' , 'JOB_STATE_FAILED' , 'JOB_STATE_CANCELLED' , ]) print ( f "Polling status for job: { job_name } " ) batch_job = client . batches . get ( name = job_name ) # Initial get while batch_job . state . name not in completed_states : print ( f "Current state: { batch_job . state . name } " ) time . sleep ( 30 ) # Wait for 30 seconds before polling again batch_job = client . batches . get ( name = job_name ) print ( f "Job finished with state: { batch_job . state . name } " ) if batch_job . state . name == 'JOB_STATE_FAILED' : print ( f "Error: { batch_job . error } " ) Retrieving results Once the job status indicates your batch job has succeeded, the results are available in the response field. 
Python import json # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" batch_job = client . batches . get ( name = job_name ) if batch_job . state . name == 'JOB_STATE_SUCCEEDED' : # If batch job was created with a file if batch_job . dest and batch_job . dest . file_name : # Results are in a file result_file_name = batch_job . dest . file_name print ( f "Results are in file: { result_file_name } " ) print ( "Downloading result file content..." ) file_content = client . files . download ( file = result_file_name ) # Process file_content (bytes) as needed print ( file_content . decode ( 'utf-8' )) # If batch job was created with inline request elif batch_job . dest and batch_job . dest . inlined_responses : # Results are inline print ( "Results are inline:" ) for i , inline_response in enumerate ( batch_job . dest . inlined_responses ): print ( f "Response { i + 1 } :" ) if inline_response . response : # Accessing response, structure may vary. try : print ( inline_response . response . text ) except AttributeError : print ( inline_response . response ) # Fallback elif inline_response . error : print ( f "Error: { inline_response . error } " ) else : print ( "No results found (neither file nor inline)." ) else : print ( f "Job did not succeed. Final state: { batch_job . state . name } " ) if batch_job . error : print ( f "Error: { batch_job . error } " ) REST BATCH_NAME = "batches/123456" # Your batch job name curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null > batch_status.json if jq -r '.done' batch_status.json | grep -q "false" ; then echo "Batch has not finished processing" fi batch_state = $( jq -r '.metadata.state' batch_status.json ) if [[ $batch_state = "JOB_STATE_SUCCEEDED" ]] ; then if [[ $( jq '.response | has("inlinedResponses")' batch_status.json ) = "true" ]] ; then jq -r '.response.inlinedResponses' batch_status.json exit fi responses_file_name = $( jq -r '.response.responsesFile' batch_status.json ) curl https://generativelanguage.googleapis.com/download/v1beta/ $responses_file_name :download?alt = media \ -H "x-goog-api-key: $GEMINI_API_KEY " 2 > /dev/null elif [[ $batch_state = "JOB_STATE_FAILED" ]] ; then jq '.error' batch_status.json elif [[ $batch_state == "JOB_STATE_CANCELLED" ]] ; then echo "Batch was cancelled by the user" fi Cancelling a batch job You can cancel an ongoing batch job using its name. When a job is canceled, it stops processing new requests. Python # Cancel a batch job client . batches . cancel ( name = batch_job_to_cancel . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :cancel \ -H "x-goog-api-key: $GEMINI_API_KEY " \ # Confirm that the status of the batch after cancellation is JOB_STATE_CANCELLED curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null | jq -r '.metadata.state' Deleting a batch job You can delete an existing batch job using its name. When a job is deleted, it stops processing new requests and is removed from the list of batch jobs. Python # Delete a batch job client . batches . delete ( name = batch_job_to_delete . 
name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :delete \ -H "x-goog-api-key: $GEMINI_API_KEY " \ Technical details Supported models: Batch Mode supports a range of Gemini models. Refer to the Models page for the latest list of compatible models. The supported modalities for Batch Mode are the same as what's supported on the interactive (or non-batch mode) API. Pricing: Batch Mode usage is priced at 50% of the standard interactive API cost for the equivalent model. Service Level Objective (SLO): Batch jobs are designed to complete within a 24-hour turnaround time. Many jobs may complete much faster depending on their size and current system load. Caching: Context caching is enabled for batch requests. If a request in your batch results in a cache hit, the cached tokens are priced the same as for non-batch mode traffic. Best practices Use input files for large requests: For a large number of requests, always use the file input method for better manageability and to avoid hitting request size limits for the BatchGenerateContent call itself. Note that there's a the 2GB file size limit per input file. Error handling: Check the batchStats for failedRequestCount after a job completes. If using file output, parse each line to check if it's a GenerateContentResponse or a status object indicating an error for that specific request. Submit jobs once: The creation of a batch job is not idempotent. If you send the same creation request twice, two separate batch jobs will be created. Break up very large batches: While the target turnaround time is 24 hours, actual processing time can vary based on system load and job size. For large jobs, consider breaking them into smaller batches if intermediate results are needed sooner. What's next Check out the batch mode notebook for more examples. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
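Following the "break up very large batches" and request-size guidance above, the sketch below splits a large list of inline requests into several smaller batch jobs. It is not taken from the page itself: the 500-request chunk size and display-name scheme are arbitrary assumptions, while client.batches.create is used exactly as in the inline-requests example above.

from google import genai

client = genai.Client()

def submit_in_chunks(all_requests, chunk_size=500):
    # all_requests: list of inline GenerateContentRequest dicts, as shown above.
    job_names = []
    for i in range(0, len(all_requests), chunk_size):
        chunk = all_requests[i:i + chunk_size]
        job = client.batches.create(
            model="models/gemini-2.5-flash",
            src=chunk,  # inline requests for this smaller job
            config={"display_name": f"chunked-job-{i // chunk_size}"},
        )
        job_names.append(job.name)
    # Poll each returned name with client.batches.get(name=...) as shown above.
    return job_names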
text_content/docs_caching_85703b6c.txt
ADDED
@@ -0,0 +1,5 @@
URL: https://ai.google.dev/gemini-api/docs/caching#main-content
Title: Context caching | Gemini API | Google AI for Developers
==================================================

Context caching | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Context caching Python JavaScript Go REST In a typical AI workflow, you might pass the same input tokens over and over to a model. The Gemini API offers two different caching mechanisms: Implicit caching (automatically enabled on Gemini 2.5 models, no cost saving guarantee) Explicit caching (can be manually enabled on most models, cost saving guarantee) Explicit caching is useful in cases where you want to guarantee cost savings, but with some added developer work. Implicit caching Implicit caching is enabled by default for all Gemini 2.5 models. We automatically pass on cost savings if your request hits caches. There is nothing you need to do in order to enable this. It is effective as of May 8th, 2025. The minimum input token count for context caching is 1,024 for 2.5 Flash and 2,048 for 2.5 Pro. To increase the chance of an implicit cache hit: Try putting large and common contents at the beginning of your prompt Try to send requests with similar prefix in a short amount of time You can see the number of tokens which were cache hits in the response object's usage_metadata field. Explicit caching Using the Gemini API explicit caching feature, you can pass some content to the model once, cache the input tokens, and then refer to the cached tokens for subsequent requests. At certain volumes, using cached tokens is lower cost than passing in the same corpus of tokens repeatedly. When you cache a set of tokens, you can choose how long you want the cache to exist before the tokens are automatically deleted. This caching duration is called the time to live (TTL). If not set, the TTL defaults to 1 hour. The cost for caching depends on the input token size and how long you want the tokens to persist. This section assumes that you've installed a Gemini SDK (or have curl installed) and that you've configured an API key, as shown in the quickstart . Explicit caching using the OpenAI library If you're using an OpenAI library , you can enable explicit caching using the cached_content property on extra_body . When to use explicit caching Context caching is particularly well suited to scenarios where a substantial initial context is referenced repeatedly by shorter requests. Consider using context caching for use cases such as: Chatbots with extensive system instructions Repetitive analysis of lengthy video files Recurring queries against large document sets Frequent code repository analysis or bug fixing How explicit caching reduces costs Context caching is a paid feature designed to reduce overall operational costs. Billing is based on the following factors: Cache token count: The number of input tokens cached, billed at a reduced rate when included in subsequent prompts. Storage duration: The amount of time cached tokens are stored (TTL), billed based on the TTL duration of cached token count. There are no minimum or maximum bounds on the TTL. Other factors: Other charges apply, such as for non-cached input tokens and output tokens. For up-to-date pricing details, refer to the Gemini API pricing page . To learn how to count tokens, see the Token guide . 
Additional considerations Keep the following considerations in mind when using context caching: The minimum input token count for context caching is 1,024 for 2.5 Flash and 2,048 for 2.5 Pro. The maximum is the same as the maximum for the given model. (For more on counting tokens, see the Token guide ). The model doesn't make any distinction between cached tokens and regular input tokens. Cached content is a prefix to the prompt. There are no special rate or usage limits on context caching; the standard rate limits for GenerateContent apply, and token limits include cached tokens. The number of cached tokens is returned in the usage_metadata from the create, get, and list operations of the cache service, and also in GenerateContent when using the cache. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
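The page above describes explicit caching, but this capture includes no code sample, so the following is a minimal sketch of the documented create-then-reference pattern with the google-genai Python SDK. The uploaded file name, system instruction, and one-hour TTL are placeholder assumptions.

from google import genai
from google.genai import types

client = genai.Client()

# Cache a large document once; later requests reuse the cached tokens.
doc = client.files.upload(file="large_report.pdf")  # placeholder file
cache = client.caches.create(
    model="gemini-2.5-flash",
    config=types.CreateCachedContentConfig(
        system_instruction="You answer questions about the attached report.",
        contents=[doc],
        ttl="3600s",  # explicit TTL; defaults to 1 hour if unset
    ),
)

# Reference the cache instead of resending the document with every prompt.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize the key findings.",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.usage_metadata)  # reports how many input tokens hit the cache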
text_content/docs_code-execution_857b45c1.txt
ADDED
@@ -0,0 +1,5 @@
URL: https://ai.google.dev/gemini-api/docs/code-execution#supported-libraries
Title: Code execution | Gemini API | Google AI for Developers
==================================================

Code execution | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Code execution The Gemini API provides a code execution tool that enables the model to generate and run Python code. The model can then learn iteratively from the code execution results until it arrives at a final output. You can use code execution to build applications that benefit from code-based reasoning. For example, you can use code execution to solve equations or process text. You can also use the libraries included in the code execution environment to perform more specialized tasks. Gemini is only able to execute code in Python. You can still ask Gemini to generate code in another language, but the model can't use the code execution tool to run it. Enable code execution To enable code execution, configure the code execution tool on the model. This allows the model to generate and run code. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What is the sum of the first 50 prime numbers? " "Generate and run code for the calculation, and make sure you get all 50." , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); let response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." , ], config : { tools : [{ codeExecution : {} }], }, }); const parts = response ? . candidates ? .[ 0 ] ? . content ? . parts || []; parts . forEach (( part ) = > { if ( part . text ) { console . log ( part . text ); } if ( part . executableCode && part . executableCode . code ) { console . log ( part . executableCode . code ); } if ( part . codeExecutionResult && part . codeExecutionResult . output ) { console . log ( part . codeExecutionResult . output ); } }); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." ), config , ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . 
CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d ' {"tools": [{"code_execution": {}}], "contents": { "parts": { "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." } }, }' Note: This REST example doesn't parse the JSON response as shown in the example output. The output might look something like the following, which has been formatted for readability: Okay, I need to calculate the sum of the first 50 prime numbers. Here's how I'll approach this: 1. **Generate Prime Numbers:** I'll use an iterative method to find prime numbers. I'll start with 2 and check if each subsequent number is divisible by any number between 2 and its square root. If not, it's a prime. 2. **Store Primes:** I'll store the prime numbers in a list until I have 50 of them. 3. **Calculate the Sum:** Finally, I'll sum the prime numbers in the list. Here's the Python code to do this: def is_prime(n): """Efficiently checks if a number is prime.""" if n <= 1: return False if n <= 3: return True if n % 2 == 0 or n % 3 == 0: return False i = 5 while i * i <= n: if n % i == 0 or n % (i + 2) == 0: return False i += 6 return True primes = [] num = 2 while len(primes) < 50: if is_prime(num): primes.append(num) num += 1 sum_of_primes = sum(primes) print(f'{primes=}') print(f'{sum_of_primes=}') primes=[2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229] sum_of_primes=5117 The sum of the first 50 prime numbers is 5117. This output combines several content parts that the model returns when using code execution: text : Inline text generated by the model executableCode : Code generated by the model that is meant to be executed codeExecutionResult : Result of the executable code The naming conventions for these parts vary by programming language. Use code execution in chat You can also use code execution as part of a chat. Python from google import genai from google.genai import types client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) response = chat . send_message ( "I have a math question for you." ) print ( response . text ) response = chat . send_message ( "What is the sum of the first 50 prime numbers? " "Generate and run code for the calculation, and make sure you get all 50." ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "I have a math question for you:" }], }, { role : "model" , parts : [{ text : "Great! I'm ready for your math question. Please ask away." }], }, ], config : { tools : [{ codeExecution : {}}], } }); const response = await chat . sendMessage ({ message : "What is the sum of the first 50 prime numbers? 
" + "Generate and run code for the calculation, and make sure you get all 50." }); console . log ( "Chat response:" , response . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , config , nil , ) result , _ := chat . SendMessage ( ctx , genai . Part { Text : "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and " + "make sure you get all 50." , }, ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"tools": [{"code_execution": {}}], "contents": [ { "role": "user", "parts": [{ "text": "Can you print \"Hello world!\"?" }] },{ "role": "model", "parts": [ { "text": "" }, { "executable_code": { "language": "PYTHON", "code": "\nprint(\"hello world!\")\n" } }, { "code_execution_result": { "outcome": "OUTCOME_OK", "output": "hello world!\n" } }, { "text": "I have printed \"hello world!\" using the provided python code block. \n" } ], },{ "role": "user", "parts": [{ "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." }] } ] }' Input/output (I/O) Starting with Gemini 2.0 Flash , code execution supports file input and graph output. Using these input and output capabilities, you can upload CSV and text files, ask questions about the files, and have Matplotlib graphs generated as part of the response. The output files are returned as inline images in the response. I/O pricing When using code execution I/O, you're charged for input tokens and output tokens: Input tokens: User prompt Output tokens: Code generated by the model Code execution output in the code environment Thinking tokens Summary generated by the model I/O details When you're working with code execution I/O, be aware of the following technical details: The maximum runtime of the code environment is 30 seconds. If the code environment generates an error, the model may decide to regenerate the code output. This can happen up to 5 times. The maximum file input size is limited by the model token window. In AI Studio, using Gemini Flash 2.0, the maximum input file size is 1 million tokens (roughly 2MB for text files of the supported input types). If you upload a file that's too large, AI Studio won't let you send it. Code execution works best with text and CSV files. The input file can be passed in part.inlineData or part.fileData (uploaded via the Files API ), and the output file is always returned as part.inlineData . Single turn Bidirectional (Multimodal Live API) Models supported All Gemini 2.0 and 2.5 models Only Flash experimental models File input types supported .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts Plotting libraries supported Matplotlib, seaborn Matplotlib, seaborn Multi-tool use Yes (code execution + grounding only) Yes Billing There's no additional charge for enabling code execution from the Gemini API. 
You'll be billed at the current rate of input and output tokens based on the Gemini model you're using. Here are a few other things to know about billing for code execution: You're only billed once for the input tokens you pass to the model, and you're billed for the final output tokens returned to you by the model. Tokens representing generated code are counted as output tokens. Generated code can include text and multimodal output like images. Code execution results are also counted as output tokens. The billing model is shown in the following diagram: You're billed at the current rate of input and output tokens based on the Gemini model you're using. If Gemini uses code execution when generating your response, the original prompt, the generated code, and the result of the executed code are labeled intermediate tokens and are billed as input tokens . Gemini then generates a summary and returns the generated code, the result of the executed code, and the final summary. These are billed as output tokens . The Gemini API includes an intermediate token count in the API response, so you know why you're getting additional input tokens beyond your initial prompt. Limitations The model can only generate and execute code. It can't return other artifacts like media files. In some cases, enabling code execution can lead to regressions in other areas of model output (for example, writing a story). There is some variation in the ability of the different models to use code execution successfully. Supported libraries The code execution environment includes the following libraries: attrs chess contourpy fpdf geopandas imageio jinja2 joblib jsonschema jsonschema-specifications lxml matplotlib mpmath numpy opencv-python openpyxl packaging pandas pillow protobuf pylatex pyparsing PyPDF2 python-dateutil python-docx python-pptx reportlab scikit-learn scipy seaborn six striprtf sympy tabulate tensorflow toolz xlrd You can't install your own libraries. Note: Only matplotlib is supported for graph rendering using code execution. What's next Try the code execution Colab . Learn about other Gemini API tools: Function calling Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
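The I/O section above describes CSV input and Matplotlib graph output but doesn't show the round trip in code. The following is a minimal sketch, assuming a local file named data.csv, that the Files API upload accepts a local path, and that chart bytes come back in part.inline_data as described above; the file name, prompt, and output handling are illustrative rather than taken from the page.

Python

from google import genai
from google.genai import types

client = genai.Client()

# Upload the CSV via the Files API so it is passed to the model as part.fileData.
uploaded = client.files.upload(file="data.csv")  # illustrative file name

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        uploaded,
        "Plot the monthly totals in this CSV as a Matplotlib bar chart "
        "and describe any trend you see.",
    ],
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution)]
    ),
)

# Text, generated code, execution output, and inline images arrive as separate parts.
for i, part in enumerate(response.candidates[0].content.parts):
    if part.text is not None:
        print(part.text)
    if part.executable_code is not None:
        print(part.executable_code.code)
    if part.code_execution_result is not None:
        print(part.code_execution_result.output)
    if part.inline_data is not None:  # e.g. the rendered Matplotlib chart
        with open(f"chart_{i}.png", "wb") as f:
            f.write(part.inline_data.data)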
|
text_content/docs_code-execution_d708f71c.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/code-execution#main-content
|
2 |
+
Title: Code execution | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Code execution | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Code execution The Gemini API provides a code execution tool that enables the model to generate and run Python code. The model can then learn iteratively from the code execution results until it arrives at a final output. You can use code execution to build applications that benefit from code-based reasoning. For example, you can use code execution to solve equations or process text. You can also use the libraries included in the code execution environment to perform more specialized tasks. Gemini is only able to execute code in Python. You can still ask Gemini to generate code in another language, but the model can't use the code execution tool to run it. Enable code execution To enable code execution, configure the code execution tool on the model. This allows the model to generate and run code. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What is the sum of the first 50 prime numbers? " "Generate and run code for the calculation, and make sure you get all 50." , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); let response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." , ], config : { tools : [{ codeExecution : {} }], }, }); const parts = response ? . candidates ? .[ 0 ] ? . content ? . parts || []; parts . forEach (( part ) = > { if ( part . text ) { console . log ( part . text ); } if ( part . executableCode && part . executableCode . code ) { console . log ( part . executableCode . code ); } if ( part . codeExecutionResult && part . codeExecutionResult . output ) { console . log ( part . codeExecutionResult . output ); } }); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." ), config , ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . 
CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d ' {"tools": [{"code_execution": {}}], "contents": { "parts": { "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." } }, }' Note: This REST example doesn't parse the JSON response as shown in the example output. The output might look something like the following, which has been formatted for readability: Okay, I need to calculate the sum of the first 50 prime numbers. Here's how I'll approach this: 1. **Generate Prime Numbers:** I'll use an iterative method to find prime numbers. I'll start with 2 and check if each subsequent number is divisible by any number between 2 and its square root. If not, it's a prime. 2. **Store Primes:** I'll store the prime numbers in a list until I have 50 of them. 3. **Calculate the Sum:** Finally, I'll sum the prime numbers in the list. Here's the Python code to do this: def is_prime(n): """Efficiently checks if a number is prime.""" if n <= 1: return False if n <= 3: return True if n % 2 == 0 or n % 3 == 0: return False i = 5 while i * i <= n: if n % i == 0 or n % (i + 2) == 0: return False i += 6 return True primes = [] num = 2 while len(primes) < 50: if is_prime(num): primes.append(num) num += 1 sum_of_primes = sum(primes) print(f'{primes=}') print(f'{sum_of_primes=}') primes=[2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229] sum_of_primes=5117 The sum of the first 50 prime numbers is 5117. This output combines several content parts that the model returns when using code execution: text : Inline text generated by the model executableCode : Code generated by the model that is meant to be executed codeExecutionResult : Result of the executable code The naming conventions for these parts vary by programming language. Use code execution in chat You can also use code execution as part of a chat. Python from google import genai from google.genai import types client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) response = chat . send_message ( "I have a math question for you." ) print ( response . text ) response = chat . send_message ( "What is the sum of the first 50 prime numbers? " "Generate and run code for the calculation, and make sure you get all 50." ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "I have a math question for you:" }], }, { role : "model" , parts : [{ text : "Great! I'm ready for your math question. Please ask away." }], }, ], config : { tools : [{ codeExecution : {}}], } }); const response = await chat . sendMessage ({ message : "What is the sum of the first 50 prime numbers? 
" + "Generate and run code for the calculation, and make sure you get all 50." }); console . log ( "Chat response:" , response . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , config , nil , ) result , _ := chat . SendMessage ( ctx , genai . Part { Text : "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and " + "make sure you get all 50." , }, ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"tools": [{"code_execution": {}}], "contents": [ { "role": "user", "parts": [{ "text": "Can you print \"Hello world!\"?" }] },{ "role": "model", "parts": [ { "text": "" }, { "executable_code": { "language": "PYTHON", "code": "\nprint(\"hello world!\")\n" } }, { "code_execution_result": { "outcome": "OUTCOME_OK", "output": "hello world!\n" } }, { "text": "I have printed \"hello world!\" using the provided python code block. \n" } ], },{ "role": "user", "parts": [{ "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." }] } ] }' Input/output (I/O) Starting with Gemini 2.0 Flash , code execution supports file input and graph output. Using these input and output capabilities, you can upload CSV and text files, ask questions about the files, and have Matplotlib graphs generated as part of the response. The output files are returned as inline images in the response. I/O pricing When using code execution I/O, you're charged for input tokens and output tokens: Input tokens: User prompt Output tokens: Code generated by the model Code execution output in the code environment Thinking tokens Summary generated by the model I/O details When you're working with code execution I/O, be aware of the following technical details: The maximum runtime of the code environment is 30 seconds. If the code environment generates an error, the model may decide to regenerate the code output. This can happen up to 5 times. The maximum file input size is limited by the model token window. In AI Studio, using Gemini Flash 2.0, the maximum input file size is 1 million tokens (roughly 2MB for text files of the supported input types). If you upload a file that's too large, AI Studio won't let you send it. Code execution works best with text and CSV files. The input file can be passed in part.inlineData or part.fileData (uploaded via the Files API ), and the output file is always returned as part.inlineData . Single turn Bidirectional (Multimodal Live API) Models supported All Gemini 2.0 and 2.5 models Only Flash experimental models File input types supported .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts Plotting libraries supported Matplotlib, seaborn Matplotlib, seaborn Multi-tool use Yes (code execution + grounding only) Yes Billing There's no additional charge for enabling code execution from the Gemini API. 
You'll be billed at the current rate of input and output tokens based on the Gemini model you're using. Here are a few other things to know about billing for code execution: You're only billed once for the input tokens you pass to the model, and you're billed for the final output tokens returned to you by the model. Tokens representing generated code are counted as output tokens. Generated code can include text and multimodal output like images. Code execution results are also counted as output tokens. The billing model is shown in the following diagram: You're billed at the current rate of input and output tokens based on the Gemini model you're using. If Gemini uses code execution when generating your response, the original prompt, the generated code, and the result of the executed code are labeled intermediate tokens and are billed as input tokens . Gemini then generates a summary and returns the generated code, the result of the executed code, and the final summary. These are billed as output tokens . The Gemini API includes an intermediate token count in the API response, so you know why you're getting additional input tokens beyond your initial prompt. Limitations The model can only generate and execute code. It can't return other artifacts like media files. In some cases, enabling code execution can lead to regressions in other areas of model output (for example, writing a story). There is some variation in the ability of the different models to use code execution successfully. Supported libraries The code execution environment includes the following libraries: attrs chess contourpy fpdf geopandas imageio jinja2 joblib jsonschema jsonschema-specifications lxml matplotlib mpmath numpy opencv-python openpyxl packaging pandas pillow protobuf pylatex pyparsing PyPDF2 python-dateutil python-docx python-pptx reportlab scikit-learn scipy seaborn six striprtf sympy tabulate tensorflow toolz xlrd You can't install your own libraries. Note: Only matplotlib is supported for graph rendering using code execution. What's next Try the code execution Colab . Learn about other Gemini API tools: Function calling Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
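The billing notes above mention that the API reports an intermediate token count so you can see why input tokens exceed your original prompt. The sketch below inspects usage metadata after a code-execution request; the prompt, candidates, and total token fields are standard response fields, while the name of the intermediate (tool-use) field is an assumption and is read defensively.

Python

from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is the sum of the first 50 prime numbers? "
             "Generate and run code for the calculation, and make sure you get all 50.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution)]
    ),
)

usage = response.usage_metadata
print("Prompt tokens:", usage.prompt_token_count)
print("Output tokens:", usage.candidates_token_count)
print("Total tokens:", usage.total_token_count)
# The intermediate (tool-use) count covers the generated code and execution
# result fed back to the model; the field name here is an assumption.
print("Intermediate tokens:", getattr(usage, "tool_use_prompt_token_count", None))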
|
text_content/docs_downloads_7592cb44.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/downloads#install
|
2 |
+
Title: Gemini API libraries | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: <dependencies> <dependency> <groupId>com.google.genai</groupId> <artifactId>google-genai</artifactId> <version>1.0.0</version> </dependency> </dependencies> General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. 
google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC.
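As a quick check that the new library is installed and picking up your API key, a minimal request like the following can help; it assumes the GEMINI_API_KEY environment variable is set, which the client reads by default, and the model name is an illustrative choice.

Python

from google import genai

# The client reads GEMINI_API_KEY from the environment by default.
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Say hello in one short sentence.",
)
print(response.text)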
|
text_content/docs_embeddings_1c200669.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/embeddings#main-content
|
2 |
+
Title: Embeddings | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Embeddings | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Embeddings Note: Introducing our first Gemini embedding model, available now to developers as gemini-embedding-exp-03-07 in the API. The Gemini API supports several embedding models that generate embeddings for words, phrases, code, and sentences. The resulting embeddings can then be used for tasks such as semantic search, text classification, and clustering, among many others. What are embeddings? Embeddings are numerical representations of text (or other media formats) that capture relationships between inputs. Text embeddings work by converting text into arrays of floating point numbers, called vectors . These vectors are designed to capture the meaning of the text. The length of the embedding array is called the vector's dimensionality . A passage of text might be represented by a vector containing hundreds of dimensions. Embeddings capture semantic meaning and context, which results in text with similar meanings having "closer" embeddings. For example, the sentence "I took my dog to the vet" and "I took my cat to the vet" would have embeddings that are close to each other in the vector space. You can use embeddings to compare different texts and understand how they relate. For example, if the embeddings of the text "cat" and "dog" are close together you can infer that these words are similar in meaning, context, or both. This enables a variety of common AI use cases . Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. Generate embeddings Use the embedContent method to generate text embeddings: Python from google import genai client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' , }); console . log ( response . embeddings ); } main (); Go package main import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := [] * genai . Content { genai . NewContentFromText ( "What is the meaning of life?" , genai . RoleUser ), } result , err := client . Models . EmbedContent ( ctx , "gemini-embedding-exp-03-07" , contents , nil , ) if err != nil { log . Fatal ( err ) } embeddings , err := json . MarshalIndent ( result . Embeddings , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . 
Println ( string ( embeddings )) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]} }' You can also generate embeddings for multiple chunks at once by passing them in as a list of strings. Task types When building Retrieval Augmented Generation (RAG) systems, a common design is to use text embeddings to perform a similarity search. In some cases this can lead to degraded quality, because questions and their answers are not semantically similar. For example, a question like "Why is the sky blue?" and its answer "The scattering of sunlight causes the blue color," have distinctly different meanings as statements, which means that a RAG system won't automatically recognize their relation. Task types enable you to generate optimized embeddings for specific tasks, saving you time and cost and improving performance. Python from google import genai from google.genai import types client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" , config = types . EmbedContentConfig ( task_type = "SEMANTIC_SIMILARITY" ) ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' , config : { taskType : "SEMANTIC_SIMILARITY" , } }); console . log ( response . embeddings ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]}, "taskType": "SEMANTIC_SIMILARITY" }' Supported task types Task type Description SEMANTIC_SIMILARITY Used to generate embeddings that are optimized to assess text similarity. CLASSIFICATION Used to generate embeddings that are optimized to classify texts according to preset labels. CLUSTERING Used to generate embeddings that are optimized to cluster texts based on their similarities. RETRIEVAL_DOCUMENT , RETRIEVAL_QUERY , QUESTION_ANSWERING , and FACT_VERIFICATION Used to generate embeddings that are optimized for document search or information retrieval. CODE_RETRIEVAL_QUERY Used to retrieve a code block based on a natural language query, such as sort an array or reverse a linked list. Embeddings of the code blocks are computed using RETRIEVAL_DOCUMENT . Use cases Text embeddings are used in a variety of common AI use cases, such as: Information retrieval: You can use embeddings to retrieve semantically similar text given a piece of input text. Document search tutorial task Clustering: Comparing groups of embeddings can help identify hidden trends. Embedding clustering tutorial bubble_chart Vector database: As you take different embedding use cases to production, it is common to store embeddings in a vector database. Vector database tutorial bolt Classification: You can train a model using embeddings to classify documents into categories. 
Classification tutorial token Embedding models The Gemini API offers three models that generate text embeddings: gemini-embedding-exp-03-07 text-embedding-004 embedding-001 What's next Check out the embeddings quickstart notebook . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
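To make the "closer embeddings for similar meaning" idea concrete, the following sketch embeds the two vet sentences from the example above and compares them with cosine similarity; numpy and the similarity metric are illustrative choices, not part of the embeddings API itself.

Python

import numpy as np
from google import genai

client = genai.Client()
result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents=[
        "I took my dog to the vet",
        "I took my cat to the vet",
    ],
)

# Each entry in result.embeddings carries its vector under .values.
a = np.array(result.embeddings[0].values)
b = np.array(result.embeddings[1].values)

cosine = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"Cosine similarity: {cosine:.4f}")  # values near 1.0 indicate similar meaning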
|
text_content/docs_ephemeral-tokens_14d8195d.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/ephemeral-tokens#main-content
|
2 |
+
Title: Ephemeral tokens | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Ephemeral tokens | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Ephemeral tokens Ephemeral tokens are short-lived authentication tokens for accessing the Gemini API through WebSockets . They are designed to enhance security when you are connecting directly from a user's device to the API (a client-to-server implementation). Like standard API keys, ephemeral tokens can be extracted from client-side applications such as web browsers or mobile apps. But because ephemeral tokens expire quickly and can be restricted, they significantly reduce the security risks in a production environment. Note: Ephemeral tokens are only compatible with Live API at this time. You should use them when accessing the Live API directly from client-side applications to enhance API key security. How ephemeral tokens work Here's how ephemeral tokens work at a high level: Your client (e.g. web app) authenticates with your backend. Your backend requests an ephemeral token from Gemini API's provisioning service. Gemini API issues a short-lived token. Your backend sends the token to the client for WebSocket connections to Live API. You can do this by swapping your API key with an ephemeral token. The client then uses the token as if it were an API key. This enhances security because even if extracted, the token is short-lived, unlike a long-lived API key deployed client-side. Since the client sends data directly to Gemini, this also improves latency and avoids your backends needing to proxy the real time data. Create an ephemeral token Here is a simplified example of how to get an ephemeral token from Gemini. By default, you'll have 1 minute to start new Live API sessions using the token from this request ( newSessionExpireTime ), and 30 minutes to send messages over that connection ( expireTime ). Python import datetime now = datetime . datetime . now ( tz = datetime . timezone . utc ) client = genai . Client ( http_options = { 'api_version' : 'v1alpha' ,} ) token = client . auth_tokens . create ( config = { 'uses' : 1 , # The ephemeral token can only be used to start a single session 'expire_time' : now + datetime . timedelta ( minutes = 30 ), # Default is 30 minutes in the future # 'expire_time': '2025-05-17T00:00:00Z', # Accepts isoformat. 'new_session_expire_time' : now + datetime . timedelta ( minutes = 1 ), # Default 1 minute in the future 'http_options' : { 'api_version' : 'v1alpha' }, } ) # You'll need to pass the value under token.name back to your client to use it JavaScript import { GoogleGenAI } from "@google/genai" ; const client = new GoogleGenAI ({}); const expireTime = new Date ( Date . now () + 30 * 60 * 1000 ). toISOString (); const token : AuthToken = await client . authTokens . create ({ config : { uses : 1 , // The default expireTime : expireTime // Default is 30 mins newSessionExpireTime : new Date ( Date . now () + ( 1 * 60 * 1000 )), // Default 1 minute in the future httpOptions : { apiVersion : 'v1alpha' }, }, }); For expireTime value constraints, defaults, and other field specs, see the API reference . Within the expireTime timeframe, you'll need sessionResumption to reconnect the call every 10 minutes (this can be done with the same token even if uses: 1 ). 
It's also possible to lock an ephemeral token to a set of configurations. This might be useful to further improve security of your application and keep your system instructions on the server side. Python client = genai . Client ( http_options = { 'api_version' : 'v1alpha' ,} ) token = client . auth_tokens . create ( config = { 'uses' : 1 , 'live_connect_constraints' : { 'model' : 'gemini-2.0-flash-live-001' , 'config' : { 'session_resumption' :{}, 'temperature' : 0.7 , 'response_modalities' :[ 'TEXT' ] } }, 'http_options' : { 'api_version' : 'v1alpha' }, } ) # You'll need to pass the value under token.name back to your client to use it JavaScript import { GoogleGenAI } from "@google/genai" ; const client = new GoogleGenAI ({}); const expireTime = new Date ( Date . now () + 30 * 60 * 1000 ). toISOString (); const token = await client . authTokens . create ({ config : { uses : 1 , // The default expireTime : expireTime , liveConnectConstraints : { model : 'gemini-2.0-flash-live-001' , config : { sessionResumption : {}, temperature : 0.7 , responseModalities : [ 'TEXT' ] } }, httpOptions : { apiVersion : 'v1alpha' } } }); // You'll need to pass the value under token.name back to your client to use it You can also lock a subset of fields, see the SDK documentation for more info. Connect to Live API with an ephemeral token Once you have an ephemeral token, you use it as if it were an API key (but remember, it only works for the live API, and only with the v1alpha version of the API). Note that use of ephemeral tokens only adds value when deploying applications that follow client-to-server implementation approach. JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; // Use the token generated in the "Create an ephemeral token" section here const ai = new GoogleGenAI ({ apiKey : token . name }); const model = 'gemini-2.0-flash-live-001' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : { ... }, }); // Send content... session . close (); } main (); Note: If not using the SDK, note that ephemeral tokens must either be passed in an access_token query parameter, or in an HTTP Authorization prefixed by the auth-scheme Token . See Get started with Live API for more examples. Best practices Set a short expiration duration using the expire_time parameter. Tokens expire, requiring re-initiation of the provisioning process. Verify secure authentication for your own backend. Ephemeral tokens will only be as secure as your backend authentication method. Generally, avoid using ephemeral tokens for backend-to-Gemini connections, as this path is typically considered secure. Limitations Ephemeral tokens are only compatible with Live API at this time. What's next Read the Live API reference on ephemeral tokens for more information. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC.
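To tie the client-to-server flow together, here is a minimal sketch of a backend endpoint that mints an ephemeral token and returns token.name to an already-authenticated client. Flask, the route name, and the placeholder auth step are illustrative assumptions; the token configuration mirrors the create example above.

Python

import datetime

from flask import Flask, jsonify
from google import genai

app = Flask(__name__)
client = genai.Client(http_options={'api_version': 'v1alpha'})

@app.route("/live-token", methods=["POST"])
def live_token():
    # In a real application, authenticate the calling user before minting a token.
    now = datetime.datetime.now(tz=datetime.timezone.utc)
    token = client.auth_tokens.create(
        config={
            'uses': 1,  # the token can start a single Live API session
            'expire_time': now + datetime.timedelta(minutes=30),
            'new_session_expire_time': now + datetime.timedelta(minutes=1),
            'http_options': {'api_version': 'v1alpha'},
        }
    )
    # The client uses token.name in place of an API key for its WebSocket connection.
    return jsonify({"token": token.name})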
|
text_content/docs_function-calling_065bfaa3.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/function-calling#main-content
|
2 |
+
Title: Function calling with the Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. 
Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . 
generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. 
Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... 
Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" : "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . 
OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. 
""" return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. 
The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ "temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . 
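If you want the same explicit control in Python (for example, to log or validate each call before executing it), a minimal sketch of an equivalent manual loop might look like the following. It is not part of the official samples: it assumes the get_weather_forecast and set_thermostat_temperature implementations from the Python example above, disables automatic function calling, and mirrors the JavaScript loop.

from google import genai
from google.genai import types

client = genai.Client()

tool_functions = {
    "get_weather_forecast": get_weather_forecast,
    "set_thermostat_temperature": set_thermostat_temperature,
}

# Pass the callables so the SDK builds the declarations, but disable
# automatic execution so each call can be handled manually.
config = types.GenerateContentConfig(
    tools=list(tool_functions.values()),
    automatic_function_calling=types.AutomaticFunctionCallingConfig(disable=True),
)

contents = [
    types.Content(
        role="user",
        parts=[types.Part(text="If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C.")],
    )
]

# Loop until the model stops requesting function calls.
while True:
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=contents,
        config=config,
    )
    if not response.function_calls:
        print(response.text)
        break

    # Execute the first requested call (as in the JavaScript loop above).
    call = response.function_calls[0]
    result = tool_functions[call.name](**call.args)

    # Append the model's function call turn, then the function response,
    # so the next request carries the full history.
    contents.append(response.candidates[0].content)
    contents.append(
        types.Content(
            role="user",
            parts=[types.Part.from_function_response(name=call.name, response={"result": result})],
        )
    )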
Compositional function calling is a native Live API feature. This means the Live API can handle function calling in a similar way to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write and run some Python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write and run some Python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , { tools : tools , modality : "AUDIO" }); Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and is recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python functions to declarations and handles the function call execution and response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Calls the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response.
To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config , ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. ] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. 
]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . 
import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" ] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. 
Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, or breaking complex tasks down into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC.
text_content/docs_function-calling_8437d0a9.txt
ADDED
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/function-calling
+Title: Function calling with the Gemini API | Google AI for Developers
+==================================================
+
+
Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. 
Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . 
generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. 
Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... 
Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" : "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . 
OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. 
""" return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. 
The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ "temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . 
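Before moving on to the Live API, note that the parallel function calling example earlier on this page (Example 1) stops after printing the requested calls. As a rough, unofficial sketch of the return trip, assuming the chat, house_tools, and response objects from Example 1 and the power_disco_ball_impl, start_music_impl, and dim_lights_impl functions from Example 2, you could execute each call and send all results back in one message, in the same order they were requested. Because Example 1 forces mode 'ANY', the follow-up below passes a config without that constraint so the model can reply with text:

from google.genai import types

impls = {
    "power_disco_ball": power_disco_ball_impl,
    "start_music": start_music_impl,
    "dim_lights": dim_lights_impl,
}

# One function response part per requested call, preserving the order
# in which the calls were requested.
response_parts = [
    types.Part.from_function_response(
        name=fn.name,
        response={"result": impls[fn.name](**fn.args)},
    )
    for fn in response.function_calls
]

# Send the results back; this config keeps the tools but drops the
# forced 'ANY' mode so the model can summarize the outcome as text.
final_response = chat.send_message(
    response_parts,
    config=types.GenerateContentConfig(tools=house_tools),
)
print(final_response.text)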
Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. 
To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config , ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. ] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. 
]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . 
import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" ] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. 
Return informative error messages that the model can use to generate helpful responses to the user (see the sketch at the end of this page). Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions, shortening their descriptions, or breaking down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC.
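To illustrate the validation and error-handling best practices above, here is a minimal sketch of a tool that checks its arguments and returns an informative error payload the model can relay to the user. The place_order function, its schema, and the prompt are hypothetical and used only for illustration; they are not part of the official documentation.

Python

from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical, consequential tool used only for illustration.
place_order_declaration = {
    "name": "place_order",
    "description": "Places an order for a product.",
    "parameters": {
        "type": "object",
        "properties": {
            "product_id": {"type": "string", "description": "Catalog ID of the product."},
            "quantity": {"type": "integer", "description": "Number of units to order."},
        },
        "required": ["product_id", "quantity"],
    },
}

def place_order(product_id: str, quantity: int) -> dict:
    # Validate inputs and fail gracefully with an informative message.
    if quantity <= 0:
        return {"error": f"Invalid quantity {quantity}: it must be a positive integer."}
    # ... call your real ordering backend here ...
    return {"status": "confirmed", "product_id": product_id, "quantity": quantity}

tools = types.Tool(function_declarations=[place_order_declaration])
config = types.GenerateContentConfig(tools=[tools], temperature=0)

contents = [
    types.Content(role="user", parts=[types.Part(text="Order -3 units of SKU-42.")])
]
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)

function_call = response.candidates[0].content.parts[0].function_call
if function_call:
    result = place_order(**function_call.args)  # may be an error payload
    # Return the result (or the informative error) to the model so it can
    # explain the problem to the user instead of failing opaquely.
    contents.append(response.candidates[0].content)
    contents.append(
        types.Content(
            role="user",
            parts=[
                types.Part.from_function_response(
                    name=function_call.name, response={"result": result}
                )
            ],
        )
    )
    follow_up = client.models.generate_content(
        model="gemini-2.5-flash", contents=contents, config=config
    )
    print(follow_up.text)

Because the error is returned as an ordinary function response, the model can turn it into a helpful reply, for example asking the user for a valid quantity, rather than the interaction ending with an unexplained failure.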
|
text_content/docs_image-understanding_09b055f7.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/image-understanding#segmentation
|
2 |
+
Title: Image understanding | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. 
Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . 
text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . 
read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom instructions, such as: "Show bounding boxes of all green objects in this image". 
It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . 
size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
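Following on from the token calculation rules above: if you need the exact token cost of a particular image plus prompt before sending a full request, you can ask the API to count tokens for you. This is a minimal sketch; the file path is a placeholder.

Python

from google import genai
from google.genai import types

client = genai.Client()

# Read a local image as inline bytes (placeholder path).
with open("path/to/sample.jpg", "rb") as f:
    image_bytes = f.read()

image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")

# count_tokens reports the prompt's token cost, including the tiled image,
# without generating any output.
token_info = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents=[image_part, "Caption this image."],
)
print(token_info.total_tokens)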
|
text_content/docs_libraries_9ab83d4c.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/libraries#main-content
|
2 |
+
Title: Gemini API libraries | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: <dependencies> <dependency> <groupId>com.google.genai</groupId> <artifactId>google-genai</artifactId> <version>1.0.0</version> </dependency> </dependencies> General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. 
google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC.
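To complement the installation instructions above, here is a minimal sketch of a first request with the Python library. It assumes the GEMINI_API_KEY environment variable is set, which the client picks up by default.

Python

from google import genai

# The client reads the GEMINI_API_KEY environment variable by default.
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words.",
)
print(response.text)

The JavaScript, Go, and Java libraries follow the same client-and-models pattern in their respective idioms.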
|
text_content/docs_live-guide_0a3a44d0.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/live-guide#native-audio-output
|
2 |
+
Title: Live API capabilities guide | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . 
TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . 
write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . 
send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . 
send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . 
shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list ). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . 
AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . 
close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . 
turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . 
sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. Context window A session has a context window limit of: 128k tokens for native audio output models 32k tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. 
Language                      BCP-47 Code
German (Germany)              de-DE
English (Australia)*          en-AU
English (UK)*                 en-GB
English (India)               en-IN
English (US)                  en-US
Spanish (US)                  es-US
French (France)               fr-FR
Hindi (India)                 hi-IN
Portuguese (Brazil)           pt-BR
Arabic (Generic)              ar-XA
Spanish (Spain)*              es-ES
French (Canada)*              fr-CA
Indonesian (Indonesia)        id-ID
Italian (Italy)               it-IT
Japanese (Japan)              ja-JP
Turkish (Turkey)              tr-TR
Vietnamese (Vietnam)          vi-VN
Bengali (India)               bn-IN
Gujarati (India)*             gu-IN
Kannada (India)*              kn-IN
Marathi (India)               mr-IN
Malayalam (India)*            ml-IN
Tamil (India)                 ta-IN
Telugu (India)                te-IN
Dutch (Netherlands)           nl-NL
Korean (South Korea)          ko-KR
Mandarin Chinese (China)*     cmn-CN
Polish (Poland)               pl-PL
Russian (Russia)              ru-RU
Thai (Thailand)               th-TH
Languages marked with an asterisk (*) are not available for Native audio. What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio. For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook, the Live API Tools cookbook, and the Live API Get Started script. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
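To tie together several of the options covered above (voice selection, language, tuned automatic VAD, output transcription, and token counting), here is a minimal Python sketch. It is an illustrative combination assembled from the fragments in this guide, not an official sample: it assumes the half-cascade model shown above and a local 16 kHz, 16-bit mono PCM file named sample.pcm; adjust both for your environment.

import asyncio
from pathlib import Path
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

config = {
    "response_modalities": ["AUDIO"],
    "speech_config": {
        "voice_config": {"prebuilt_voice_config": {"voice_name": "Kore"}},
        "language_code": "de-DE",
    },
    "output_audio_transcription": {},
    "realtime_input_config": {
        "automatic_activity_detection": {
            "disabled": False,  # default; shown here for clarity
            "start_of_speech_sensitivity": types.StartSensitivity.START_SENSITIVITY_LOW,
            "end_of_speech_sensitivity": types.EndSensitivity.END_SENSITIVITY_LOW,
            "silence_duration_ms": 100,
        }
    },
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        audio_bytes = Path("sample.pcm").read_bytes()
        await session.send_realtime_input(
            audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
        )
        async for response in session.receive():
            # Audio bytes arrive in response.data; this sketch only prints the
            # transcription of the model's audio output.
            if response.server_content and response.server_content.output_transcription:
                print("Transcript:", response.server_content.output_transcription.text)
            # The server periodically includes UsageMetadata in its messages.
            if response.usage_metadata:
                print("Total tokens:", response.usage_metadata.total_token_count)

if __name__ == "__main__":
    asyncio.run(main())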
|
text_content/docs_live-guide_a0b9dda4.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/live-guide
|
2 |
+
Title: Live API capabilities guide | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . 
TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . 
write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . 
send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . 
send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . 
shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list ). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . 
AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . 
close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . 
turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . 
sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. Context window A session has a context window limit of: 128k tokens for native audio output models 32k tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. 
Language                      BCP-47 Code
German (Germany)              de-DE
English (Australia)*          en-AU
English (UK)*                 en-GB
English (India)               en-IN
English (US)                  en-US
Spanish (US)                  es-US
French (France)               fr-FR
Hindi (India)                 hi-IN
Portuguese (Brazil)           pt-BR
Arabic (Generic)              ar-XA
Spanish (Spain)*              es-ES
French (Canada)*              fr-CA
Indonesian (Indonesia)        id-ID
Italian (Italy)               it-IT
Japanese (Japan)              ja-JP
Turkish (Turkey)              tr-TR
Vietnamese (Vietnam)          vi-VN
Bengali (India)               bn-IN
Gujarati (India)*             gu-IN
Kannada (India)*              kn-IN
Marathi (India)               mr-IN
Malayalam (India)*            ml-IN
Tamil (India)                 ta-IN
Telugu (India)                te-IN
Dutch (Netherlands)           nl-NL
Korean (South Korea)          ko-KR
Mandarin Chinese (China)*     cmn-CN
Polish (Poland)               pl-PL
Russian (Russia)              ru-RU
Thai (Thailand)               th-TH
Languages marked with an asterisk (*) are not available for Native audio. What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio. For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook, the Live API Tools cookbook, and the Live API Get Started script. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
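As a complement to the fragments in the "Disable automatic VAD" section above, the following is a small end-to-end Python sketch (an illustrative assembly, not an official sample) in which the client marks speech activity itself: automatic VAD is disabled, an audio chunk is bracketed with ActivityStart and ActivityEnd, and the text reply is printed. It assumes a local 16 kHz, 16-bit mono PCM file named sample.pcm.

import asyncio
from pathlib import Path
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

# Manual activity detection: the client, not the server, decides when speech starts and ends.
config = {
    "response_modalities": ["TEXT"],
    "realtime_input_config": {"automatic_activity_detection": {"disabled": True}},
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        audio_bytes = Path("sample.pcm").read_bytes()

        # With automatic VAD off, explicitly bracket the audio with activity markers.
        await session.send_realtime_input(activity_start=types.ActivityStart())
        await session.send_realtime_input(
            audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
        )
        await session.send_realtime_input(activity_end=types.ActivityEnd())

        async for response in session.receive():
            if response.text is not None:
                print(response.text, end="")

if __name__ == "__main__":
    asyncio.run(main())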
|
text_content/docs_live-session_02e3a506.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/live-session#main-content
|
2 |
+
Title: Session management with Live API | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Session management with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Session management with Live API In the Live API, a session refers to a persistent connection where input and output are streamed continuously over the same connection (read more about how it works ). This unique session design enables low latency and supports unique features, but can also introduce challenges, like session time limits, and early termination. This guide covers strategies for overcoming the session management challenges that can arise when using the Live API. Session lifetime Without compression, audio-only sessions are limited to 15 minutes, and audio-video sessions are limited to 2 minutes. Exceeding these limits will terminate the session (and therefore, the connection), but you can use context window compression to extend sessions to an unlimited amount of time. The lifetime of a connection is limited as well, to around 10 minutes. When the connection terminates, the session terminates as well. In this case, you can configure a single session to stay active over multiple connections using session resumption . You'll also receive a GoAway message before the connection ends, allowing you to take further actions. Context window compression To enable longer sessions, and avoid abrupt connection termination, you can enable context window compression by setting the contextWindowCompression field as part of the session configuration. In the ContextWindowCompressionConfig , you can configure a sliding-window mechanism and the number of tokens that triggers compression. Python from google.genai import types config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], context_window_compression = ( # Configures compression with default parameters. types . ContextWindowCompressionConfig ( sliding_window = types . SlidingWindow (), ) ), ) JavaScript const config = { responseModalities : [ Modality . AUDIO ], contextWindowCompression : { slidingWindow : {} } }; Session resumption To prevent session termination when the server periodically resets the WebSocket connection, configure the sessionResumption field within the setup configuration . Passing this configuration causes the server to send SessionResumptionUpdate messages, which can be used to resume the session by passing the last resumption token as the SessionResumptionConfig.handle of the subsequent connection. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" async def main (): print ( f "Connecting to the service with handle { previous_session_handle } ..." ) async with client . aio . live . connect ( model = model , config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], session_resumption = types . SessionResumptionConfig ( # The handle of the session to resume is passed here, # or else None to start a new session. handle = previous_session_handle ), ), ) as session : while True : await session . send_client_content ( turns = types . Content ( role = "user" , parts = [ types . Part ( text = "Hello world!" )] ) ) async for message in session . 
receive (): # Periodically, the server will send update messages that may # contain a handle for the current state of the session. if message . session_resumption_update : update = message . session_resumption_update if update . resumable and update . new_handle : # The handle should be retained and linked to the session. return update . new_handle # For the purposes of this example, placeholder input is continually fed # to the model. In non-sample code, the model inputs would come from # the user. if message . server_content and message . server_content . turn_complete : break if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } console . debug ( 'Connecting to the service with handle %s...' , previousSessionHandle ) const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : { responseModalities : [ Modality . TEXT ], sessionResumption : { handle : previousSessionHandle } // The handle of the session to resume is passed here, or else null to start a new session. } }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . sessionResumptionUpdate ) { if ( turn . sessionResumptionUpdate . resumable && turn . sessionResumptionUpdate . newHandle ) { let newHandle = turn . sessionResumptionUpdate . newHandle // ...Store newHandle and start new session with this handle here } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Receiving a message before the session disconnects The server sends a GoAway message that signals that the current connection will soon be terminated. This message includes the timeLeft , indicating the remaining time and lets you take further action before the connection will be terminated as ABORTED. Python async for response in session . receive (): if response . go_away is not None : # The connection will soon be terminated print ( response . go_away . time_left ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . goAway ) { console . debug ( 'Time left: %s\n' , turn . goAway . timeLeft ); } } Receiving a message when the generation is complete The server sends a generationComplete message that signals that the model finished generating the response. Python async for response in session . receive (): if response . server_content . 
generation_complete is True : # The generation is complete JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . generationComplete ) { // The generation is complete } } What's next Explore more ways to work with the Live API in the full Capabilities guide, the Tool use page, or the Live API cookbook . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
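The compression, resumption, and GoAway examples above are shown in isolation; the sketch below combines them in one Python configuration and captures the newest resumption handle while watching for the disconnect warning. It is a sketch under assumptions: previous_handle is a placeholder for a handle stored from an earlier connection (use None to start fresh), and the actual reconnect logic is left to the application.

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

previous_handle = None  # Placeholder: a handle saved from an earlier connection, or None.

config = types.LiveConnectConfig(
    response_modalities=["TEXT"],
    # Sliding-window compression keeps long sessions under the context window limit.
    context_window_compression=types.ContextWindowCompressionConfig(
        sliding_window=types.SlidingWindow(),
    ),
    # Ask the server to send SessionResumptionUpdate messages with handles.
    session_resumption=types.SessionResumptionConfig(handle=previous_handle),
)

async def main():
    latest_handle = previous_handle
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello, how are you?"}]},
            turn_complete=True,
        )
        async for message in session.receive():
            # Keep the newest handle so the session can be resumed on the next connection.
            if message.session_resumption_update:
                update = message.session_resumption_update
                if update.resumable and update.new_handle:
                    latest_handle = update.new_handle
            # The server warns before it terminates the connection.
            if message.go_away is not None:
                print("Connection closing in:", message.go_away.time_left)
            if message.text is not None:
                print(message.text, end="")
            if message.server_content and message.server_content.turn_complete:
                break
    print("Store this handle for the next connection:", latest_handle)

if __name__ == "__main__":
    asyncio.run(main())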
|
text_content/docs_live-session_ebf143cf.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/live-session#context-window-compression
|
2 |
+
Title: Session management with Live API | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Session management with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Session management with Live API In the Live API, a session refers to a persistent connection where input and output are streamed continuously over the same connection (read more about how it works ). This unique session design enables low latency and supports unique features, but can also introduce challenges, like session time limits, and early termination. This guide covers strategies for overcoming the session management challenges that can arise when using the Live API. Session lifetime Without compression, audio-only sessions are limited to 15 minutes, and audio-video sessions are limited to 2 minutes. Exceeding these limits will terminate the session (and therefore, the connection), but you can use context window compression to extend sessions to an unlimited amount of time. The lifetime of a connection is limited as well, to around 10 minutes. When the connection terminates, the session terminates as well. In this case, you can configure a single session to stay active over multiple connections using session resumption . You'll also receive a GoAway message before the connection ends, allowing you to take further actions. Context window compression To enable longer sessions, and avoid abrupt connection termination, you can enable context window compression by setting the contextWindowCompression field as part of the session configuration. In the ContextWindowCompressionConfig , you can configure a sliding-window mechanism and the number of tokens that triggers compression. Python from google.genai import types config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], context_window_compression = ( # Configures compression with default parameters. types . ContextWindowCompressionConfig ( sliding_window = types . SlidingWindow (), ) ), ) JavaScript const config = { responseModalities : [ Modality . AUDIO ], contextWindowCompression : { slidingWindow : {} } }; Session resumption To prevent session termination when the server periodically resets the WebSocket connection, configure the sessionResumption field within the setup configuration . Passing this configuration causes the server to send SessionResumptionUpdate messages, which can be used to resume the session by passing the last resumption token as the SessionResumptionConfig.handle of the subsequent connection. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" async def main (): print ( f "Connecting to the service with handle { previous_session_handle } ..." ) async with client . aio . live . connect ( model = model , config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], session_resumption = types . SessionResumptionConfig ( # The handle of the session to resume is passed here, # or else None to start a new session. handle = previous_session_handle ), ), ) as session : while True : await session . send_client_content ( turns = types . Content ( role = "user" , parts = [ types . Part ( text = "Hello world!" )] ) ) async for message in session . 
receive (): # Periodically, the server will send update messages that may # contain a handle for the current state of the session. if message . session_resumption_update : update = message . session_resumption_update if update . resumable and update . new_handle : # The handle should be retained and linked to the session. return update . new_handle # For the purposes of this example, placeholder input is continually fed # to the model. In non-sample code, the model inputs would come from # the user. if message . server_content and message . server_content . turn_complete : break if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } console . debug ( 'Connecting to the service with handle %s...' , previousSessionHandle ) const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : { responseModalities : [ Modality . TEXT ], sessionResumption : { handle : previousSessionHandle } // The handle of the session to resume is passed here, or else null to start a new session. } }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . sessionResumptionUpdate ) { if ( turn . sessionResumptionUpdate . resumable && turn . sessionResumptionUpdate . newHandle ) { let newHandle = turn . sessionResumptionUpdate . newHandle // ...Store newHandle and start new session with this handle here } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Receiving a message before the session disconnects The server sends a GoAway message that signals that the current connection will soon be terminated. This message includes the timeLeft , indicating the remaining time and lets you take further action before the connection will be terminated as ABORTED. Python async for response in session . receive (): if response . go_away is not None : # The connection will soon be terminated print ( response . go_away . time_left ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . goAway ) { console . debug ( 'Time left: %s\n' , turn . goAway . timeLeft ); } } Receiving a message when the generation is complete The server sends a generationComplete message that signals that the model finished generating the response. Python async for response in session . receive (): if response . server_content . 
generation_complete is True : # The generation is complete JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . generationComplete ) { // The generation is complete } } What's next Explore more ways to work with the Live API in the full Capabilities guide, the Tool use page, or the Live API cookbook . Last updated 2025-07-08 UTC.
|
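The session-resumption and GoAway examples above fit naturally into a single reconnect loop. The Python sketch below is illustrative only: it reuses the google-genai calls shown on this page (connect, send_client_content, receive, session_resumption_update, go_away, context window compression), while the run_session helper and the three-connection loop are hypothetical scaffolding, not part of the documented API.

# Illustrative sketch: one logical session kept alive across several
# connections by persisting the newest resumption handle. run_session and
# the reconnect loop are hypothetical scaffolding around the documented calls.
import asyncio
from google import genai
from google.genai import types

client = genai.Client()
MODEL = "gemini-live-2.5-flash-preview"

async def run_session(handle):
    """Run one connection and return the newest resumption handle seen."""
    config = types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        session_resumption=types.SessionResumptionConfig(handle=handle),
        context_window_compression=types.ContextWindowCompressionConfig(
            sliding_window=types.SlidingWindow(),  # extends session lifetime
        ),
    )
    async with client.aio.live.connect(model=MODEL, config=config) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello world!")])
        )
        async for message in session.receive():
            if message.session_resumption_update:
                update = message.session_resumption_update
                if update.resumable and update.new_handle:
                    handle = update.new_handle  # keep the latest handle
            if message.go_away is not None:
                # Connection will soon be terminated; reconnect with the handle.
                print(f"Reconnecting, time left: {message.go_away.time_left}")
                return handle
            if message.server_content and message.server_content.turn_complete:
                break
    return handle

async def main():
    handle = None  # None starts a new session
    for _ in range(3):  # three connections, one logical session
        handle = await run_session(handle)

if __name__ == "__main__":
    asyncio.run(main())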
text_content/docs_live_1b6bdc03.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/live
|
2 |
+
Title: Get started with Live API | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Get started with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Get started with Live API Preview: The Live API is in preview. The Live API enables low-latency, real-time voice and video interactions with Gemini. It processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. Live API offers a comprehensive set of features such as Voice Activity Detection , tool use and function calling , session management (for managing long running conversations) and ephemeral tokens (for secure client-sided authentication). This page gets you up and running with examples and basic code samples. Example applications Check out the following example applications that illustrate how to use Live API for end-to-end use cases: Live audio starter app on AI Studio, using JavaScript libraries to connect to Live API and stream bidirectional audio through your microphone and speakers. Live API Python cookbook using Pyaudio that connects to Live API. Partner integrations If you prefer a simpler development process, you can use Daily or LiveKit . These are third-party partner platforms that have already integrated the Gemini Live API over the WebRTC protocol to streamline the development of real-time audio and video applications. Before you begin building There are two important decisions to make before you begin building with the Live API: choosing a model and choosing an implementation approach. Choose an audio generation architecture If you're building an audio-based use case, your choice of model determines the audio generation architecture used to create the audio response: Native audio : This option provides the most natural and realistic-sounding speech and better multilingual performance. It also enables advanced features like affective (emotion-aware) dialogue , proactive audio (where the model can decide to ignore or respond to certain inputs), and "thinking" . Native audio is supported by the following native audio models : gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Half-cascade audio : This option uses a cascaded model architecture (native audio input and text-to-speech output). It offers better performance and reliability in production environments, especially with tool use . Half-cascaded audio is supported by the following models: gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 Choose an implementation approach When integrating with Live API, you'll need to choose one of the following implementation approaches: Server-to-server : Your backend connects to the Live API using WebSockets . Typically, your client sends stream data (audio, video, text) to your server, which then forwards it to the Live API. Client-to-server : Your frontend code connects directly to the Live API using WebSockets to stream data, bypassing your backend. Note: Client-to-server generally offers better performance for streaming audio and video, since it bypasses the need to send the stream to your backend first. 
It's also easier to set up since you don't need to implement a proxy that sends data from your client to your server and then your server to the API. However, for production environments, in order to mitigate security risks, we recommend using ephemeral tokens instead of standard API keys. Get started This example reads a WAV file , sends it in the correct format, and saves the received data as WAV file. You can send audio by converting it to 16-bit PCM, 16kHz, mono format, and you can receive audio by setting AUDIO as response modality. The output uses a sample rate of 24kHz. Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path import wave from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () # Half cascade model: # model = "gemini-live-2.5-flash-preview" # Native audio output model: model = "gemini-2.5-flash-preview-native-audio-dialog" config = { "response_modalities" : [ "AUDIO" ], "system_instruction" : "You are a helpful assistant and answer in a friendly tone." , } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) # Output is 24kHz async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; // npm install wavefile const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); // WARNING: Do not use API keys in client-side (browser based) applications // Consider using Ephemeral Tokens instead // More information at: https://ai.google.dev/gemini-api/docs/ephemeral-tokens // Half cascade model: // const model = "gemini-live-2.5-flash-preview" // Native audio output model: const model = "gemini-2.5-flash-preview-native-audio-dialog" const config = { responseModalities : [ Modality . AUDIO ], systemInstruction : "You are a helpful assistant and answer in a friendly tone." }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . 
turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); // output is 24kHz fs . writeFileSync ( 'audio.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); What's next Read the full Live API Capabilities guide for key capabilities and configurations; including Voice Activity Detection and native audio features. Read the Tool use guide to learn how to integrate Live API with tools and function calling. Read the Session management guide for managing long running conversations. Read the Ephemeral tokens guide for secure authentication in client-to-server applications. For more information about the underlying WebSockets API, see the WebSockets API reference . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
|
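For a text-only first test before wiring up audio, the quickstart above can be reduced to the following Python sketch. It is a minimal, non-authoritative variant: it reuses the half-cascade model and the connect/send_client_content/receive calls shown on this page, and the way text parts are read from server_content.model_turn follows the commented-out snippet in the audio example, so treat the field access as illustrative.

# Minimal text-in / text-out sketch, assuming the same google-genai SDK and
# half-cascade model named above; field access mirrors the page's examples.
import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello, how are you?")])
        )
        async for response in session.receive():
            server_content = response.server_content
            if server_content and server_content.model_turn:
                for part in server_content.model_turn.parts:
                    if part.text:
                        print(part.text, end="")
            if server_content and server_content.turn_complete:
                break

if __name__ == "__main__":
    asyncio.run(main())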
text_content/docs_migrate-to-cloud_8062fb32.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/migrate-to-cloud
|
2 |
+
Title: Gemini Developer API vs. Vertex AI | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini Developer API v.s. Vertex AI | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini Developer API v.s. Vertex AI When developing generative AI solutions with Gemini, Google offers two API products: the Gemini Developer API and the Vertex AI Gemini API . The Gemini Developer API provides the fastest path to build, productionize, and scale Gemini powered applications. Most developers should use the Gemini Developer API unless there is a need for specific enterprise controls. Vertex AI offers a comprehensive ecosystem of enterprise ready features and services for building and deploying generative AI applications backed by the Google Cloud Platform. We've recently simplified migrating between these services. Both the Gemini Developer API and the Vertex AI Gemini API are now accessible through the unified Google Gen AI SDK . Code comparison This page has side-by-side code comparisons between Gemini Developer API and Vertex AI quickstarts for text generation. Python You can access both the Gemini Developer API and Vertex AI services through the google-genai library. See the libraries page for instructions on how to install google-genai . Gemini Developer API from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) Vertex AI Gemini API from google import genai client = genai . Client ( vertexai = True , project = 'your-project-id' , location = 'us-central1' ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) JavaScript and TypeScript You can access both Gemini Developer API and Vertex AI services through @google/genai library. See libraries page for instructions on how to install @google/genai . Gemini Developer API import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Vertex AI Gemini API import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ vertexai : true , project : 'your_project' , location : 'your_location' , }); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Go You can access both Gemini Developer API and Vertex AI services through google.golang.org/genai library. See libraries page for instructions on how to install google.golang.org/genai . Gemini Developer API import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) // Your Google API key const apiKey = "your-api-key" func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Call the GenerateContent method. result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York?" 
), nil ) } Vertex AI Gemini API import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) // Your GCP project const project = "your-project" // A GCP location like "us-central1" const location = "some-gcp-location" func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , & genai . ClientConfig { Project : project , Location : location , Backend : genai . BackendVertexAI , }) // Call the GenerateContent method. result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York?" ), nil ) } Other use cases and platforms Refer to use case specific guides on Gemini Developer API Documentation and Vertex AI documentation for other platforms and use cases. Migration considerations When you migrate: You'll need to use Google Cloud service accounts to authenticate. See the Vertex AI documentation for more information. You can use your existing Google Cloud project (the same one you used to generate your API key) or you can create a new Google Cloud project . Supported regions may differ between the Gemini Developer API and the Vertex AI Gemini API. See the list of supported regions for generative AI on Google Cloud . Any models you created in Google AI Studio need to be retrained in Vertex AI. If you no longer need to use your Gemini API key for the Gemini Developer API, then follow security best practices and delete it. To delete an API key: Open the Google Cloud API Credentials page. Find the API key you want to delete and click the Actions icon. Select Delete API key . In the Delete credential modal, select Delete . Deleting an API key takes a few minutes to propagate. After propagation completes, any traffic using the deleted API key is rejected. Important: If you have deleted a key that is still used in production and need to recover it, see gcloud beta services api-keys undelete . Next steps See the Generative AI on Vertex AI overview to learn more about generative AI solutions on Vertex AI. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-26 UTC.
|
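Because both backends are reachable through the same google-genai Client, the switch can be isolated in one small factory so the calling code stays identical. The sketch below is only illustrative: the USE_VERTEX, PROJECT_ID, and LOCATION environment variable names are hypothetical, while the Client arguments come from the side-by-side comparison above.

# Hypothetical helper: choose the backend from the environment so the rest
# of the application never changes. USE_VERTEX / PROJECT_ID / LOCATION are
# made-up variable names for this sketch, not part of the SDK.
import os
from google import genai

def make_client() -> genai.Client:
    if os.environ.get("USE_VERTEX") == "1":
        return genai.Client(
            vertexai=True,
            project=os.environ["PROJECT_ID"],
            location=os.environ.get("LOCATION", "us-central1"),
        )
    return genai.Client()  # Gemini Developer API; key read from the environment

client = make_client()
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)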
text_content/docs_migrate_e2b2e25c.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/migrate
|
2 |
+
Title: Migrate to the Google GenAI SDK | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Migrate to the Google GenAI SDK | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Migrate to the Google GenAI SDK Starting with the Gemini 2.0 release in late 2024, we introduced a new set of libraries called the Google GenAI SDK . It offers an improved developer experience through an updated client architecture , and simplifies the transition between developer and enterprise workflows. The Google GenAI SDK is now in General Availability (GA) across all supported platforms. If you're using one of our legacy libraries , we strongly recommend you to migrate. This guide provides before-and-after examples of migrated code to help you get started. Note: The Go examples omit imports and other boilerplate code to improve readability. Installation Before Python pip install -U -q "google-generativeai" JavaScript npm install @google/generative-ai Go go get github.com/google/generative-ai-go After Python pip install -U -q "google-genai" JavaScript npm install @google/genai Go go get google.golang.org/genai API access The old SDK implicitly handled the API client behind the scenes using a variety of ad hoc methods. This made it hard to manage the client and credentials. Now, you interact through a central Client object. This Client object acts as a single entry point for various API services (e.g., models , chats , files , tunings ), promoting consistency and simplifying credential and configuration management across different API calls. Before (Less Centralized API Access) Python The old SDK didn't explicitly use a top-level client object for most API calls. You would directly instantiate and interact with GenerativeModel objects. import google.generativeai as genai # Directly create and use model objects model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( ... ) chat = model . start_chat ( ... ) JavaScript While GoogleGenerativeAI was a central point for models and chat, other functionalities like file and cache management often required importing and instantiating entirely separate client classes. import { GoogleGenerativeAI } from "@google/generative-ai" ; import { GoogleAIFileManager , GoogleAICacheManager } from "@google/generative-ai/server" ; // For files/caching const genAI = new GoogleGenerativeAI ( "YOUR_API_KEY" ); const fileManager = new GoogleAIFileManager ( "YOUR_API_KEY" ); const cacheManager = new GoogleAICacheManager ( "YOUR_API_KEY" ); // Get a model instance, then call methods on it const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const result = await model . generateContent (...); const chat = model . startChat (...); // Call methods on separate client objects for other services const uploadedFile = await fileManager . uploadFile (...); const cache = await cacheManager . create (...); Go The genai.NewClient function created a client, but generative model operations were typically called on a separate GenerativeModel instance obtained from this client. Other services might have been accessed via distinct packages or patterns. 
import ( "github.com/google/generative-ai-go/genai" "github.com/google/generative-ai-go/genai/fileman" // For files "google.golang.org/api/option" ) client , err := genai . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) fileClient , err := fileman . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) // Get a model instance, then call methods on it model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ... ) cs := model . StartChat () // Call methods on separate client objects for other services uploadedFile , err := fileClient . UploadFile ( ... ) After (Centralized Client Object) Python from google import genai # Create a single client object client = genai . Client () # Access API methods through services on the client object response = client . models . generate_content ( ... ) chat = client . chats . create ( ... ) my_file = client . files . upload ( ... ) tuning_job = client . tunings . tune ( ... ) JavaScript import { GoogleGenAI } from "@google/genai" ; // Create a single client object const ai = new GoogleGenAI ({ apiKey : "YOUR_API_KEY" }); // Access API methods through services on the client object const response = await ai . models . generateContent (...); const chat = ai . chats . create (...); const uploadedFile = await ai . files . upload (...); const cache = await ai . caches . create (...); Go import "google.golang.org/genai" // Create a single client object client , err := genai . NewClient ( ctx , nil ) // Access API methods through services on the client object result , err := client . Models . GenerateContent ( ... ) chat , err := client . Chats . Create ( ... ) uploadedFile , err := client . Files . Upload ( ... ) tuningJob , err := client . Tunings . Tune ( ... ) Authentication Both legacy and new libraries authenticate using API keys. You can create your API key in Google AI Studio. Before Python The old SDK handled the API client object implicitly. import google.generativeai as genai genai . configure ( api_key =... ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); Go Import the Google libraries: import ( "github.com/google/generative-ai-go/genai" "google.golang.org/api/option" ) Create the client: client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) After Python With Google GenAI SDK, you create an API client first, which is used to call the API. The new SDK will pick up your API key from either one of the GEMINI_API_KEY or GOOGLE_API_KEY environment variables, if you don't pass one to the client. export GEMINI_API_KEY = "YOUR_API_KEY" from google import genai client = genai . Client () # Set the API key using the GEMINI_API_KEY env var. # Alternatively, you could set the API key explicitly: # client = genai.Client(api_key="your_api_key") JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); Go Import the GenAI library: import "google.golang.org/genai" Create the client: client , err := genai . NewClient ( ctx , & genai . ClientConfig { Backend : genai . BackendGeminiAPI , }) Generate content Text Before Python Previously, there were no client objects, you accessed APIs directly through GenerativeModel objects. import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( 'Tell me a story in 300 words' ) print ( response . 
text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Tell me a story in 300 words" ; const result = await model . generateContent ( prompt ); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me a story in 300 words." )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response parts After Python The new Google GenAI SDK provides access to all the API methods through the Client object. Except for a few stateful special cases ( chat and live-api session s), these are all stateless functions. For utility and uniformity, objects returned are pydantic classes. from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ) print ( response . text ) print ( response . model_dump_json ( exclude_none = True , indent = 4 )) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story in 300 words." , }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me a story in 300 words." ), nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Image Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Tell me a story based on this image' , Image . open ( image_path ) ]) print ( response . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); function fileToGenerativePart ( path , mimeType ) { return { inlineData : { data : Buffer . from ( fs . readFileSync ( path )). toString ( "base64" ), mimeType , }, }; } const prompt = "Tell me a story based on this image" ; const imagePart = fileToGenerativePart ( `path/to/organ.jpg` , "image/jpeg" , ); const result = await model . generateContent ([ prompt , imagePart ]); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) imgData , err := os . ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about this instrument" ), genai . ImageData ( "jpeg" , imgData )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python Many of the same convenience features exist in the new SDK. 
For example, PIL.Image objects are automatically converted. from google import genai from PIL import Image client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Tell me a story based on this image' , Image . open ( image_path ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const organ = await ai . files . upload ({ file : "path/to/organ.jpg" , }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : [ createUserContent ([ "Tell me a story based on this image" , createPartFromUri ( organ . uri , organ . mimeType ) ]), ], }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imgData , err := os . ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { { Text : "Tell me a story based on this image" }, { InlineData : & genai . Blob { Data : imgData , MIMEType : "image/jpeg" }}, } contents := [] * genai . Content { { Parts : parts }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Streaming Before Python import google.generativeai as genai response = model . generate_content ( "Write a cute story about cats." , stream = True ) for chunk in response : print ( chunk . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Write a story about a magic backpack." ; const result = await model . generateContentStream ( prompt ); // Print text as it comes in. for await ( const chunk of result . stream ) { const chunkText = chunk . text (); process . stdout . write ( chunkText ); } Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) iter := model . GenerateContentStream ( ctx , genai . Text ( "Write a story about a magic backpack." )) for { resp , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing the response } After Python from google import genai client = genai . Client () for chunk in client . models . generate_content_stream ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ): print ( chunk . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContentStream ({ model : "gemini-2.0-flash" , contents : "Write a story about a magic backpack." , }); let text = "" ; for await ( const chunk of response ) { console . log ( chunk . text ); text += chunk . text ; } Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } for result , err := range client . Models . GenerateContentStream ( ctx , "gemini-2.0-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) { if err != nil { log . Fatal ( err ) } fmt . Print ( result . Candidates [ 0 ]. Content . 
Parts [ 0 ]. Text ) } Configuration Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' , system_instruction = 'you are a story teller for kids under 5 years old' , generation_config = genai . GenerationConfig ( max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], ) ) response = model . generate_content ( 'tell me a story in 100 words' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , generationConfig : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); const result = await model . generateContent ( "Tell me a story about a magic backpack." , ); console . log ( result . response . text ()) Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) model . SetTemperature ( 0.5 ) model . SetTopP ( 0.5 ) model . SetTopK ( 2.0 ) model . SetMaxOutputTokens ( 100 ) model . ResponseMIMEType = "application/json" resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about New York" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python For all methods in the new SDK, the required arguments are provided as keyword arguments. All optional inputs are provided in the config argument. Config arguments can be specified as either Python dictionaries or Config classes in the google.genai.types namespace. For utility and uniformity, all definitions within the types module are pydantic classes. from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 100 words.' , config = types . GenerateContentConfig ( system_instruction = 'you are a story teller for kids under 5 years old' , max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], seed = 42 , ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story about a magic backpack." , config : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York" ), & genai . GenerateContentConfig { Temperature : genai . Ptr [ float32 ]( 0.5 ), TopP : genai . Ptr [ float32 ]( 0.5 ), TopK : genai . Ptr [ float32 ]( 2.0 ), ResponseMIMEType : "application/json" , StopSequences : [] string { "Yankees" }, CandidateCount : 2 , Seed : genai . Ptr [ int32 ]( 42 ), MaxOutputTokens : 128 , PresencePenalty : genai . Ptr [ float32 ]( 0.5 ), FrequencyPenalty : genai . Ptr [ float32 ]( 0.5 ), }, ) if err != nil { log . 
Fatal ( err ) } debugPrint ( result ) // utility for printing response Safety settings Generate a response with safety settings: Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( 'say something bad' , safety_settings = { 'HATE' : 'BLOCK_ONLY_HIGH' , 'HARASSMENT' : 'BLOCK_ONLY_HIGH' , } ) JavaScript import { GoogleGenerativeAI , HarmCategory , HarmBlockThreshold } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , safetySettings : [ { category : HarmCategory . HARM_CATEGORY_HARASSMENT , threshold : HarmBlockThreshold . BLOCK_LOW_AND_ABOVE , }, ], }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! Write an ironic phrase telling " + "them how I feel about them." ; const result = await model . generateContent ( unsafePrompt ); try { result . response . text (); } catch ( e ) { console . error ( e ); console . log ( result . response . candidates [ 0 ]. safetyRatings ); } After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'say something bad' , config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = 'HARM_CATEGORY_HATE_SPEECH' , threshold = 'BLOCK_ONLY_HIGH' ), ] ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! Write an ironic phrase telling " + "them how I feel about them." ; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : unsafePrompt , config : { safetySettings : [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_ONLY_HIGH" , }, ], }, }); console . log ( "Finish reason:" , response . candidates [ 0 ]. finishReason ); console . log ( "Safety ratings:" , response . candidates [ 0 ]. safetyRatings ); Async Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content_async ( 'tell me a story in 100 words' ) After Python To use the new SDK with asyncio , there is a separate async implementation of every method under client.aio . from google import genai client = genai . Client () response = await client . aio . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ) Chat Start a chat and send a message to the model: Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) chat = model . start_chat () response = chat . send_message ( "Tell me a story in 100 words" ) response = chat . send_message ( "What happened after that?" ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const chat = model . startChat ({ history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); let result = await chat . sendMessage ( "I have 2 dogs in my house." ); console . log ( result . response . text ()); result = await chat . 
sendMessage ( "How many paws are in my house?" ); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) cs := model . StartChat () cs . History = [] * genai . Content { { Parts : [] genai . Part { genai . Text ( "Hello, I have 2 dogs in my house." ), }, Role : "user" , }, { Parts : [] genai . Part { genai . Text ( "Great to meet you. What would you like to know?" ), }, Role : "model" , }, } res , err := cs . SendMessage ( ctx , genai . Text ( "How many paws are in my house?" )) if err != nil { log . Fatal ( err ) } printResponse ( res ) // utility for printing the response After Python from google import genai client = genai . Client () chat = client . chats . create ( model = 'gemini-2.0-flash' ) response = chat . send_message ( message = 'Tell me a story in 100 words' ) response = chat . send_message ( message = 'What happened after that?' ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const chat = ai . chats . create ({ model : "gemini-2.0-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in my house?" , }); console . log ( "Chat response 2:" , response2 . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } chat , err := client . Chats . Create ( ctx , "gemini-2.0-flash" , nil , nil ) if err != nil { log . Fatal ( err ) } result , err := chat . SendMessage ( ctx , genai . Part { Text : "Hello, I have 2 dogs in my house." }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result result , err = chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Function calling Before Python import google.generativeai as genai from enum import Enum def get_current_weather ( location : str ) - > str : """Get the current whether in a given location. Args: location: required, The city and state, e.g. San Franciso, CA unit: celsius or fahrenheit """ print ( f 'Called with: { location =} ' ) return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) response = model . generate_content ( "What is the weather in San Francisco?" ) function_call = response . candidates [ 0 ] . parts [ 0 ] . function_call After Python In the new SDK, automatic function calling is the default. Here, you disable it. from google import genai from google.genai import types client = genai . Client () def get_current_weather ( location : str ) - > str : """Get the current whether in a given location. Args: location: required, The city and state, e.g. San Franciso, CA unit: celsius or fahrenheit """ print ( f 'Called with: { location =} ' ) return "23C" response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . 
GenerateContentConfig ( tools = [ get_current_weather ], automatic_function_calling = { 'disable' : True }, ), ) function_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call Automatic function calling Before Python The old SDK only supports automatic function calling in chat. In the new SDK this is the default behavior in generate_content . import google.generativeai as genai def get_current_weather ( city : str ) - > str : return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) chat = model . start_chat ( enable_automatic_function_calling = True ) result = chat . send_message ( "What is the weather in San Francisco?" ) After Python from google import genai from google.genai import types client = genai . Client () def get_current_weather ( city : str ) - > str : return "23C" response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . GenerateContentConfig ( tools = [ get_current_weather ] ), ) Code execution Code execution is a tool that allows the model to generate Python code, run it, and return the result. Before Python import google.generativeai as genai model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = "code_execution" ) result = model . generate_content ( "What is the sum of the first 50 prime numbers? Generate and run code for " "the calculation, and make sure you get all 50." ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , tools : [{ codeExecution : {} }], }); const result = await model . generateContent ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get " + "all 50." , ); console . log ( result . response . text ()); After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the sum of the first 50 prime numbers? Generate and run ' 'code for the calculation, and make sure you get all 50.' , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )], ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-pro-exp-02-05" , contents : `Write and execute code that calculates the sum of the first 50 prime numbers. Ensure that only the executable code and its resulting output are generated.` , }); // Each part may contain text, executable code, or an execution result. for ( const part of response . candidates [ 0 ]. content . parts ) { console . log ( part ); console . log ( "\n" ); } console . log ( "-" . repeat ( 80 )); // The `.text` accessor concatenates the parts into a markdown-formatted text. console . log ( "\n" , response . text ); Search grounding GoogleSearch (Gemini>=2.0) and GoogleSearchRetrieval (Gemini < 2.0) are tools that allow the model to retrieve public web data for grounding, powered by Google. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( contents = "what is the Google stock price?" 
, tools = 'google_search_retrieval' ) After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the Google stock price?' , config = types . GenerateContentConfig ( tools = [ types . Tool ( google_search = types . GoogleSearch () ) ] ) ) JSON response Generate answers in JSON format. Before Python By specifying a response_schema and setting response_mime_type="application/json" users can constrain the model to produce a JSON response following a given structure. import google.generativeai as genai import typing_extensions as typing class CountryInfo ( typing . TypedDict ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" ) result = model . generate_content ( "Give me information of the United States" , generation_config = genai . GenerationConfig ( response_mime_type = "application/json" , response_schema = CountryInfo ), ) JavaScript import { GoogleGenerativeAI , SchemaType } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const schema = { description : "List of recipes" , type : SchemaType . ARRAY , items : { type : SchemaType . OBJECT , properties : { recipeName : { type : SchemaType . STRING , description : "Name of the recipe" , nullable : false , }, }, required : [ "recipeName" ], }, }; const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" , generationConfig : { responseMimeType : "application/json" , responseSchema : schema , }, }); const result = await model . generateContent ( "List a few popular cookie recipes." , ); console . log ( result . response . text ()); After Python The new SDK uses pydantic classes to provide the schema (although you can pass a genai.types.Schema , or equivalent dict ). When possible, the SDK will parse the returned JSON, and return the result in response.parsed . If you provided a pydantic class as the schema the SDK will convert that JSON to an instance of the class. from google import genai from pydantic import BaseModel client = genai . Client () class CountryInfo ( BaseModel ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Give me information of the United States.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : CountryInfo , }, ) response . parsed JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "List a few popular cookie recipes." , config : { responseMimeType : "application/json" , responseSchema : { type : "array" , items : { type : "object" , properties : { recipeName : { type : "string" }, ingredients : { type : "array" , items : { type : "string" } }, }, required : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); Files Upload Upload a file: Before Python import requests import pathlib import google.generativeai as genai # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . 
text ) file = genai . upload_file ( path = 'a11.txt' ) model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Can you summarize this file:' , my_file ]) print ( response . text ) After Python import requests import pathlib from google import genai client = genai . Client () # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) my_file = client . files . upload ( file = 'a11.txt' ) response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Can you summarize this file:' , my_file ] ) print ( response . text ) List and get List uploaded files and get an uploaded file with a filename: Before Python import google.generativeai as genai for file in genai . list_files (): print ( file . name ) file = genai . get_file ( name = file . name ) After Python from google import genai client = genai . Client () for file in client . files . list (): print ( file . name ) file = client . files . get ( name = file . name ) Delete Delete a file: Before Python import pathlib import google.generativeai as genai pathlib . Path ( 'dummy.txt' ) . write_text ( dummy ) dummy_file = genai . upload_file ( path = 'dummy.txt' ) file = genai . delete_file ( name = dummy_file . name ) After Python import pathlib from google import genai client = genai . Client () pathlib . Path ( 'dummy.txt' ) . write_text ( dummy ) dummy_file = client . files . upload ( file = 'dummy.txt' ) response = client . files . delete ( name = dummy_file . name ) Context caching Context caching allows the user to pass the content to the model once, cache the input tokens, and then refer to the cached tokens in subsequent calls to lower the cost. Before Python import requests import pathlib import google.generativeai as genai from google.generativeai import caching # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = genai . upload_file ( path = "a11.txt" ) # Create cache apollo_cache = caching . CachedContent . create ( model = "gemini-1.5-flash-001" , system_instruction = "You are an expert at analyzing transcripts." , contents = [ document ], ) # Generate response apollo_model = genai . GenerativeModel . from_cached_content ( cached_content = apollo_cache ) response = apollo_model . generate_content ( "Find a lighthearted moment from this transcript" ) JavaScript import { GoogleAICacheManager , GoogleAIFileManager } from "@google/generative-ai/server" ; import { GoogleGenerativeAI } from "@google/generative-ai" ; const cacheManager = new GoogleAICacheManager ( "GOOGLE_API_KEY" ); const fileManager = new GoogleAIFileManager ( "GOOGLE_API_KEY" ); const uploadResult = await fileManager . uploadFile ( "path/to/a11.txt" , { mimeType : "text/plain" , }); const cacheResult = await cacheManager . create ({ model : "models/gemini-1.5-flash" , contents : [ { role : "user" , parts : [ { fileData : { fileUri : uploadResult . file . uri , mimeType : uploadResult . file . mimeType , }, }, ], }, ], }); console . log ( cacheResult ); const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModelFromCachedContent ( cacheResult ); const result = await model . generateContent ( "Please summarize this transcript." , ); console . log ( result . response . 
text ()); After Python import requests import pathlib from google import genai from google.genai import types client = genai . Client () # Check which models support caching. for m in client . models . list (): for action in m . supported_actions : if action == "createCachedContent" : print ( m . name ) break # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = client . files . upload ( file = 'a11.txt' ) # Create cache model = 'gemini-1.5-flash-001' apollo_cache = client . caches . create ( model = model , config = { 'contents' : [ document ], 'system_instruction' : 'You are an expert at analyzing transcripts.' , }, ) # Generate response response = client . models . generate_content ( model = model , contents = 'Find a lighthearted moment from this transcript' , config = types . GenerateContentConfig ( cached_content = apollo_cache . name , ) ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const filePath = path . join ( media , "a11.txt" ); const document = await ai . files . upload ({ file : filePath , config : { mimeType : "text/plain" }, }); console . log ( "Uploaded file name:" , document . name ); const modelName = "gemini-1.5-flash" ; const contents = [ createUserContent ( createPartFromUri ( document . uri , document . mimeType )), ]; const cache = await ai . caches . create ({ model : modelName , config : { contents : contents , systemInstruction : "You are an expert analyzing transcripts." , }, }); console . log ( "Cache created:" , cache ); const response = await ai . models . generateContent ({ model : modelName , contents : "Please summarize this transcript" , config : { cachedContent : cache . name }, }); console . log ( "Response text:" , response . text ); Count tokens Count the number of tokens in a request. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . count_tokens ( 'The quick brown fox jumps over the lazy dog.' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY+); const model = genAI.getGenerativeModel({ model: " gemini - 1.5 - flash ", }); // Count tokens in a prompt without calling text generation. const countResult = await model.countTokens( " The quick brown fox jumps over the lazy dog . ", ); console.log(countResult.totalTokens); // 11 const generateResult = await model.generateContent( " The quick brown fox jumps over the lazy dog . " , ); // On the response for `generateContent`, use `usageMetadata` // to get separate input and output token counts // (`promptTokenCount` and `candidatesTokenCount`, respectively), // as well as the combined token count (`totalTokenCount`). console . log ( generateResult . response . usageMetadata ); // candidatesTokenCount and totalTokenCount depend on response, may vary // { promptTokenCount: 11, candidatesTokenCount: 124, totalTokenCount: 135 } After Python from google import genai client = genai . Client () response = client . models . count_tokens ( model = 'gemini-2.0-flash' , contents = 'The quick brown fox jumps over the lazy dog.' , ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const prompt = "The quick brown fox jumps over the lazy dog." ; const countTokensResponse = await ai . 
Count tokens
Count the number of tokens in a request.
Before Python
import google.generativeai as genai
model = genai.GenerativeModel('gemini-1.5-flash')
response = model.count_tokens('The quick brown fox jumps over the lazy dog.')
JavaScript
import { GoogleGenerativeAI } from "@google/generative-ai";
const genAI = new GoogleGenerativeAI("GOOGLE_API_KEY");
const model = genAI.getGenerativeModel({
  model: "gemini-1.5-flash",
});
// Count tokens in a prompt without calling text generation.
const countResult = await model.countTokens(
  "The quick brown fox jumps over the lazy dog.",
);
console.log(countResult.totalTokens); // 11
const generateResult = await model.generateContent(
  "The quick brown fox jumps over the lazy dog.",
);
// On the response for `generateContent`, use `usageMetadata`
// to get separate input and output token counts
// (`promptTokenCount` and `candidatesTokenCount`, respectively),
// as well as the combined token count (`totalTokenCount`).
console.log(generateResult.response.usageMetadata);
// candidatesTokenCount and totalTokenCount depend on response, may vary
// { promptTokenCount: 11, candidatesTokenCount: 124, totalTokenCount: 135 }
After Python
from google import genai
client = genai.Client()
response = client.models.count_tokens(
    model='gemini-2.0-flash',
    contents='The quick brown fox jumps over the lazy dog.',
)
JavaScript
import { GoogleGenAI } from '@google/genai';
const ai = new GoogleGenAI({ apiKey: "GOOGLE_API_KEY" });
const prompt = "The quick brown fox jumps over the lazy dog.";
const countTokensResponse = await ai.models.countTokens({
  model: "gemini-2.0-flash",
  contents: prompt,
});
console.log(countTokensResponse.totalTokens);
const generateResponse = await ai.models.generateContent({
  model: "gemini-2.0-flash",
  contents: prompt,
});
console.log(generateResponse.usageMetadata);
Generate images
Generate images:
Before Python
# pip install https://github.com/google-gemini/generative-ai-python@imagen
import google.generativeai as genai
imagen = genai.ImageGenerationModel("imagen-3.0-generate-001")
gen_images = imagen.generate_images(
    prompt="Robot holding a red skateboard",
    number_of_images=1,
    safety_filter_level="block_low_and_above",
    person_generation="allow_adult",
    aspect_ratio="3:4",
)
After Python
import pathlib
from google import genai
from google.genai import types
client = genai.Client()
gen_images = client.models.generate_images(
    model='imagen-3.0-generate-001',
    prompt='Robot holding a red skateboard',
    config=types.GenerateImagesConfig(
        number_of_images=1,
        safety_filter_level="BLOCK_LOW_AND_ABOVE",
        person_generation="ALLOW_ADULT",
        aspect_ratio="3:4",
    )
)
for n, image in enumerate(gen_images.generated_images):
    pathlib.Path(f'{n}.png').write_bytes(image.image.image_bytes)
Embed content
Generate content embeddings.
Before Python
import google.generativeai as genai
response = genai.embed_content(
    model='models/text-embedding-004',
    content='Hello world'
)
JavaScript
import { GoogleGenerativeAI } from "@google/generative-ai";
const genAI = new GoogleGenerativeAI("GOOGLE_API_KEY");
const model = genAI.getGenerativeModel({
  model: "text-embedding-004",
});
const result = await model.embedContent("Hello world!");
console.log(result.embedding);
After Python
from google import genai
client = genai.Client()
response = client.models.embed_content(
    model='text-embedding-004',
    contents='Hello world',
)
JavaScript
import { GoogleGenAI } from '@google/genai';
const ai = new GoogleGenAI({ apiKey: "GOOGLE_API_KEY" });
const text = "Hello World!";
const result = await ai.models.embedContent({
  model: "text-embedding-004",
  contents: text,
  config: { outputDimensionality: 10 },
});
console.log(result.embeddings);
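Embeddings are usually consumed by comparing vectors rather than printing them, so a natural follow-up to embed_content is a similarity check. A minimal sketch, assuming the Python response exposes the raw vectors as response.embeddings[i].values (attribute names may differ between SDK versions):
Python
import math
from google import genai

client = genai.Client()

response = client.models.embed_content(
    model='text-embedding-004',
    contents=['The quick brown fox', 'A fast auburn fox'],
)

# Cosine similarity between the two returned embedding vectors.
a = response.embeddings[0].values
b = response.embeddings[1].values
dot = sum(x * y for x, y in zip(a, b))
norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
print('cosine similarity:', dot / norm)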
Tune a Model
Create and use a tuned model. The new SDK simplifies tuning with client.tunings.tune, which launches the tuning job and polls until the job is complete.
Before Python
import google.generativeai as genai
import random
# create tuning model
train_data = {}
for i in range(1, 6):
    key = f'input {i}'
    value = f'output {i}'
    train_data[key] = value
name = f'generate-num-{random.randint(0, 10000)}'
operation = genai.create_tuned_model(
    source_model='models/gemini-1.5-flash-001-tuning',
    training_data=train_data,
    id=name,
    epoch_count=5,
    batch_size=4,
    learning_rate=0.001,
)
# wait for tuning complete
tuningProgress = operation.result()
# generate content with the tuned model
model = genai.GenerativeModel(model_name=f'tunedModels/{name}')
response = model.generate_content('55')
After Python
from google import genai
from google.genai import types
client = genai.Client()
# Check which models are available for tuning.
for m in client.models.list():
    for action in m.supported_actions:
        if action == "createTunedModel":
            print(m.name)
            break
# create tuning model
training_dataset = types.TuningDataset(
    examples=[
        types.TuningExample(
            text_input=f'input {i}',
            output=f'output {i}',
        )
        for i in range(5)
    ],
)
tuning_job = client.tunings.tune(
    base_model='models/gemini-1.5-flash-001-tuning',
    training_dataset=training_dataset,
    config=types.CreateTuningJobConfig(
        epoch_count=5,
        batch_size=4,
        learning_rate=0.001,
        tuned_model_display_name="test tuned model"
    )
)
# generate content with the tuned model
response = client.models.generate_content(
    model=tuning_job.tuned_model.model,
    contents='55',
)
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC.
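Both "After" examples above scan client.models.list() for a specific supported_actions entry ("createCachedContent" and "createTunedModel"). That check can be factored into a small helper so the same pattern covers any capability string; the helper name models_supporting is ours, not part of the SDK:
Python
from google import genai

client = genai.Client()

def models_supporting(client, action):
    # Yield the names of models whose supported_actions include the given action.
    for m in client.models.list():
        if action in (m.supported_actions or []):
            yield m.name

print(list(models_supporting(client, 'createCachedContent')))
print(list(models_supporting(client, 'createTunedModel')))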
|
text_content/docs_models_1b6f8b5b.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/models#gemini-1.5-flash-8b
|
2 |
+
Title: Gemini models | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. 
To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
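The version-name patterns described above are plain strings passed wherever a model id is expected, so pinning an app to a specific stable release is a one-line change. A brief sketch using the google-genai Python client shown elsewhere in these pages, with model ids taken from this page:
Python
from google import genai

client = genai.Client()

# 'Latest stable' pattern: <model>-<generation>-<variation>
response = client.models.generate_content(
    model='gemini-2.0-flash',
    contents='Explain model version name patterns in one sentence.',
)
print(response.text)

# Pinned stable pattern, recommended for production: <model>-<generation>-<variation>-<version>
pinned = client.models.generate_content(
    model='gemini-2.0-flash-001',
    contents='Same request against a pinned stable version.',
)
print(pinned.text)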
|
text_content/docs_models_36def28a.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/models
|
2 |
+
Title: Gemini models | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. 
To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
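The footnote in these tables gives a rule of thumb for sizing prompts against the listed token limits: roughly 4 characters per token, and about 60-80 English words per 100 tokens, for Gemini models. A throwaway helper applying that heuristic (an approximation only; use the API's count_tokens for exact numbers):
Python
def rough_token_estimate(text: str) -> int:
    # Heuristic from the model tables: ~4 characters per token for Gemini models.
    return max(1, len(text) // 4)

prompt = "The quick brown fox jumps over the lazy dog."
print(rough_token_estimate(prompt))  # rough estimate, not an exact count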
|
text_content/docs_models_988b7c3a.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview-tts
|
2 |
+
Title: Gemini models | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. 
To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
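The model version name patterns described above only change the model string passed at request time. Below is a minimal sketch, assuming the google-genai Python SDK and a placeholder GEMINI_API_KEY, that calls the same variant through its latest-stable and pinned-stable names and then lists the models the key can access, so that -preview and -exp suffixes are visible.

from google import genai

client = genai.Client(api_key="GEMINI_API_KEY")  # placeholder key

# Latest stable (<model>-<generation>-<variation>) vs. pinned stable
# (<model>-<generation>-<variation>-<version>): only the string differs.
for model_id in ("gemini-2.0-flash", "gemini-2.0-flash-001"):
    response = client.models.generate_content(model=model_id, contents="Say hello.")
    print(model_id, "->", response.text)

# Enumerate available models; -preview and -exp suffixes mark non-stable versions.
for m in client.models.list():
    print(m.name)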
|
text_content/docs_openai_30a57b37.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/openai#extra-body
|
2 |
+
Title: OpenAI compatibility | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
OpenAI compatibility | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback OpenAI compatibility Gemini models are accessible using the OpenAI libraries (Python and TypeScript / Javascript) along with the REST API, by updating three lines of code and using your Gemini API key . If you aren't already using the OpenAI libraries, we recommend that you call the Gemini API directly . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' What changed? Just three lines! api_key="GEMINI_API_KEY" : Replace " GEMINI_API_KEY " with your actual Gemini API key, which you can get in Google AI Studio . base_url="https://generativelanguage.googleapis.com/v1beta/openai/" : This tells the OpenAI library to send requests to the Gemini API endpoint instead of the default URL. model="gemini-2.0-flash" : Choose a compatible Gemini model Thinking Gemini 2.5 models are trained to think through complex problems, leading to significantly improved reasoning. The Gemini API comes with a "thinking budget" parameter which gives fine grain control over how much the model will think. Unlike the Gemini API, the OpenAI API offers three levels of thinking control: "low" , "medium" , and "high" , which map to 1,024, 8,192, and 24,576 tokens, respectively. If you want to disable thinking, you can set reasoning_effort to "none" (note that reasoning cannot be turned off for 2.5 Pro models). Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , reasoning_effort = "low" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . 
create ({ model : "gemini-2.5-flash" , reasoning_effort : "low" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "reasoning_effort": "low", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' Gemini thinking models also produce thought summaries and can use exact thinking budgets . You can use the extra_body field to include these fields in your request. Note that reasoning_effort and thinking_budget overlap functionality, so they can't be used at the same time. Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [{ "role" : "user" , "content" : "Explain to me how AI works" }], extra_body = { 'extra_body' : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : True } } } } ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , messages : [{ role : "user" , content : "Explain to me how AI works" ,}], extra_body : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : true } } } }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "messages": [{"role": "user", "content": "Explain to me how AI works"}], "extra_body": { "google": { "thinking_config": { "include_thoughts": true } } } }' Streaming The Gemini API supports streaming responses . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream = True ) for chunk in response : print ( chunk . choices [ 0 ] . delta ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const completion = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream : true , }); for await ( const chunk of completion ) { console . log ( chunk . choices [ 0 ]. delta . 
content ); } } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ], "stream": true }' Function calling Function calling makes it easier for you to get structured data outputs from generative models and is supported in the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ] messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }] response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = messages , tools = tools , tool_choice = "auto" ) print ( response ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }]; const tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ]; const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , tools : tools , tool_choice : "auto" , }); console . log ( response ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ { "role": "user", "content": "What' \' 's the weather like in Chicago today?" } ], "tools": [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. Chicago, IL" }, "unit": { "type": "string", "enum": ["celsius", "fahrenheit"] } }, "required": ["location"] } } } ], "tool_choice": "auto" }' Image understanding Gemini models are natively multimodal and provide best in class performance on many common vision tasks . Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) # Function to encode the image def encode_image ( image_path ): with open ( image_path , "rb" ) as image_file : return base64 . b64encode ( image_file . read ()) . decode ( 'utf-8' ) # Getting the base64 string base64_image = encode_image ( "Path/to/agi/image.jpeg" ) response = client . chat . completions . 
create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : f "data:image/jpeg;base64, { base64_image } " }, }, ], } ], ) print ( response . choices [ 0 ]) JavaScript import OpenAI from "openai" ; import fs from 'fs/promises' ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function encodeImage ( imagePath ) { try { const imageBuffer = await fs . readFile ( imagePath ); return imageBuffer . toString ( 'base64' ); } catch ( error ) { console . error ( "Error encoding image:" , error ); return null ; } } async function main () { const imagePath = "Path/to/agi/image.jpeg" ; const base64Image = await encodeImage ( imagePath ); const messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : `data:image/jpeg;base64, ${ base64Image } ` }, }, ], } ]; try { const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , }); console . log ( response . choices [ 0 ]); } catch ( error ) { console . error ( "Error calling Gemini API:" , error ); } } main (); REST bash -c ' base64_image=$(base64 -i "Path/to/agi/image.jpeg"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"What is in this image?\" }, { \"type\": \"image_url\", \"image_url\": { \"url\": \"data:image/jpeg;base64,${base64_image}\" } } ] } ] }" ' Generate an image Note: Image generation is only available in the paid tier. Generate an image: Python import base64 from openai import OpenAI from PIL import Image from io import BytesIO client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" , ) response = client . images . generate ( model = "imagen-3.0-generate-002" , prompt = "a portrait of a sheepadoodle wearing a cape" , response_format = 'b64_json' , n = 1 , ) for image_data in response . data : image = Image . open ( BytesIO ( base64 . b64decode ( image_data . b64_json ))) image . show () JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const image = await openai . images . generate ( { model : "imagen-3.0-generate-002" , prompt : "a portrait of a sheepadoodle wearing a cape" , response_format : "b64_json" , n : 1 , } ); console . log ( image . data ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/images/generations" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "imagen-3.0-generate-002", "prompt": "a portrait of a sheepadoodle wearing a cape", "response_format": "b64_json", "n": 1, }' Audio understanding Analyze audio input: Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) with open ( "/path/to/your/audio/file.wav" , "rb" ) as audio_file : base64_audio = base64 . b64encode ( audio_file . read ()) . decode ( 'utf-8' ) response = client . 
chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "Transcribe this audio" , }, { "type" : "input_audio" , "input_audio" : { "data" : base64_audio , "format" : "wav" } } ], } ], ) print ( response . choices [ 0 ] . message . content ) JavaScript import fs from "fs" ; import OpenAI from "openai" ; const client = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); const audioFile = fs . readFileSync ( "/path/to/your/audio/file.wav" ); const base64Audio = Buffer . from ( audioFile ). toString ( "base64" ); async function main () { const response = await client . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "user" , content : [ { type : "text" , text : "Transcribe this audio" , }, { type : "input_audio" , input_audio : { data : base64Audio , format : "wav" , }, }, ], }, ], }); console . log ( response . choices [ 0 ]. message . content ); } main (); REST Note: If you get an Argument list too long error, the encoding of your audio file might be too long for curl. bash -c ' base64_audio=$(base64 -i "/path/to/your/audio/file.wav"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"Transcribe this audio file.\" }, { \"type\": \"input_audio\", \"input_audio\": { \"data\": \"${base64_audio}\", \"format\": \"wav\" } } ] } ] }" ' Structured output Gemini models can output JSON objects in any structure you define . Python from pydantic import BaseModel from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) class CalendarEvent ( BaseModel ): name : str date : str participants : list [ str ] completion = client . beta . chat . completions . parse ( model = "gemini-2.0-flash" , messages = [ { "role" : "system" , "content" : "Extract the event information." }, { "role" : "user" , "content" : "John and Susan are going to an AI conference on Friday." }, ], response_format = CalendarEvent , ) print ( completion . choices [ 0 ] . message . parsed ) JavaScript import OpenAI from "openai" ; import { zodResponseFormat } from "openai/helpers/zod" ; import { z } from "zod" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai" }); const CalendarEvent = z . object ({ name : z . string (), date : z . string (), participants : z . array ( z . string ()), }); const completion = await openai . beta . chat . completions . parse ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "Extract the event information." }, { role : "user" , content : "John and Susan are going to an AI conference on Friday" }, ], response_format : zodResponseFormat ( CalendarEvent , "event" ), }); const event = completion . choices [ 0 ]. message . parsed ; console . log ( event ); Embeddings Text embeddings measure the relatedness of text strings and can be generated using the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . embeddings . 
create ( input = "Your text string goes here" , model = "text-embedding-004" ) print ( response . data [ 0 ] . embedding ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const embedding = await openai . embeddings . create ({ model : "text-embedding-004" , input : "Your text string goes here" , }); console . log ( embedding ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/embeddings" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "input": "Your text string goes here", "model": "text-embedding-004" }' extra_body There are several features supported by Gemini that are not available in OpenAI models but can be enabled using the extra_body field. extra_body features safety_settings Corresponds to Gemini's SafetySetting . cached_content Corresponds to Gemini's GenerateContentRequest.cached_content . thinking_config Corresponds to Gemini's ThinkingConfig . cached_content Here's an example of using extra_body to set cached_content : Python from openai import OpenAI client = OpenAI ( api_key = MY_API_KEY , base_url = "https://generativelanguage.googleapis.com/v1beta/" ) stream = client . chat . completions . create ( model = "gemini-2.5-pro" , n = 1 , messages = [ { "role" : "user" , "content" : "Summarize the video" } ], stream = True , stream_options = { 'include_usage' : True }, extra_body = { 'extra_body' : { 'google' : { 'cached_content' : "cachedContents/0000aaaa1111bbbb2222cccc3333dddd4444eeee" } } } ) for chunk in stream : print ( chunk ) print ( chunk . usage . to_dict ()) List models Get a list of available Gemini models: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) models = client . models . list () for model in models : print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const list = await openai . models . list (); for await ( const model of list ) { console . log ( model ); } } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models \ -H "Authorization: Bearer GEMINI_API_KEY" Retrieve a model Retrieve a Gemini model: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) model = client . models . retrieve ( "gemini-2.0-flash" ) print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const model = await openai . models . retrieve ( "gemini-2.0-flash" ); console . log ( model . id ); } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models/gemini-2.0-flash \ -H "Authorization: Bearer GEMINI_API_KEY" Current limitations Support for the OpenAI libraries is still in beta while we extend feature support. If you have questions about supported parameters, upcoming features, or run into any issues getting started with Gemini, join our Developer Forum . What's next Try our OpenAI Compatibility Colab to work through more detailed examples. 
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-18 UTC.
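The extra_body table above lists safety_settings alongside cached_content and thinking_config, but only the latter two are shown in full. The following is a minimal sketch, assuming the same 'extra_body' -> 'google' nesting used in the cached_content example; the category and threshold strings are standard Gemini SafetySetting values.

from openai import OpenAI

client = OpenAI(
    api_key="GEMINI_API_KEY",  # placeholder key
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

response = client.chat.completions.create(
    model="gemini-2.0-flash",
    messages=[{"role": "user", "content": "Explain to me how AI works"}],
    # Assumed to follow the same nesting convention as the cached_content example above.
    extra_body={
        "extra_body": {
            "google": {
                "safety_settings": [
                    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_LOW_AND_ABOVE"}
                ]
            }
        }
    },
)
print(response.choices[0].message)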
|
text_content/docs_openai_49a96628.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/openai#main-content
|
2 |
+
Title: OpenAI compatibility | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
OpenAI compatibility | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback OpenAI compatibility Gemini models are accessible using the OpenAI libraries (Python and TypeScript / Javascript) along with the REST API, by updating three lines of code and using your Gemini API key . If you aren't already using the OpenAI libraries, we recommend that you call the Gemini API directly . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' What changed? Just three lines! api_key="GEMINI_API_KEY" : Replace " GEMINI_API_KEY " with your actual Gemini API key, which you can get in Google AI Studio . base_url="https://generativelanguage.googleapis.com/v1beta/openai/" : This tells the OpenAI library to send requests to the Gemini API endpoint instead of the default URL. model="gemini-2.0-flash" : Choose a compatible Gemini model Thinking Gemini 2.5 models are trained to think through complex problems, leading to significantly improved reasoning. The Gemini API comes with a "thinking budget" parameter which gives fine grain control over how much the model will think. Unlike the Gemini API, the OpenAI API offers three levels of thinking control: "low" , "medium" , and "high" , which map to 1,024, 8,192, and 24,576 tokens, respectively. If you want to disable thinking, you can set reasoning_effort to "none" (note that reasoning cannot be turned off for 2.5 Pro models). Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , reasoning_effort = "low" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . 
create ({ model : "gemini-2.5-flash" , reasoning_effort : "low" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "reasoning_effort": "low", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' Gemini thinking models also produce thought summaries and can use exact thinking budgets . You can use the extra_body field to include these fields in your request. Note that reasoning_effort and thinking_budget overlap functionality, so they can't be used at the same time. Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [{ "role" : "user" , "content" : "Explain to me how AI works" }], extra_body = { 'extra_body' : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : True } } } } ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , messages : [{ role : "user" , content : "Explain to me how AI works" ,}], extra_body : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : true } } } }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "messages": [{"role": "user", "content": "Explain to me how AI works"}], "extra_body": { "google": { "thinking_config": { "include_thoughts": true } } } }' Streaming The Gemini API supports streaming responses . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream = True ) for chunk in response : print ( chunk . choices [ 0 ] . delta ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const completion = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream : true , }); for await ( const chunk of completion ) { console . log ( chunk . choices [ 0 ]. delta . 
content ); } } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ], "stream": true }' Function calling Function calling makes it easier for you to get structured data outputs from generative models and is supported in the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ] messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }] response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = messages , tools = tools , tool_choice = "auto" ) print ( response ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }]; const tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ]; const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , tools : tools , tool_choice : "auto" , }); console . log ( response ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ { "role": "user", "content": "What' \' 's the weather like in Chicago today?" } ], "tools": [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. Chicago, IL" }, "unit": { "type": "string", "enum": ["celsius", "fahrenheit"] } }, "required": ["location"] } } } ], "tool_choice": "auto" }' Image understanding Gemini models are natively multimodal and provide best in class performance on many common vision tasks . Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) # Function to encode the image def encode_image ( image_path ): with open ( image_path , "rb" ) as image_file : return base64 . b64encode ( image_file . read ()) . decode ( 'utf-8' ) # Getting the base64 string base64_image = encode_image ( "Path/to/agi/image.jpeg" ) response = client . chat . completions . 
create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : f "data:image/jpeg;base64, { base64_image } " }, }, ], } ], ) print ( response . choices [ 0 ]) JavaScript import OpenAI from "openai" ; import fs from 'fs/promises' ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function encodeImage ( imagePath ) { try { const imageBuffer = await fs . readFile ( imagePath ); return imageBuffer . toString ( 'base64' ); } catch ( error ) { console . error ( "Error encoding image:" , error ); return null ; } } async function main () { const imagePath = "Path/to/agi/image.jpeg" ; const base64Image = await encodeImage ( imagePath ); const messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : `data:image/jpeg;base64, ${ base64Image } ` }, }, ], } ]; try { const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , }); console . log ( response . choices [ 0 ]); } catch ( error ) { console . error ( "Error calling Gemini API:" , error ); } } main (); REST bash -c ' base64_image=$(base64 -i "Path/to/agi/image.jpeg"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"What is in this image?\" }, { \"type\": \"image_url\", \"image_url\": { \"url\": \"data:image/jpeg;base64,${base64_image}\" } } ] } ] }" ' Generate an image Note: Image generation is only available in the paid tier. Generate an image: Python import base64 from openai import OpenAI from PIL import Image from io import BytesIO client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" , ) response = client . images . generate ( model = "imagen-3.0-generate-002" , prompt = "a portrait of a sheepadoodle wearing a cape" , response_format = 'b64_json' , n = 1 , ) for image_data in response . data : image = Image . open ( BytesIO ( base64 . b64decode ( image_data . b64_json ))) image . show () JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const image = await openai . images . generate ( { model : "imagen-3.0-generate-002" , prompt : "a portrait of a sheepadoodle wearing a cape" , response_format : "b64_json" , n : 1 , } ); console . log ( image . data ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/images/generations" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "imagen-3.0-generate-002", "prompt": "a portrait of a sheepadoodle wearing a cape", "response_format": "b64_json", "n": 1, }' Audio understanding Analyze audio input: Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) with open ( "/path/to/your/audio/file.wav" , "rb" ) as audio_file : base64_audio = base64 . b64encode ( audio_file . read ()) . decode ( 'utf-8' ) response = client . 
chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "Transcribe this audio" , }, { "type" : "input_audio" , "input_audio" : { "data" : base64_audio , "format" : "wav" } } ], } ], ) print ( response . choices [ 0 ] . message . content ) JavaScript import fs from "fs" ; import OpenAI from "openai" ; const client = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); const audioFile = fs . readFileSync ( "/path/to/your/audio/file.wav" ); const base64Audio = Buffer . from ( audioFile ). toString ( "base64" ); async function main () { const response = await client . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "user" , content : [ { type : "text" , text : "Transcribe this audio" , }, { type : "input_audio" , input_audio : { data : base64Audio , format : "wav" , }, }, ], }, ], }); console . log ( response . choices [ 0 ]. message . content ); } main (); REST Note: If you get an Argument list too long error, the encoding of your audio file might be too long for curl. bash -c ' base64_audio=$(base64 -i "/path/to/your/audio/file.wav"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"Transcribe this audio file.\" }, { \"type\": \"input_audio\", \"input_audio\": { \"data\": \"${base64_audio}\", \"format\": \"wav\" } } ] } ] }" ' Structured output Gemini models can output JSON objects in any structure you define . Python from pydantic import BaseModel from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) class CalendarEvent ( BaseModel ): name : str date : str participants : list [ str ] completion = client . beta . chat . completions . parse ( model = "gemini-2.0-flash" , messages = [ { "role" : "system" , "content" : "Extract the event information." }, { "role" : "user" , "content" : "John and Susan are going to an AI conference on Friday." }, ], response_format = CalendarEvent , ) print ( completion . choices [ 0 ] . message . parsed ) JavaScript import OpenAI from "openai" ; import { zodResponseFormat } from "openai/helpers/zod" ; import { z } from "zod" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai" }); const CalendarEvent = z . object ({ name : z . string (), date : z . string (), participants : z . array ( z . string ()), }); const completion = await openai . beta . chat . completions . parse ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "Extract the event information." }, { role : "user" , content : "John and Susan are going to an AI conference on Friday" }, ], response_format : zodResponseFormat ( CalendarEvent , "event" ), }); const event = completion . choices [ 0 ]. message . parsed ; console . log ( event ); Embeddings Text embeddings measure the relatedness of text strings and can be generated using the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . embeddings . 
create ( input = "Your text string goes here" , model = "text-embedding-004" ) print ( response . data [ 0 ] . embedding ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const embedding = await openai . embeddings . create ({ model : "text-embedding-004" , input : "Your text string goes here" , }); console . log ( embedding ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/embeddings" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "input": "Your text string goes here", "model": "text-embedding-004" }' extra_body There are several features supported by Gemini that are not available in OpenAI models but can be enabled using the extra_body field. extra_body features safety_settings Corresponds to Gemini's SafetySetting . cached_content Corresponds to Gemini's GenerateContentRequest.cached_content . thinking_config Corresponds to Gemini's ThinkingConfig . cached_content Here's an example of using extra_body to set cached_content : Python from openai import OpenAI client = OpenAI ( api_key = MY_API_KEY , base_url = "https://generativelanguage.googleapis.com/v1beta/" ) stream = client . chat . completions . create ( model = "gemini-2.5-pro" , n = 1 , messages = [ { "role" : "user" , "content" : "Summarize the video" } ], stream = True , stream_options = { 'include_usage' : True }, extra_body = { 'extra_body' : { 'google' : { 'cached_content' : "cachedContents/0000aaaa1111bbbb2222cccc3333dddd4444eeee" } } } ) for chunk in stream : print ( chunk ) print ( chunk . usage . to_dict ()) List models Get a list of available Gemini models: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) models = client . models . list () for model in models : print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const list = await openai . models . list (); for await ( const model of list ) { console . log ( model ); } } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models \ -H "Authorization: Bearer GEMINI_API_KEY" Retrieve a model Retrieve a Gemini model: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) model = client . models . retrieve ( "gemini-2.0-flash" ) print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const model = await openai . models . retrieve ( "gemini-2.0-flash" ); console . log ( model . id ); } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models/gemini-2.0-flash \ -H "Authorization: Bearer GEMINI_API_KEY" Current limitations Support for the OpenAI libraries is still in beta while we extend feature support. If you have questions about supported parameters, upcoming features, or run into any issues getting started with Gemini, join our Developer Forum . What's next Try our OpenAI Compatibility Colab to work through more detailed examples. 
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-18 UTC.
|
text_content/docs_prompting_with_media_e3aaecf9.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#specific-instructions
|
2 |
+
Title: Files API | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. 
curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl "https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. 
The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. 
If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. 
Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. 
Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. 
But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
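As a closing example for this page, here is a minimal sketch, assuming a hypothetical local image path, that uploads a file with the Files API and applies two of the best practices above: the single image is placed before the text, and the output format is specified explicitly. Python
from google import genai

client = genai.Client()

# Hypothetical path; any supported image type works.
myfile = client.files.upload(file="path/to/airport_board.jpg")

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[
        myfile,  # image first for a single-image prompt
        "Parse the time and city from the airport board into a markdown list.",
    ],
)
print(response.text)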
|
text_content/docs_sdks_de55873c.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/sdks
|
2 |
+
Title: Gemini API libraries | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: <dependencies> <dependency> <groupId>com.google.genai</groupId> <artifactId>google-genai</artifactId> <version>1.0.0</version> </dependency> </dependencies> General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. 
google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC.
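After installation, a quick way to confirm that the library and your API key are working is to list the available models. A minimal sketch in Python, assuming the GEMINI_API_KEY environment variable is set so the client can pick it up; the other languages have equivalent calls. Python
from google import genai

client = genai.Client()  # reads the API key from the environment

# Print model names to confirm the SDK can reach the API.
for model in client.models.list():
    print(model.name)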
|
text_content/docs_system-instructions_f32b59f7.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/system-instructions#system-instructions
|
2 |
+
Title: Text generation | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . 
text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. 
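System instructions and generation parameters can also be combined in a single GenerateContentConfig. A minimal sketch; max_output_tokens is one more of the configurable parameters, and the specific values here are illustrative. Python
from google import genai
from google.genai import types

client = genai.Client()

config = types.GenerateContentConfig(
    system_instruction="You are a concise technical writer.",
    temperature=0.2,
    max_output_tokens=200,  # caps the length of the response
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works",
    config=config,
)
print(response.text)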
Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } For alternative methods of providing images and more advanced image processing, see our image understanding guide . 
The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . 
create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . 
create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. 
What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
|
text_content/docs_text-generation_61cae815.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+

URL: https://ai.google.dev/gemini-api/docs/text-generation?lang=python#main-content
|
2 |
+
Title: Text generation | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . 
text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. 
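Several of these parameters can be combined in a single GenerateContentConfig. The snippet below is a minimal Python sketch with illustrative values only; the complete list of supported fields is in the API reference:

Python

from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works",
    config=types.GenerateContentConfig(
        system_instruction="You are a concise technical writer.",
        temperature=0.4,          # lower values give more deterministic output
        top_p=0.95,
        max_output_tokens=256,    # cap the length of the response
        stop_sequences=["Title"],
    ),
)
print(response.text)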
Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } For alternative methods of providing images and more advanced image processing, see our image understanding guide . 
The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . 
create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . 
create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. 
What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
|
text_content/docs_text-generation_840d1f94.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/text-generation?lang=python#system-instructions
|
2 |
+
Title: Text generation | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . 
text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. 
Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } For alternative methods of providing images and more advanced image processing, see our image understanding guide . 
The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . 
create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . 
create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. 
What's next Try the Gemini API getting started Colab. Explore Gemini's image, video, audio and document understanding capabilities. Learn about multimodal file prompting strategies. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
|
text_content/docs_thinking_46527506.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/thinking#main-content
|
2 |
+
Title: Gemini thinking | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. " } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are thinkingBudget configuration details for each model type. 
Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . 
GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! 
part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . Text ) } } } } Thought signatures Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling, specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. Thought signatures will increase the input tokens you are charged when sent back as part of the request. When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata . thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... 
usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks (Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. 
This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-11 UTC.
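To make the thought-signature guidance above more concrete, here is a hedged Python sketch of a multi-turn exchange with thinking and function calling enabled. The get_weather declaration and its response payload are hypothetical; the key point, per the page, is that the model's previous response content is returned wholesale (all parts, and therefore any thought signatures) on the next turn.

Python
# Hedged sketch, not the official sample: keeping thought context across turns.
from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical function declaration used only for illustration.
get_weather = {
    "name": "get_weather",
    "description": "Returns the current weather for a city.",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
}

config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[get_weather])],
    thinking_config=types.ThinkingConfig(thinking_budget=-1),  # dynamic thinking
)

history = [
    types.Content(role="user", parts=[types.Part(text="What's the weather in Paris?")]),
]
first = client.models.generate_content(
    model="gemini-2.5-pro", contents=history, config=config
)

# Per the guidance above: send the entire previous response content back,
# with all of its parts, instead of copying or merging individual parts.
history.append(first.candidates[0].content)
history.append(
    types.Content(
        role="user",
        parts=[
            types.Part.from_function_response(
                name="get_weather", response={"result": "18°C and sunny"}
            )
        ],
    )
)
second = client.models.generate_content(
    model="gemini-2.5-pro", contents=history, config=config
)
print(second.text)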
|
text_content/docs_thinking_693f2dc9.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/thinking#set-budget
|
2 |
+
Title: Gemini thinking | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. " } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are thinkingBudget configuration details for each model type. 
Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . 
GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! 
part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . Text ) } } } } Thought signatures Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling, specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. Thought signatures will increase the input tokens you are charged when sent back as part of the request. When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata . thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... 
usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks (Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. 
This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-11 UTC.
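As a small illustration of combining thinking with tools, the sketch below enables the Google Search tool alongside dynamic thinking using the Python SDK. The prompt is invented, and the tool configuration is an assumption based on the grounding documentation rather than code from this page.

Python
# Minimal sketch: Google Search grounding plus dynamic thinking.
from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize this week's most notable space-launch news.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(google_search=types.GoogleSearch())],
        thinking_config=types.ThinkingConfig(thinking_budget=-1),  # dynamic thinking
    ),
)
print(response.text)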
|
text_content/docs_thinking_88bba5e2.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/thinking#supported-models
|
2 |
+
Title: Gemini thinking | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. " } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are thinkingBudget configuration details for each model type. 
Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . 
GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! 
part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . Text ) } } } } Thought signatures Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling, specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. Thought signatures will increase the input tokens you are charged when sent back as part of the request. When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata . thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... 
usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks (Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. 
This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-11 UTC.
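The pricing note above can be turned into a quick bookkeeping sketch. The snippet below reads the thoughts and candidates token counts from the usage metadata and sums them as the billable output side of the response; the per-token price is left as a placeholder rather than a real rate.

Python
# Sketch: estimating billable output for a thinking response. The price
# variable is a placeholder; check the pricing page for current rates.
from google import genai

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents="What is the sum of the first 50 prime numbers?",
)

usage = response.usage_metadata
thinking_tokens = usage.thoughts_token_count or 0
output_tokens = usage.candidates_token_count or 0

# With thinking on, output-side billing covers thinking tokens plus answer tokens.
billable_output_tokens = thinking_tokens + output_tokens
price_per_output_token = 0.0  # placeholder rate
print(f"Billable output tokens: {billable_output_tokens}")
print(f"Estimated output cost: {billable_output_tokens * price_per_output_token}")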
|
text_content/docs_url-context_36731e11.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/url-context
|
2 |
+
Title: URL context | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
URL context | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback URL context Experimental: The URL context tool is an experimental feature. Using the URL context tool, you can provide Gemini with URLs as additional context for your prompt. The model can then retrieve content from the URLs and use that content to inform and shape its response. This tool is useful for tasks like the following: Extracting key data points or talking points from articles Comparing information across multiple links Synthesizing data from several sources Answering questions based on the content of a specific page or pages Analyzing content for specific purposes (like writing a job description or creating test questions) This guide explains how to use the URL context tool in the Gemini API. Use URL context You can use the URL context tool in two main ways, by itself or in conjunction with Grounding with Google Search . URL Context Only You provide specific URLs that you want the model to analyze directly in your prompt. Example prompts: Summarize this document: YOUR_URLs Extract the key features from the product description on this page: YOUR_URLs Grounding with Google Search + URL Context You can also enable both URL context and Grounding with Google Search together. You can enter a prompt with or without URLs. The model may first search for relevant information and then use the URL context tool to read the content of the search results for a more in-depth understanding. Example prompts: Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute. Recommend 3 books for beginners to read to learn more about the latest YOUR_subject . Code examples with URL context only Python from google import genai from google.genai.types import Tool , GenerateContentConfig , GoogleSearch client = genai . Client () model_id = "gemini-2.5-flash" url_context_tool = Tool ( url_context = types . UrlContext ) response = client . models . generate_content ( model = model_id , contents = "Compare recipes from YOUR_URL1 and YOUR_URL2 " , config = GenerateContentConfig ( tools = [ url_context_tool ], response_modalities = [ "TEXT" ], ) ) for each in response . candidates [ 0 ] . content . parts : print ( each . text ) # get URLs retrieved for context print ( response . candidates [ 0 ] . url_context_metadata ) Javascript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "Compare recipes from YOUR_URL1 and YOUR_URL2 " , ], config : { tools : [{ urlContext : {}}], }, }); console . log ( response . text ); // To get URLs retrieved for context console . log ( response . candidates [ 0 ]. 
urlContextMetadata ) } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [ { "parts": [ {"text": "Compare recipes from YOUR_URL1 and YOUR_URL2 "} ] } ], "tools": [ { "url_context": {} } ] }' > result.json cat result.json Code examples with Grounding with Google Search Python from google import genai from google.genai.types import Tool , GenerateContentConfig , GoogleSearch client = genai . Client () model_id = "gemini-2.5-flash" tools = [] tools . append ( Tool ( url_context = types . UrlContext )) tools . append ( Tool ( google_search = types . GoogleSearch )) response = client . models . generate_content ( model = model_id , contents = "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute." , config = GenerateContentConfig ( tools = tools , response_modalities = [ "TEXT" ], ) ) for each in response . candidates [ 0 ] . content . parts : print ( each . text ) # get URLs retrieved for context print ( response . candidates [ 0 ] . url_context_metadata ) Javascript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute." , ], config : { tools : [{ urlContext : {}}, { googleSearch : {}}], }, }); console . log ( response . text ); // To get URLs retrieved for context console . log ( response . candidates [ 0 ]. urlContextMetadata ) } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [ { "parts": [ {"text": "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute."} ] } ], "tools": [ { "url_context": {} }, { "google_search": {} } ] }' > result.json cat result.json For more details about Grounding with Google Search, see the overview page. Contextual response The model's response will be based on the content it retrieved from the URLs. If the model retrieved content from URLs, the response will include url_context_metadata . Such a response might look something like the following (parts of the response have been omitted for brevity): { "candidates" : [ { "content" : { "parts" : [ { "text" : "... \n" } ], "role" : "model" }, ... 
"url_context_metadata" : { "url_metadata" : [ { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/1234567890abcdef" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/abcdef1234567890" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : " YOUR_URL " , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/fedcba0987654321" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > } ] } } } Supported models gemini-2.5-pro gemini-2.5-flash gemini-2.5-flash-lite gemini-2.0-flash gemini-2.0-flash-live-001 Limitations The tool will consume up to 20 URLs per request for analysis. For best results during experimental phase, use the tool on standard web pages rather than multimedia content such as YouTube videos. During experimental phase, the tool is free to use. Billing to come later. The experimental release has the following quotas: 1500 queries per day per project for requests made through the Gemini API 100 queries per day per user in Google AI Studio Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
|
text_content/docs_url-context_41e99456.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/url-context#main-content
|
2 |
+
Title: URL context | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
URL context | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback URL context Experimental: The URL context tool is an experimental feature. Using the URL context tool, you can provide Gemini with URLs as additional context for your prompt. The model can then retrieve content from the URLs and use that content to inform and shape its response. This tool is useful for tasks like the following: Extracting key data points or talking points from articles Comparing information across multiple links Synthesizing data from several sources Answering questions based on the content of a specific page or pages Analyzing content for specific purposes (like writing a job description or creating test questions) This guide explains how to use the URL context tool in the Gemini API. Use URL context You can use the URL context tool in two main ways, by itself or in conjunction with Grounding with Google Search . URL Context Only You provide specific URLs that you want the model to analyze directly in your prompt. Example prompts: Summarize this document: YOUR_URLs Extract the key features from the product description on this page: YOUR_URLs Grounding with Google Search + URL Context You can also enable both URL context and Grounding with Google Search together. You can enter a prompt with or without URLs. The model may first search for relevant information and then use the URL context tool to read the content of the search results for a more in-depth understanding. Example prompts: Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute. Recommend 3 books for beginners to read to learn more about the latest YOUR_subject . Code examples with URL context only Python from google import genai from google.genai.types import Tool , GenerateContentConfig , GoogleSearch client = genai . Client () model_id = "gemini-2.5-flash" url_context_tool = Tool ( url_context = types . UrlContext ) response = client . models . generate_content ( model = model_id , contents = "Compare recipes from YOUR_URL1 and YOUR_URL2 " , config = GenerateContentConfig ( tools = [ url_context_tool ], response_modalities = [ "TEXT" ], ) ) for each in response . candidates [ 0 ] . content . parts : print ( each . text ) # get URLs retrieved for context print ( response . candidates [ 0 ] . url_context_metadata ) Javascript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "Compare recipes from YOUR_URL1 and YOUR_URL2 " , ], config : { tools : [{ urlContext : {}}], }, }); console . log ( response . text ); // To get URLs retrieved for context console . log ( response . candidates [ 0 ]. 
urlContextMetadata ) } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [ { "parts": [ {"text": "Compare recipes from YOUR_URL1 and YOUR_URL2 "} ] } ], "tools": [ { "url_context": {} } ] }' > result.json cat result.json Code examples with Grounding with Google Search Python from google import genai from google.genai.types import Tool , GenerateContentConfig , GoogleSearch client = genai . Client () model_id = "gemini-2.5-flash" tools = [] tools . append ( Tool ( url_context = types . UrlContext )) tools . append ( Tool ( google_search = types . GoogleSearch )) response = client . models . generate_content ( model = model_id , contents = "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute." , config = GenerateContentConfig ( tools = tools , response_modalities = [ "TEXT" ], ) ) for each in response . candidates [ 0 ] . content . parts : print ( each . text ) # get URLs retrieved for context print ( response . candidates [ 0 ] . url_context_metadata ) Javascript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute." , ], config : { tools : [{ urlContext : {}}, { googleSearch : {}}], }, }); console . log ( response . text ); // To get URLs retrieved for context console . log ( response . candidates [ 0 ]. urlContextMetadata ) } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [ { "parts": [ {"text": "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute."} ] } ], "tools": [ { "url_context": {} }, { "google_search": {} } ] }' > result.json cat result.json For more details about Grounding with Google Search, see the overview page. Contextual response The model's response will be based on the content it retrieved from the URLs. If the model retrieved content from URLs, the response will include url_context_metadata . Such a response might look something like the following (parts of the response have been omitted for brevity): { "candidates" : [ { "content" : { "parts" : [ { "text" : "... \n" } ], "role" : "model" }, ... 
"url_context_metadata" : { "url_metadata" : [ { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/1234567890abcdef" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/abcdef1234567890" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : " YOUR_URL " , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/fedcba0987654321" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > } ] } } } Supported models gemini-2.5-pro gemini-2.5-flash gemini-2.5-flash-lite gemini-2.0-flash gemini-2.0-flash-live-001 Limitations The tool will consume up to 20 URLs per request for analysis. For best results during experimental phase, use the tool on standard web pages rather than multimedia content such as YouTube videos. During experimental phase, the tool is free to use. Billing to come later. The experimental release has the following quotas: 1500 queries per day per project for requests made through the Gemini API 100 queries per day per user in Google AI Studio Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
|
text_content/docs_usage-policies_cd8a3783.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/usage-policies#abuse-monitoring
|
2 |
+
Title: Additional usage policies | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Additional usage policies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Additional usage policies This page includes additional usage policies for the Gemini API. Abuse monitoring Google is committed to the responsible development and use of AI. To ensure the safety and integrity of the Gemini API, we have created these policy guidelines. By using the Gemini API, you agree to the following guidelines, the Gemini API Additional Terms of Service and Generative AI Prohibited Use Policy . How We Monitor for Misuse Google's Trust and Safety Team employs a combination of automated and manual processes to detect potential misuse of the Gemini API and enforce our policies. Automated Detection: Automated systems scan API usage for violations of our Prohibited Use Policy, such as hate speech, harassment, sexually explicit content, and dangerous content. Manual Detection: If a project consistently exhibits suspicious activity, it may be flagged for manual review by authorized Google personnel. How We Handle Data To help with abuse monitoring, Google retains the following data for fifty-five (55) days: Prompts: The text prompts you submit to the API. Contextual Information: Any additional context you provide with your prompts. Output: The responses generated by the Gemini API. How We Investigate Potential Issues When prompts or model outputs are flagged by safety filters and abuse detection systems described above, authorized Google employees may assess the flagged content, and either confirm or correct the classification or determination based on predefined guidelines and policies. Data can be accessed for human review only by authorized Google employees via an internal governance assessment and review management platform. When data is logged for abuse monitoring, it is used solely for the purpose of policy enforcement and is not used to train or fine-tune any AI/ML models. Working with You on Policy Compliance If your use of Gemini doesn't align with our policies, we may take the following steps: Get in touch: We may reach out to you through email to understand your use case and explore ways to bring your usage into compliance. Temporary usage limits: We may limit your access to the Gemini API. Temporary suspension: We may temporarily pause your access to the Gemini API. Account closure: As a last resort, and for serious violations, we may permanently close your access to the Gemini API and other Google services. Scope These policy guidelines apply to the use of the Gemini API and AI Studio. Inline Preference Voting In Google AI Studio, you might occasionally see a side-by-side comparison of two different responses to your prompt. This is part of our Inline Preference Voting system. You'll be asked to choose which response you prefer. This helps us understand which model outputs users find most helpful. Why are we doing this? We're constantly working to improve our AI models and services. 
Your feedback through Inline Preference Voting helps us provide, improve, and develop Google products and services and machine learning technologies, including Google's enterprise features, products and services, consistent with the Gemini API Additional Terms of Service and Privacy Policy . What data is included in Feedback? To make informed decisions about our models, we collect certain data when you participate in Inline Preference Voting: Prompts and Responses: We record all prompts and responses, including any uploaded content, in the conversation you submitted feedback about. We also record the two response options that you selected from. This helps us understand the context of your preference. Your Vote: We record which response you preferred. This is the core of the feedback we're collecting. Usage Details: This includes information about which model generated the response and other technical and operational details about your usage of this feature. Your Privacy We take your privacy seriously. Google takes steps to protect your privacy as part of this process. This includes disconnecting this data from your Google Account, API key, and Cloud project before reviewers see or annotate it. Do not submit feedback on conversations that include sensitive, confidential, or personal information. Opting Out You'll have the option to skip the Inline Preference Voting when it appears. Thank you for helping us improve Google AI Studio! Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-03-24 UTC.
|
text_content/docs_video-understanding_d31a96cd.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/video-understanding#inline-video
|
2 |
+
Title: Video understanding | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Video understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Video understanding Gemini models can process videos, enabling many frontier developer use cases that would have historically required domain specific models. Some of Gemini's vision capabilities include the ability to: Describe, segment, and extract information from videos Answer questions about video content Refer to specific timestamps within a video Gemini was built to be multimodal from the ground up and we continue to push the frontier of what is possible. This guide shows how to use the Gemini API to generate text responses based on video inputs. Video input You can provide videos as input to Gemini in the following ways: Upload a video file using the File API before making a request to generateContent . Use this method for files larger than 20MB, videos longer than approximately 1 minute, or when you want to reuse the file across multiple requests. Pass inline video data with the request to generateContent . Use this method for smaller files (<20MB) and shorter durations. Include a YouTube URL directly in the prompt. Upload a video file You can use the Files API to upload a video file. Always use the Files API when the total request size (including the file, text prompt, system instructions, etc.) is larger than 20 MB, the video duration is significant, or if you intend to use the same video in multiple prompts. The File API accepts video file formats directly. This example uses the short NASA film "Jupiter's Great Red Spot Shrinks and Grows" . Credit: Goddard Space Flight Center (GSFC)/David Ladd (2018). "Jupiter's Great Red Spot Shrinks and Grows" is in the public domain and does not show identifiable people. ( NASA image and media usage guidelines. ) The following code downloads the sample video, uploads it using the File API, waits for it to be processed, and then uses the file reference in a generateContent request. Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp4" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ myfile , "Summarize this video. Then create a quiz with an answer key based on the information in this video." ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp4" , config : { mimeType : "video/mp4" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Summarize this video. Then create a quiz with an answer key based on the information in this video." , ]), }); console . log ( response . text ); } await main (); Go uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.mp4" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Summarize this video. Then create a quiz with an answer key based on the information in this video." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . 
MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST VIDEO_PATH = "path/to/sample.mp4" MIME_TYPE = $( file -b --mime-type " ${ VIDEO_PATH } " ) NUM_BYTES = $( wc -c < " ${ VIDEO_PATH } " ) DISPLAY_NAME = VIDEO tmp_header_file = upload-header.tmp echo "Starting file upload..." curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D ${ tmp_header_file } \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " echo "Uploading video data..." curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ VIDEO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri echo "File uploaded successfully. File URI: ${ file_uri } " # --- 3. Generate content using the uploaded video file --- echo "Generating content from video..." curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Summarize this video. Then create a quiz with an answer key based on the information in this video."}] }] }' 2 > /dev/null > response.json jq -r ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass video data inline Instead of uploading a video file using the File API, you can pass smaller videos directly in the request to generateContent . This is suitable for shorter videos under 20MB total request size. Here's an example of providing inline video data: Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64VideoFile = fs . readFileSync ( "path/to/small-sample.mp4" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "video/mp4" , data : base64VideoFile , }, }, { text : "Please summarize the video in 3 sentences." } ]; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : contents , }); console . log ( response . text ); REST Note: If you get an Argument list too long error, the base64 encoding of your file might be too long for the curl command line. Use the File API method instead for larger files. 
VIDEO_PATH = /path/to/your/video.mp4 if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"video/mp4", "data": "' $( base64 $B64FLAGS $VIDEO_PATH ) '" } }, {"text": "Please summarize the video in 3 sentences."} ] }] }' 2 > /dev/null Include a YouTube URL Preview: The YouTube URL feature is in preview and is available at no charge. Pricing and rate limits are likely to change. The Gemini API and AI Studio support YouTube URLs as a file data Part . You can include a YouTube URL with a prompt asking the model to summarize, translate, or otherwise interact with the video content. Limitations: For the free tier, you can't upload more than 8 hours of YouTube video per day. For the paid tier, there is no limit based on video length. For models before 2.5, you can upload only 1 video per request. For models after 2.5, you can upload a maximum of 10 videos per request. You can only upload public videos (not private or unlisted videos). The following example shows how to include a YouTube URL with a prompt: Python response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=9hE5-98ZeCg' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . GOOGLE_API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" }); const result = await model . generateContent ([ "Please summarize the video in 3 sentences." , { fileData : { fileUri : "https://www.youtube.com/watch?v=9hE5-98ZeCg" , }, }, ]); console . log ( result . response . text ()); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { genai . NewPartFromText ( "Please summarize the video in 3 sentences." ), genai . NewPartFromURI ( "https://www.youtube.com/watch?v=9hE5-98ZeCg" , "video/mp4" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Please summarize the video in 3 sentences."}, { "file_data": { "file_uri": "https://www.youtube.com/watch?v=9hE5-98ZeCg" } } ] }] }' 2 > /dev/null Refer to timestamps in the content You can ask questions about specific points in time within the video using timestamps of the form MM:SS . Python prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" # Adjusted timestamps for the NASA video JavaScript const prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . 
MIMEType ), // Adjusted timestamps for the NASA video genai . NewPartFromText ( "What are the examples given at 00:05 and " + "00:10 supposed to show us?" ), } REST PROMPT = "What are the examples given at 00:05 and 00:10 supposed to show us?" Transcribe video and provide visual descriptions The Gemini models can transcribe and provide visual descriptions of video content by processing both the audio track and visual frames. For visual descriptions, the model samples the video at a rate of 1 frame per second . This sampling rate may affect the level of detail in the descriptions, particularly for videos with rapidly changing visuals. Python prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." JavaScript const prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), genai . NewPartFromText ( "Transcribe the audio from this video, giving timestamps for salient events in the video. Also " + "provide visual descriptions." ), } REST PROMPT = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." Customize video processing You can customize video processing in the Gemini API by setting clipping intervals or providing custom frame rate sampling. Tip: Video clipping and frames per second (FPS) are supported by all models, but the quality is significantly higher from 2.5 series models. Set clipping intervals You can clip video by specifying videoMetadata with start and end offsets. Python response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=XEzRZ35urlk' ), video_metadata = types . VideoMetadata ( start_offset = '1250s' , end_offset = '1570s' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) Set a custom frame rate You can set custom frame rate sampling by passing an fps argument to videoMetadata . Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ), video_metadata = types . VideoMetadata ( fps = 5 ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) By default 1 frame per second (FPS) is sampled from the video. You might want to set low FPS (< 1) for long videos. This is especially useful for mostly static videos (e.g. lectures). If you want to capture more details in rapidly changing visuals, consider setting a higher FPS value. Supported video formats Gemini supports the following video format MIME types: video/mp4 video/mpeg video/mov video/avi video/x-flv video/mpg video/webm video/wmv video/3gpp Technical details about videos Supported models & context : All Gemini 2.0 and 2.5 models can process video data. 
Models with a 2M context window can process videos up to 2 hours long at default media resolution or 6 hours long at low media resolution, while models with a 1M context window can process videos up to 1 hour long at default media resolution or 3 hours long at low media resolution. File API processing : When using the File API, videos are sampled at 1 frame per second (FPS) and audio is processed at 1Kbps (single channel). Timestamps are added every second. These rates are subject to change in the future for improvements in inference. Token calculation : Each second of video is tokenized as follows: Individual frames (sampled at 1 FPS): If mediaResolution is set to low, frames are tokenized at 66 tokens per frame. Otherwise, frames are tokenized at 258 tokens per frame. Audio: 32 tokens per second. Metadata is also included. Total: Approximately 300 tokens per second of video at default media resolution, or 100 tokens per second of video at low media resolution. Timestamp format : When referring to specific moments in a video within your prompt, use the MM:SS format (e.g., 01:15 for 1 minute and 15 seconds). Best practices : Use only one video per prompt request for optimal results. If combining text and a single video, place the text prompt after the video part in the contents array. Be aware that fast action sequences might lose detail due to the 1 FPS sampling rate. Consider slowing down such clips if necessary. What's next This guide shows how to upload video files and generate text outputs from video inputs. To learn more, see the following resources: System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Files API : Learn more about uploading and managing files for use with Gemini. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC.
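As a quick sanity check on the per-second rates quoted above, the helper below estimates the video portion of a request's token count. It is only an approximation (metadata tokens are excluded), and the authoritative number always comes from the API's token counting endpoint.

def estimate_video_tokens(duration_seconds: int, low_media_resolution: bool = False) -> int:
    """Rough estimate: frames sampled at 1 FPS, 258 tokens per frame at default
    media resolution or 66 at low resolution, plus 32 audio tokens per second.
    Metadata tokens are not included."""
    tokens_per_frame = 66 if low_media_resolution else 258
    audio_tokens_per_second = 32
    return duration_seconds * (tokens_per_frame + audio_tokens_per_second)

# A 2-minute clip: about 34,800 tokens at default resolution, about 11,760 at low resolution.
print(estimate_video_tokens(120))
print(estimate_video_tokens(120, low_media_resolution=True))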
|
text_content/docs_video_618562e2.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/video#generate-from-text
|
2 |
+
Title: Generate video using Veo | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Generate video using Veo | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Generate video using Veo The Gemini API provides access to Veo 2 , Google's most capable video generation model to date. Veo generates videos in a wide range of cinematic and visual styles, capturing prompt nuance to render intricate details consistently across frames. This guide will help you get started with Veo using the Gemini API. For video prompting guidance, check out the Veo prompt guide section. Note: Veo is a paid feature and will not run in the Free tier. Visit the Pricing page for more details. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. To use Veo with the Google Gen AI SDKs, ensure that you have one of the following versions installed: Python v1.10.0 or later TypeScript and JavaScript v0.8.0 or later Go v1.0.0 or later Generate videos This section provides code examples for generating videos using text prompts and using images . Generate from text You can use the following code to generate videos with Veo: Python import time from google import genai from google.genai import types client = genai . Client () operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" ), ) while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , generated_video in enumerate ( operation . response . generated_videos ): client . files . download ( file = generated_video . video ) generated_video . video . save ( f "video { n } .mp4" ) # save the video JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { let operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { personGeneration : "dont_allow" , aspectRatio : "16:9" , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` ); // append your API key const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , PersonGeneration : "dont_allow" , } operation , _ := client . Models . 
GenerateVideos ( ctx , "veo-2.0-generate-001" , "Panning wide shot of a calico kitten sleeping in the sunshine" , nil , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } } REST # Use curl to send a POST request to the predictLongRunning endpoint. # The request body includes the prompt for video generation. curl " ${ BASE_URL } /models/veo-2.0-generate-001:predictLongRunning" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X "POST" \ -d '{ "instances": [{ "prompt": "Panning wide shot of a calico kitten sleeping in the sunshine" } ], "parameters": { "aspectRatio": "16:9", "personGeneration": "dont_allow", } }' | tee result.json | jq .name | sed 's/"//g' > op_name # Obtain operation name to download video. op_name = $( cat op_name ) # Check against status of operation. while true ; do is_done = $( curl -H "x-goog-api-key: $GEMINI_API_KEY " " ${ BASE_URL } / ${ op_name } " | tee op_check.json | jq .done ) if [ " ${ is_done } " = "true" ] ; then cat op_check.json echo "** Attach API_KEY to download video, or examine error message." break fi echo "** Video ${ op_name } has not downloaded yet! Check again after 5 seconds..." # Wait for 5 seoncds to check again. sleep 5 done This code takes about 2-3 minutes to run, though it may take longer if resources are constrained. Once it's done running, you should see a video that looks something like this: If you see an error message instead of a video, this means that resources are constrained and your request couldn't be completed. In this case, run the code again. Generated videos are stored on the server for 2 days, after which they are removed. If you want to save a local copy of your generated video, you must run result() and save() within 2 days of generation. Generate from images You can also generate videos using images. The following code generates an image using Imagen, then uses the generated image as the starting frame for the generated video. First, generate an image using Imagen : Python prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , imagen = client . models . generate_images ( model = "imagen-3.0-generate-002" , prompt = prompt , config = types . GenerateImagesConfig ( aspect_ratio = "16:9" , number_of_images = 1 ) ) imagen . generated_images [ 0 ] . image JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : "imagen-3.0-generate-002" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { numberOfImages : 1 , }, }); // you'll pass response.generatedImages[0].image.imageBytes to Veo Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { AspectRatio : "16:9" , NumberOfImages : 1 , } response , _ := client . Models . 
GenerateImages ( ctx , "imagen-3.0-generate-002" , "Panning wide shot of a calico kitten sleeping in the sunshine" , config , ) // you'll pass response.GeneratedImages[0].Image to Veo } Then, generate a video using the resulting image as the first frame: Python operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = prompt , image = imagen . generated_images [ 0 ] . image , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" number_of_videos = 2 ), ) # Wait for videos to generate while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , video in enumerate ( operation . response . generated_videos ): fname = f 'with_image_input { n } .mp4' print ( fname ) client . files . download ( file = video . video ) video . video . save ( fname ) JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { // get image bytes from Imagen, as shown above let operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , image : { imageBytes : response . generatedImages [ 0 ]. image . imageBytes , // response from Imagen mimeType : "image/png" , }, config : { aspectRatio : "16:9" , numberOfVideos : 2 , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` , // append your API key ); const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go image := response . GeneratedImages [ 0 ]. Image videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , NumberOfVideos : 2 , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "A dramatic scene based on the input image" , image , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_with_image_input_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } Veo model parameters (Naming conventions vary by programming language.) prompt : The text prompt for the video. When present, the image parameter is optional. image : The image to use as the first frame for the video. When present, the prompt parameter is optional. negativePrompt : Text string that describes anything you want to discourage the model from generating aspectRatio : Changes the aspect ratio of the generated video. Supported values are "16:9" and "9:16" . The default is "16:9" . personGeneration : Allow the model to generate videos of people. The following values are supported: Text-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. "allow_all" : Generate videos that include adults and children. 
Image-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. See Limitations . numberOfVideos : Output videos requested, either 1 or 2 . durationSeconds : Length of each output video in seconds, between 5 and 8 . enhance_prompt : Enable or disable the prompt rewriter. Enabled by default. Specifications Modalities Text-to-video generation Image-to-video generation Request latency Min: 11 seconds Max: 6 minutes (during peak hours) Variable length generation 5-8 seconds Resolution 720p Frame rate 24fps Aspect ratio 16:9 - landscape 9:16 - portrait Input languages (text-to-video) English Limitations Image-to-video personGeneration is not allowed in EU, UK, CH, MENA locations Text-to-video personGeneration: "allow_all" is not allowed in EU, UK, CH, MENA locations Note: Check out the Models , Pricing , and Rate limits pages for more usage limitations for Veo. Videos created by Veo are watermarked using SynthID , our tool for watermarking and identifying AI-generated content, and are passed through safety filters and memorization checking processes that help mitigate privacy, copyright and bias risks. Things to try To get the most out of Veo, incorporate video-specific terminology into your prompts. Veo understands a wide range of terms related to: Shot composition: Specify the framing and number of subjects in the shot (e.g., "single shot," "two shot," "over-the-shoulder shot"). Camera positioning and movement: Control the camera's location and movement using terms like "eye level," "high angle," "worms eye," "dolly shot," "zoom shot," "pan shot," and "tracking shot." Focus and lens effects: Use terms like "shallow focus," "deep focus," "soft focus," "macro lens," and "wide-angle lens" to achieve specific visual effects. Overall style and subject: Guide Veo's creative direction by specifying styles like "sci-fi," "romantic comedy," "action movie," or "animation." You can also describe the subjects and backgrounds you want, such as "cityscape," "nature," "vehicles," or "animals." Veo prompt guide This section of the Veo guide contains examples of videos you can create using Veo, and shows you how to modify prompts to produce distinct results. Safety filters Veo applies safety filters across Gemini to help ensure that generated videos and uploaded photos don't contain offensive content. Prompts that violate our terms and guidelines are blocked. Prompt writing basics Good prompts are descriptive and clear. To get your generated video as close as possible to what you want, start with identifying your core idea, and then refine your idea by adding keywords and modifiers. The following elements should be included in your prompt: Subject : The object, person, animal, or scenery that you want in your video. Context : The background or context in which the subject is placed. Action : What the subject is doing (for example, walking , running , or turning their head ). Style : This can be general or very specific. Consider using specific film style keywords, such as horror film , film noir , or animated styles like cartoon style. Camera motion : [Optional] What the camera is doing, such as aerial view , eye-level , top-down shot , or low-angle shot . Composition : [Optional] How the shot is framed, such as wide shot , close-up , or extreme close-up . Ambiance : [Optional] How the color and light contribute to the scene, such as blue tones , night , or warm tones . 
More tips for writing prompts The following tips help you write prompts that generate your videos: Use descriptive language : Use adjectives and adverbs to paint a clear picture for Veo. Provide context : If necessary, include background information to help your model understand what you want. Reference specific artistic styles : If you have a particular aesthetic in mind, reference specific artistic styles or art movements. Utilize prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. For more information, visit Introduction to prompt design . Enhance the facial details in your personal and group images : Specify facial details as a focus of the photo like using the word portrait in the prompt. Example prompts and output This section presents several prompts, highlighting how descriptive details can elevate the outcome of each video. Icicles This video demonstrates how you can use the elements of prompt writing basics in your prompt. Prompt Generated output Close up shot (composition) of melting icicles (subject) on a frozen rock wall (context) with cool blue tones (ambiance), zoomed in (camera motion) maintaining close-up detail of water drips (action). Man on the phone These videos demonstrate how you can revise your prompt with increasingly specific details to get Veo to refine the output to your liking. Prompt Generated output Analysis The camera dollies to show a close up of a desperate man in a green trench coat. He's making a call on a rotary-style wall phone with a green neon light. It looks like a movie scene. This is the first generated video based on the prompt. A close-up cinematic shot follows a desperate man in a weathered green trench coat as he dials a rotary phone mounted on a gritty brick wall, bathed in the eerie glow of a green neon sign. The camera dollies in, revealing the tension in his jaw and the desperation etched on his face as he struggles to make the call. The shallow depth of field focuses on his furrowed brow and the black rotary phone, blurring the background into a sea of neon colors and indistinct shadows, creating a sense of urgency and isolation. A more detailed prompt results in a video that is more focused with a richer environment. A video with smooth motion that dollies in on a desperate man in a green trench coat, using a vintage rotary phone against a wall bathed in an eerie green neon glow. The camera starts from a medium distance, slowly moving closer to the man's face, revealing his frantic expression and the sweat on his brow as he urgently dials the phone. The focus is on the man's hands, his fingers fumbling with the dial as he desperately tries to connect. The green neon light casts long shadows on the wall, adding to the tense atmosphere. The scene is framed to emphasize the isolation and desperation of the man, highlighting the stark contrast between the vibrant glow of the neon and the man's grim determination. Adding more detail gives the subject a realistic expression and creates an intense and vibrant scene. Snow leopard This example demonstrates the output Veo might generate for a simple prompt. Prompt Generated output A cute creature with snow leopard-like fur is walking in winter forest, 3D cartoon style render. Running snow leopard This prompt has more detail and demonstrates generated output that might be closer to what you want in your video. Prompt Generated output Create a short 3D animated scene in a joyful cartoon style. 
A cute creature with snow leopard-like fur, large expressive eyes, and a friendly, rounded form happily prances through a whimsical winter forest. The scene should feature rounded, snow-covered trees, gentle falling snowflakes, and warm sunlight filtering through the branches. The creature's bouncy movements and wide smile should convey pure delight. Aim for an upbeat, heartwarming tone with bright, cheerful colors and playful animation. Examples by writing elements These examples show you how to refine your prompts by each basic element. Subject This example shows you how to specify a subject description. Subject description Prompt Generated output The description can include a subject, or multiple subjects and actions. Here, our subject is "white concrete apartment building." An architectural rendering of a white concrete apartment building with flowing organic shapes, seamlessly blending with lush greenery and futuristic elements Context This example shows you how to specify context. Context Prompt Generated output The background or context in which the subject will be placed is very important. Try placing your subject in a variety of backgrounds like on a busy street, or in outer space. A satellite floating through outer space with the moon and some stars in the background. Action This example shows you how to specify action. Action Prompt Generated output What is the subject doing like walking, running, or turning their head. A wide shot of a woman walking along the beach, looking content and relaxed towards the horizon at sunset. Style This example shows you how to specify style. Style Prompt Generated output You can add keywords to improve generation quality and steer it closer to intended style, such as shallow depth of field, movie still, minimalistic, surreal, vintage, futuristic, or double-exposure. Film noir style, man and woman walk on the street, mystery, cinematic, black and white. Camera motion This example shows you how to specify camera motion. Camera motion Prompt Generated output Options for camera motion include POV shot, aerial view, tracking drone view, or tracking shot. A POV shot from a vintage car driving in the rain, Canada at night, cinematic. Composition This example shows you how to specify composition. Composition Prompt Generated output How the shot is framed (wide shot, close-up, low angle). Extreme close-up of a an eye with city reflected in it. Create a video of a wide shot of surfer walking on a beach with a surfboard, beautiful sunset, cinematic. Ambiance This example shows you how to specify ambiance. Ambiance Prompt Generated output Color palettes play a vital role in photography, influencing the mood and conveying intended emotions. Try things like "muted orange warm tones," "natural light," "sunrise" or "sunset". For example, a warm, golden palette can infuse a romantic and atmospheric feel into a photograph. A close-up of a girl holding adorable golden retriever puppy in the park, sunlight. Cinematic close-up shot of a sad woman riding a bus in the rain, cool blue tones, sad mood. Use reference images to generate videos You can bring images to life by using Veo's image-to-video capability. You can use existing assets, or try Imagen to generate something new. Prompt Generated output Bunny with a chocolate candy bar. Bunny runs away. Negative prompts Negative prompts can be a powerful tool to help specify elements you don't want in the video. Describe what you want to discourage the model from generating after the phrase "Negative prompt". 
Follow these tips: ❌ Don't use instructive language or words like no or don't . For example, "No walls" or "don't show walls". ✅ Do describe what you don't want to see. For example, "wall, frame", which means that you don't want a wall or a frame in the video. Prompt Generated output Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. With negative prompt - urban background, man-made structures, dark, stormy, or threatening atmosphere. Aspect ratios Gemini Veo video generation supports the following two aspect ratios: Aspect ratio Description Widescreen or 16:9 The most common aspect ratio for televisions, monitors, and mobile phone screens (landscape). Use this when you want to capture more of the background, like in scenic landscapes. Portrait or 9:16 Rotated widescreen. This aspect ratio has been popularized by short-form video applications, such as YouTube Shorts. Use this for portraits or tall objects with strong vertical orientations, such as buildings, trees, or waterfalls. Widescreen This prompt is an example of the widescreen aspect ratio of 16:9. Prompt Generated output Create a video with a tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows. Portrait This prompt is an example of the portrait aspect ratio of 9:16. Prompt Generated output Create a video highlighting the smooth motion of a majestic Hawaiian waterfall within a lush rainforest. Focus on realistic water flow, detailed foliage, and natural lighting to convey tranquility. Capture the rushing water, misty atmosphere, and dappled sunlight filtering through the dense canopy. Use smooth, cinematic camera movements to showcase the waterfall and its surroundings. Aim for a peaceful, realistic tone, transporting the viewer to the serene beauty of the Hawaiian rainforest. What's next Gain more experience generating AI videos with the Veo Colab . Check out cool examples using Veo 2 on the Google DeepMind site. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
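The negative prompt and aspect ratio described above are passed as generation configuration rather than as part of the prompt text. The Python sketch below shows one way this could look with the google-genai SDK; the model ID (veo-2.0-generate-001), the GenerateVideosConfig fields, and the polling/download steps are assumptions about the general shape of the video generation API rather than an excerpt from this page, so verify them against the current Veo reference.

from google import genai
from google.genai import types
import time

client = genai.Client()

# Request a widescreen clip and discourage unwanted elements via a negative prompt.
# Model name and config field names are assumptions; check the Veo docs.
operation = client.models.generate_videos(
    model="veo-2.0-generate-001",  # assumed model ID
    prompt="A large, solitary oak tree with leaves blowing vigorously in a strong wind.",
    config=types.GenerateVideosConfig(
        aspect_ratio="16:9",
        negative_prompt="urban background, man-made structures, dark, stormy atmosphere",
    ),
)

# Video generation is a long-running operation, so poll until it completes.
while not operation.done:
    time.sleep(10)
    operation = client.operations.get(operation)

# Download and save the first generated video.
video = operation.response.generated_videos[0]
client.files.download(file=video.video)
video.video.save("oak_tree.mp4")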
|
text_content/docs_vision_1564825b.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/vision#object-detection
|
2 |
+
Title: Image understanding | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. 
Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . 
text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . 
read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom instructions, such as: "Show bounding boxes of all green objects in this image". 
It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . 
size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
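The token calculation rules above lend themselves to a quick back-of-the-envelope estimate. The helper below is a rough sketch for Gemini 2.0/2.5 models only: it assumes simple ceiling-division tiling into 768x768 tiles with no intermediate rescaling, which the page does not fully specify, so treat the result as an approximation rather than an exact billing figure.

import math

def estimate_image_tokens(width: int, height: int) -> int:
    # Small images (both dimensions <= 384 px) cost a flat 258 tokens.
    if width <= 384 and height <= 384:
        return 258
    # Larger images are tiled into 768x768 crops, 258 tokens per tile.
    # Assumes straightforward tiling with no rescaling step.
    tiles = math.ceil(width / 768) * math.ceil(height / 768)
    return tiles * 258

print(estimate_image_tokens(300, 200))    # 258
print(estimate_image_tokens(1920, 1080))  # 3 x 2 tiles -> 1548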
|
text_content/docs_vision_3b2d50f0.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/vision
|
2 |
+
Title: Image understanding | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. 
Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . 
text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . 
read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom instructions, such as: "Show bounding boxes of all green objects in this image". 
It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . 
size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
|
text_content/docs_vision_879c35c5.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/vision#inline-image
|
2 |
+
Title: Image understanding | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. 
Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . 
text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . 
read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom instructions, such as: "Show bounding boxes of all green objects in this image". 
It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . 
size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
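Supplementary note: the token-calculation rules in the Limitations section above can be turned into a rough pre-flight estimate. The sketch below (Python) applies the Gemini 2.0/2.5 rule of 258 tokens for images with both dimensions at or under 384 pixels, and 768x768 tiles at 258 tokens each otherwise. The exact cropping and scaling strategy is not spelled out on this page, so the ceiling-division tile count and the helper name estimate_image_tokens are illustrative assumptions, not an official formula.

import math

def estimate_image_tokens(width: int, height: int) -> int:
    # Small images (both dimensions <= 384 px) cost a flat 258 tokens.
    if width <= 384 and height <= 384:
        return 258
    # Assumption: larger images are covered by 768x768 tiles counted with
    # ceiling division; each tile costs 258 tokens. Treat as an upper-bound sketch.
    tiles = math.ceil(width / 768) * math.ceil(height / 768)
    return tiles * 258

print(estimate_image_tokens(300, 200))    # 258
print(estimate_image_tokens(1920, 1080))  # 6 tiles -> 1548 (estimate only)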
|
text_content/function-calling_tutorial_491a2277.txt
ADDED
@@ -0,0 +1,5 @@
1 |
+
URL: https://ai.google.dev/gemini-api/docs/function-calling/tutorial#step-4
|
2 |
+
Title: Function calling with the Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. 
Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . 
generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. 
Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... 
Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" : "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . 
OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. 
""" return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. 
The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ "temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . 
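The Python compositional example above relies on the SDK's automatic function calling, while the JavaScript version drives the loop by hand. For readers who want the manual loop in Python as well, here is a hedged sketch that mirrors the JavaScript flow using only calls shown elsewhere on this page (automatic function calling disabled, results returned with types.Part.from_function_response). The dummy weather and thermostat functions are stand-ins for real integrations, and the declarations are assumptions modeled on the example schemas, not an official sample.

from google import genai
from google.genai import types

client = genai.Client()

def get_weather_forecast(location: str) -> dict:
    return {"temperature": 25, "unit": "celsius"}  # dummy response

def set_thermostat_temperature(temperature: int) -> dict:
    return {"status": "success"}  # dummy response

tool_functions = {
    "get_weather_forecast": get_weather_forecast,
    "set_thermostat_temperature": set_thermostat_temperature,
}

# Declarations modeled on the schemas used in the example above
tools = types.Tool(function_declarations=[
    {
        "name": "get_weather_forecast",
        "description": "Gets the current weather temperature for a given location.",
        "parameters": {"type": "object",
                       "properties": {"location": {"type": "string"}},
                       "required": ["location"]},
    },
    {
        "name": "set_thermostat_temperature",
        "description": "Sets the thermostat to a desired temperature.",
        "parameters": {"type": "object",
                       "properties": {"temperature": {"type": "integer"}},
                       "required": ["temperature"]},
    },
])

config = types.GenerateContentConfig(
    tools=[tools],
    # Disable automatic function calling so the loop below handles execution.
    automatic_function_calling=types.AutomaticFunctionCallingConfig(disable=True),
)

contents = [types.Content(role="user", parts=[types.Part(
    text="If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C.")])]

# Loop until the model stops requesting function calls.
while True:
    response = client.models.generate_content(
        model="gemini-2.5-flash", contents=contents, config=config)
    if not response.function_calls:
        print(response.text)  # final, user-facing answer
        break
    call = response.function_calls[0]
    result = tool_functions[call.name](**call.args)
    # Keep the model's function-call turn (including any thought signatures),
    # then append the function result as a user turn.
    contents.append(response.candidates[0].content)
    contents.append(types.Content(role="user", parts=[
        types.Part.from_function_response(name=call.name, response={"result": result})
    ]))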
Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. 
To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config , ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. ] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. 
]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . 
import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" ] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. 
Return informative error messages that the model can use to generate helpful responses to the user (a short sketch illustrating several of these practices appears at the end of this page). Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of their descriptions, or break complex tasks down into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC.
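To make the best practices above concrete, here is a minimal sketch (not taken from this page) that combines a descriptive function name, a strongly typed enum parameter, a low temperature, and an informative error result the model can relay back to the user. The set_thermostat_mode declaration, its mode values, and the handler function are hypothetical examples, not part of the official documentation. Python
from google import genai
from google.genai import types

# Hypothetical function declaration: descriptive name, enum-typed parameter,
# and clear descriptions so the model can pick the right arguments.
set_thermostat_mode = {
    "name": "set_thermostat_mode",
    "description": "Sets the thermostat to one of the allowed operating modes.",
    "parameters": {
        "type": "object",
        "properties": {
            "mode": {
                "type": "string",
                "enum": ["heat", "cool", "off"],  # limited set of valid values
                "description": "The operating mode to switch to.",
            },
        },
        "required": ["mode"],
    },
}

def handle_set_thermostat_mode(mode: str) -> dict:
    """Executes the call and returns an informative result the model can relay."""
    if mode not in ("heat", "cool", "off"):
        # Return an informative error instead of raising, so the model can recover.
        return {"error": f"Unknown mode '{mode}'. Valid modes are heat, cool, off."}
    return {"status": "ok", "mode": mode}

client = genai.Client()
config = types.GenerateContentConfig(
    temperature=0,  # low temperature for more deterministic tool calls
    tools=[types.Tool(function_declarations=[set_thermostat_mode])],
)
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Please switch the thermostat to cooling.",
    config=config,
)
# Inspect the function call the model proposed before executing it.
print(response.candidates[0].content.parts[0].function_call)
In a real application you would validate the proposed call (for example, confirm consequential actions with the user), run the handler, and send the result back to the model as a function response part.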
|
text_content/models_experimental-models_b7a92f62.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-pro
|
2 |
+
Title: Gemini models | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. 
To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
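As a minimal sketch of how the version name patterns above map onto actual requests, the snippet below (assuming the google-genai Python SDK) uses a latest-stable alias, a pinned stable version, and a preview version; all model IDs are taken from this page. Python
from google import genai

client = genai.Client()

# Latest stable alias for a generation/variation: <model>-<generation>-<variation>
print(client.models.generate_content(model="gemini-2.0-flash", contents="Hello").text)

# Pinned stable version, recommended for production: <model>-<generation>-<variation>-<version>
print(client.models.generate_content(model="gemini-2.0-flash-001", contents="Hello").text)

# Preview version: tighter rate limits and may change before a stable release
print(client.models.generate_content(model="gemini-2.5-pro-preview-06-05", contents="Hello").text)

# Enumerate the models available to your API key.
for m in client.models.list():
    print(m.name)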
|
text_content/models_experimental-models_ba05a1bd.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-1.5-pro
|
2 |
+
Title: Gemini models | Gemini API | Google AI for Developers
|
3 |
+
==================================================
|
4 |
+
|
5 |
+
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. 
To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
text_content/models_experimental-models_de2a19a1.txt
ADDED
@@ -0,0 +1,5 @@
1 + URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash-native-audio
2 + Title: Gemini models | Gemini API | Google AI for Developers
3 + ==================================================
4 +
5 +
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. 
To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
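As a concrete illustration of the embedding variants listed above, the following is a minimal sketch (again assuming the google-genai Python SDK, which the crawled page does not itself show): text-embedding-004 returns fixed 768-dimensional vectors, while gemini-embedding-exp-03-07 supports the elastic 3072/1536/768 output sizes.

    # Minimal sketch, assuming `pip install google-genai` and an API key in the
    # environment; model names and dimension sizes come from the tables above.
    from google import genai
    from google.genai import types

    client = genai.Client()

    # Fixed 768-dimensional output, input limited to 2,048 tokens.
    fixed = client.models.embed_content(
        model="text-embedding-004",
        contents="Text embeddings measure the relatedness of strings.",
    )

    # Elastic output size: 3072, 1536, or 768 dimensions.
    elastic = client.models.embed_content(
        model="gemini-embedding-exp-03-07",
        contents="Text embeddings measure the relatedness of strings.",
        config=types.EmbedContentConfig(output_dimensionality=768),
    )

    print(len(fixed.embeddings[0].values), len(elastic.embeddings[0].values))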
text_content/models_experimental-models_df2e8560.txt
ADDED
@@ -0,0 +1,5 @@
1 + URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#token-size
2 + Title: Gemini models | Gemini API | Google AI for Developers
3 + ==================================================
4 +
5 +
text_content/models_gemini_571c1682.txt
ADDED
@@ -0,0 +1,5 @@
1 + URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash
2 + Title: Gemini models | Gemini API | Google AI for Developers
3 + ==================================================
4 +
5 +
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. 
To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
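The note at the top of this page says thinking is on by default for 2.5 Pro and 2.5 Flash, and that a thinking budget can be configured on Flash. The following is a hedged sketch, again assuming the google-genai Python SDK; the ThinkingConfig field names are taken from that SDK and may differ in other client libraries, so check the Thinking guide referenced above.

```python
# Sketch only: assumes google-genai's GenerateContentConfig / ThinkingConfig types.
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_API_KEY")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Outline a three-step plan for migrating from gemini-2.0-flash.",
    config=types.GenerateContentConfig(
        # Cap the tokens the model may spend on internal reasoning.
        # Omit thinking_config entirely to let the model think as needed.
        thinking_config=types.ThinkingConfig(thinking_budget=1024),
    ),
)
print(response.text)
```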
text_content/models_gemini_65cc88ec.txt
ADDED
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash-preview-image-generation
+Title: Gemini models | Gemini API | Google AI for Developers
+==================================================
+
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. 
To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
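The Text Embedding section above describes text-embedding-004 as returning 768-dimensional vectors used to measure the relatedness of text strings. As a hedged illustration (google-genai Python SDK and NumPy assumed; the embed_content call shape may vary by SDK version), two strings can be embedded and compared with cosine similarity:

```python
# Sketch: assumes google-genai's embed_content; NumPy is used only for the similarity math.
import numpy as np
from google import genai

client = genai.Client(api_key="YOUR_API_KEY")

result = client.models.embed_content(
    model="text-embedding-004",
    contents=[
        "Which Gemini model version should production apps pin?",
        "How do I choose between stable and preview Gemini models?",
    ],
)

# text-embedding-004 produces 768-dimensional vectors (see the table above).
vec_a, vec_b = (np.array(e.values) for e in result.embeddings)
cosine = float(vec_a @ vec_b / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
print(f"cosine similarity: {cosine:.3f}")
```

Higher cosine values indicate more closely related strings; per the tables above, the experimental gemini-embedding-exp-03-07 model returns elastic output dimensions of 3072, 1536, or 768 instead of the fixed 768.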
text_content/models_gemini_6932841a.txt
ADDED
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models/gemini#token-size
+Title: Gemini models | Gemini API | Google AI for Developers
+==================================================
+
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. 
To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05. Experimental Points to an experimental model which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05. Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section, along with their replacement versions: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi )
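To make the version name patterns above concrete, the following is a minimal sketch of selecting a model by name with the google-genai Python SDK. The client calls and the placeholder API key are assumptions about the current SDK, not part of the page above; check the SDK quickstart for the exact API in your installed version.

# Minimal sketch: choosing a model by version name pattern (assumes the google-genai SDK).
from google import genai

client = genai.Client(api_key="YOUR_API_KEY")  # placeholder key, assumption

# Latest stable alias: <model>-<generation>-<variation>
latest = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Summarize the difference between stable and preview models.",
)

# Pinned stable version: <model>-<generation>-<variation>-<version>
pinned = client.models.generate_content(
    model="gemini-2.0-flash-001",
    contents="Same request, pinned so model behavior doesn't change between releases.",
)

# Token limits are expressed in tokens (roughly 4 characters each), so you can
# count tokens before sending a large prompt.
usage = client.models.count_tokens(
    model="gemini-2.0-flash",
    contents="How many tokens is this prompt?",
)

print(latest.text)
print(pinned.text)
print(usage.total_tokens)

Pinning a specific stable version (the second call) is the pattern the page recommends for production apps, since the alias form can silently move to a newer release.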
|
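For the embedding models listed in the page above (text-embedding-004 and the experimental gemini-embedding-exp-03-07), here is a minimal sketch of generating embeddings with the google-genai Python SDK. The call signature and the output_dimensionality option are assumptions to verify against the embeddings guide.

# Minimal sketch: text embeddings (assumes the google-genai SDK; model names from the page above).
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_API_KEY")  # placeholder key, assumption

# text-embedding-004 returns 768-dimensional vectors for inputs up to 2,048 tokens.
result = client.models.embed_content(
    model="text-embedding-004",
    contents="Measure how related two strings are.",
)
print(len(result.embeddings[0].values))  # expected: 768

# gemini-embedding-exp-03-07 supports elastic output sizes (3072, 1536, or 768).
elastic = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="Same text, smaller vector.",
    config=types.EmbedContentConfig(output_dimensionality=768),
)
print(len(elastic.embeddings[0].values))  # expected: 768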
text_content/models_gemini_77ab48aa.txt
ADDED
@@ -0,0 +1,5 @@
1 + URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-pro
2 + Title: Gemini models | Gemini API | Google AI for Developers
3 + ==================================================
4 +
5 +
|
text_content/models_gemini_ff799621.txt
ADDED
@@ -0,0 +1,5 @@
1 + URL: https://ai.google.dev/gemini-api/docs/models/gemini#veo-2
2 + Title: Gemini models | Gemini API | Google AI for Developers
3 + ==================================================
4 +
5 +
Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Gemini 2.5 Pro Preview Text-to-Speech

Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Its rate limits are more restricted because it is an experimental/preview model. Try it in Google AI Studio.

Model details
Model code: models/gemini-2.5-pro-preview-tts
Supported data types: Input: Text | Output: Audio
Token limits [*]: Input: 8,000 | Output: 16,000
Capabilities: Audio generation is supported; structured outputs, caching, tuning, function calling, code execution, search, the Live API, and thinking are not supported.
Versions (read the model version patterns for more details): gemini-2.5-pro-preview-tts
Latest update: May 2025

Gemini 2.0 Flash

Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try it in Google AI Studio.

Model details
Model code: models/gemini-2.0-flash
Supported data types: Inputs: Audio, images, video, and text | Output: Text
Token limits [*]: Input: 1,048,576 | Output: 8,192
Capabilities: Structured outputs, caching, function calling, code execution, search, the Live API, and the Batch API are supported; thinking is experimental; tuning, image generation, and audio generation are not supported.
Versions (read the model version patterns for more details): Latest: gemini-2.0-flash | Stable: gemini-2.0-flash-001 | Experimental: gemini-2.0-flash-exp
Latest update: February 2025
Knowledge cutoff: August 2024

Gemini 2.0 Flash Preview Image Generation

Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try it in Google AI Studio.

Model details
Model code: models/gemini-2.0-flash-preview-image-generation
Supported data types: Inputs: Audio, images, video, and text | Output: Text and images
Token limits [*]: Input: 32,000 | Output: 8,192
Capabilities: Structured outputs, caching, and image generation are supported; tuning, function calling, code execution, search, audio generation, the Live API, and thinking are not supported.
Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-preview-image-generation
Note: gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, the Middle East, and Africa.
Latest update: May 2025
Knowledge cutoff: August 2024
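A minimal sketch of conversational image generation with this preview model, assuming the google-genai Python SDK. For this model the response modalities need to include both text and images, and the returned parts are a mix of text and inline image data; the prompt and output file name below are illustrative.

# Sketch: asking the image-generation preview model for text plus an image.
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Generate an image of a lighthouse at sunset, then describe it in one sentence.",
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

# Walk the returned parts: some carry text, others carry inline image bytes.
for part in response.candidates[0].content.parts:
    if part.text:
        print(part.text)
    elif part.inline_data:
        with open("lighthouse.png", "wb") as f:
            f.write(part.inline_data.data)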
Gemini 2.0 Flash-Lite

A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try it in Google AI Studio.

Model details
Model code: models/gemini-2.0-flash-lite
Supported data types: Inputs: Audio, images, video, and text | Output: Text
Token limits [*]: Input: 1,048,576 | Output: 8,192
Capabilities: Structured outputs, caching, function calling, and the Batch API are supported; tuning, code execution, search, image generation, audio generation, and the Live API are not supported.
Versions (read the model version patterns for more details): Latest: gemini-2.0-flash-lite | Stable: gemini-2.0-flash-lite-001
Latest update: February 2025
Knowledge cutoff: August 2024

Gemini 1.5 Flash

Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try it in Google AI Studio.

Model details
Model code: models/gemini-1.5-flash
Supported data types: Inputs: Audio, images, video, and text | Output: Text
Token limits [*]: Input: 1,048,576 | Output: 8,192
Audio/visual specs: Maximum number of images per prompt: 3,600 | Maximum video length: 1 hour | Maximum audio length: approximately 9.5 hours
Capabilities: System instructions, JSON mode, JSON schema, adjustable safety settings, caching, tuning, function calling, and code execution are supported; the Live API is not supported.
Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-latest | Latest stable: gemini-1.5-flash | Stable: gemini-1.5-flash-001, gemini-1.5-flash-002
Latest update: September 2024

Gemini 1.5 Flash-8B

Gemini 1.5 Flash-8B is a small model designed for lower-intelligence tasks. Try it in Google AI Studio.

Model details
Model code: models/gemini-1.5-flash-8b
Supported data types: Inputs: Audio, images, video, and text | Output: Text
Token limits [*]: Input: 1,048,576 | Output: 8,192
Audio/visual specs: Maximum number of images per prompt: 3,600 | Maximum video length: 1 hour | Maximum audio length: approximately 9.5 hours
Capabilities: System instructions, JSON mode, JSON schema, adjustable safety settings, caching, tuning, function calling, and code execution are supported; the Live API is not supported.
Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-8b-latest | Latest stable: gemini-1.5-flash-8b | Stable: gemini-1.5-flash-8b-001
Latest update: October 2024

Gemini 1.5 Pro

Try Gemini 2.5 Pro Preview, our most advanced Gemini model to date.

Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try it in Google AI Studio.

Model details
Model code: models/gemini-1.5-pro
Supported data types: Inputs: Audio, images, video, and text | Output: Text
Token limits [*]: Input: 2,097,152 | Output: 8,192
Audio/visual specs: Maximum number of images per prompt: 7,200 | Maximum video length: 2 hours | Maximum audio length: approximately 19 hours
Capabilities: System instructions, JSON mode, JSON schema, adjustable safety settings, caching, function calling, and code execution are supported; tuning and the Live API are not supported.
Versions (read the model version patterns for more details): Latest: gemini-1.5-pro-latest | Latest stable: gemini-1.5-pro | Stable: gemini-1.5-pro-001, gemini-1.5-pro-002
Latest update: September 2024
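Because the 1.5-generation models accept very large inputs, it can help to count tokens before sending a request. The following is a minimal sketch using the google-genai SDK's count_tokens call; the local file name is hypothetical, and the 2,097,152 figure is the Gemini 1.5 Pro input limit from the details above.

# Sketch: checking a prompt against a model's input token limit before sending it.
from google import genai

client = genai.Client()

prompt = open("large_codebase_dump.txt").read()  # hypothetical local file
usage = client.models.count_tokens(model="gemini-1.5-pro", contents=prompt)

if usage.total_tokens <= 2_097_152:
    print(f"{usage.total_tokens} tokens: fits in the context window")
else:
    print(f"{usage.total_tokens} tokens: too large, split the input")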
Imagen 4

Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models.

Model details
Model codes (Gemini API): imagen-4.0-generate-preview-06-06 and imagen-4.0-ultra-generate-preview-06-06
Supported data types: Input: Text | Output: Images
Limits [*]: Input token limit: 480 tokens (text) | Output images: 1 (Ultra), 1 to 4 (Standard)
Latest update: June 2025

Imagen 3

Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting, and fewer distracting artifacts than our previous models.

Model details
Model code (Gemini API): imagen-3.0-generate-002
Supported data types: Input: Text | Output: Images
Limits [*]: Input token limit: N/A | Output images: up to 4
Latest update: February 2025

Veo 2

Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos and capturing the artistic nuance in your prompts.

Model details
Model code (Gemini API): veo-2.0-generate-001
Supported data types: Input: Text, image | Output: Video
Limits: Text input: N/A | Image input: any image resolution and aspect ratio up to a 20 MB file size | Output videos: up to 2
Latest update: April 2025

Gemini 2.5 Flash Live

The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try it in Google AI Studio.

Model details
Model code: models/gemini-live-2.5-flash-preview
Supported data types: Inputs: Audio, video, and text | Output: Text and audio
Token limits [*]: Input: 1,048,576 | Output: 8,192
Capabilities: Structured outputs, function calling, code execution, search, and audio generation are supported; tuning, image generation, and thinking are not supported.
Versions (read the model version patterns for more details): Preview: gemini-live-2.5-flash-preview
Latest update: June 2025
Knowledge cutoff: January 2025

Gemini 2.0 Flash Live

The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try it in Google AI Studio.

Model details
Model code: models/gemini-2.0-flash-live-001
Supported data types: Inputs: Audio, video, and text | Output: Text and audio
Token limits [*]: Input: 1,048,576 | Output: 8,192
Capabilities: Structured outputs, function calling, code execution, search, and audio generation are supported; tuning, image generation, and thinking are not supported.
Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-live-001
Latest update: April 2025
Knowledge cutoff: August 2024
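A rough sketch of a Live API session with one of these models, using the google-genai Python SDK's async client. The exact send and receive helper names have varied across SDK releases, so treat the calls below as illustrative rather than definitive; a text-only session is shown, and audio or video streaming follows the same connect/send/receive shape with different response modalities.

# Sketch: a minimal text-in/text-out Live API session (helper names are assumptions).
import asyncio
from google import genai
from google.genai import types

client = genai.Client()

async def main():
    config = types.LiveConnectConfig(response_modalities=["TEXT"])
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001", config=config
    ) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello!")]),
            turn_complete=True,
        )
        async for message in session.receive():
            if message.text:
                print(message.text, end="")

asyncio.run(main())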
Gemini Embedding Experimental

Gemini Embedding achieves state-of-the-art performance across many key dimensions, including code, multilingual, and retrieval tasks. Gemini Embedding rate limits are more restricted because it is an experimental model.

Model details
Model code (Gemini API): gemini-embedding-exp-03-07
Supported data types: Input: Text | Output: Text embeddings
Token limits [*]: Input: 8,192 | Output dimension size: elastic, supports 3072, 1536, or 768
Latest update: March 2025

Text Embedding and Embedding

Text Embedding

Try our new experimental Gemini embedding model, which achieves state-of-the-art performance.

Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks.

Model details
Model code (Gemini API): models/text-embedding-004
Supported data types: Input: Text | Output: Text embeddings
Token limits [*]: Input: 2,048 | Output dimension size: 768
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: Not supported
Latest update: April 2024

Embedding

Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding.

You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens.

Model details
Model code: models/embedding-001
Supported data types: Input: Text | Output: Text embeddings
Token limits [*]: Input: 2,048 | Output dimension size: 768
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: Not supported
Latest update: December 2023
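A minimal sketch of measuring the relatedness of two strings with text-embedding-004 through the google-genai SDK. The cosine-similarity arithmetic is ordinary Python, and the experimental gemini-embedding-exp-03-07 model is called the same way (its elastic output dimensions are noted above).

# Sketch: embedding two strings and comparing them with cosine similarity.
from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents=["What is the capital of France?", "Paris is the capital of France."],
)

a, b = (e.values for e in result.embeddings)
dot = sum(x * y for x, y in zip(a, b))
norm = (sum(x * x for x in a) ** 0.5) * (sum(y * y for y in b) ** 0.5)
print(f"cosine similarity: {dot / norm:.3f}")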
AQA

You can use the AQA model to perform Attributed Question-Answering (AQA) tasks over a document, corpus, or set of passages. The AQA model returns answers to questions that are grounded in the provided sources, along with an estimate of the probability that the question can be answered from those sources.

Model details
Model code: models/aqa
Supported data types: Input: Text | Output: Text
Supported language: English
Token limits [*]: Input: 7,168 | Output: 1,024
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: Supported
Latest update: December 2023

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.

Model version name patterns

Gemini models are available in stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use.

Latest stable: Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable: Points to a specific stable model. Stable models usually don't change, and most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview: Points to a preview model, which may not be suitable for production use, comes with more restrictive rate limits, and may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental: Points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.

Experimental models

In addition to stable models, the Gemini API offers experimental models, which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice, and we don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models

As new versions or stable releases become available, we remove and replace experimental models. The previous experimental models we released are listed below, along with their replacements (model code | base model | replacement version):

gemini-2.5-flash-preview-04-17 | Gemini 2.5 Flash | gemini-2.5-flash-preview-05-20
gemini-2.0-flash-exp-image-generation | Gemini 2.0 Flash | gemini-2.0-flash-preview-image-generation
gemini-2.5-pro-preview-06-05 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-05-06 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-03-25 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.0-flash-thinking-exp-01-21 | Gemini 2.5 Flash | gemini-2.5-flash-preview-04-17
gemini-2.0-pro-exp-02-05 | Gemini 2.0 Pro Experimental | gemini-2.5-pro-preview-03-25
gemini-2.0-flash-exp | Gemini 2.0 Flash | gemini-2.0-flash
gemini-exp-1206 | Gemini 2.0 Pro | gemini-2.0-pro-exp-02-05
gemini-2.0-flash-thinking-exp-1219 | Gemini 2.0 Flash Thinking | gemini-2.0-flash-thinking-exp-01-21
gemini-exp-1121 | Gemini | gemini-exp-1206
gemini-exp-1114 | Gemini | gemini-exp-1206
gemini-1.5-pro-exp-0827 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-pro-exp-0801 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-flash-8b-exp-0924 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b
gemini-1.5-flash-8b-exp-0827 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b

Supported languages

Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).
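Because preview and experimental model codes are retired and replaced over time, as the table above shows, it can be useful to check programmatically which models are currently available to your API key. A minimal sketch with the google-genai SDK:

# Sketch: listing the models currently available to your API key.
from google import genai

client = genai.Client()

for model in client.models.list():
    print(model.name, "-", model.display_name)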